In [91]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [3]:
data = pd.read_csv('../artifacts/mbti_1.csv')

In [5]:
data.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


# Data Preprocessing

In [6]:
data.shape

(8675, 2)

In [8]:
data.duplicated().sum()

0

In [10]:
data.isnull().sum()

type     0
posts    0
dtype: int64

### Text Preprocessing

In [11]:
import re
import string

Convert uppercase to lowercase

In [12]:
data["posts"] = data["posts"].apply(lambda x: " ".join(x.lower() for x in x.split()))
data["type"] = data["type"].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [18]:
data["posts"].head(5)

0    'http://www.youtube.com/watch?v=qsxhcwe3krw|||...
1    'i'm finding the lack of me in these posts ver...
2    'good one _____ https://www.youtube.com/watch?...
3    'dear intp, i enjoyed our conversation the oth...
4    'you're fired.|||that's another silly misconce...
Name: posts, dtype: object

In [19]:
data["type"].head(5)

0    infj
1    entp
2    intp
3    intj
4    entj
Name: type, dtype: object

Remove Links

In [22]:
data["posts"] = data['posts'].apply(lambda x: " ".join(re.sub(r'https?://\S+', '', x) for x in x.split()))

In [24]:
data["posts"].head(5)

0    ' and intj moments sportscenter not top ten pl...
1    'i'm finding the lack of me in these posts ver...
2    'good one _____ course, to which i say i know;...
3    'dear intp, i enjoyed our conversation the oth...
4    'you're fired.|||that's another silly misconce...
Name: posts, dtype: object

Remove punctuations

In [25]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [26]:
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

data["posts"] = data["posts"].apply(remove_punctuations)

In [27]:
data["posts"].head(5)

0     and intj moments sportscenter not top ten pla...
1    im finding the lack of me in these posts very ...
2    good one  course to which i say i know thats m...
3    dear intp i enjoyed our conversation the other...
4    youre firedthats another silly misconception t...
Name: posts, dtype: object

Remove numbers


In [29]:
data["posts"] = data['posts'].str.replace('\\d+', '', regex=True)

In [30]:
data["posts"].head(5)

0     and intj moments sportscenter not top ten pla...
1    im finding the lack of me in these posts very ...
2    good one  course to which i say i know thats m...
3    dear intp i enjoyed our conversation the other...
4    youre firedthats another silly misconception t...
Name: posts, dtype: object

Remove Stopwords

In [31]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting click (from nltk)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.4.0-py3-none-any.whl.metadata (5.4 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.4.16-cp312-cp312-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     ------------------- -------------------- 20.5/42.0 kB ? eta -:--:--
     -------------------------------------  41.0/42.0 kB 653.6 kB/s eta 0:00:01
     -------------------------------------- 42.0/42.0 kB 503.7 kB/s eta 0:00:00
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
    --------------------------------------- 0.0/1.5 MB 1.3 MB/s eta 0:00:02
   - -------------------------------------- 0.0/1.5 MB 653.6 kB/s eta 0:00:03
   -- ------------------------------------- 0.1/1.5 M

In [32]:
import nltk

In [33]:
nltk.download('stopwords', download_dir='../static/model')

[nltk_data] Downloading package stopwords to ../static/model...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [34]:
with open('../static/model/corpora/stopwords/english', 'r') as file:
    sw = file.read().splitlines()

In [35]:
sw

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [36]:
data["posts"] = data["posts"].apply(lambda x: " ".join(x for x in x.split() if x not in sw))

In [37]:
data["posts"].head()

0    intj moments sportscenter top ten plays pranks...
1    im finding lack posts alarmingsex boring posit...
2    good one course say know thats blessing cursed...
3    dear intp enjoyed conversation day esoteric ga...
4    youre firedthats another silly misconception a...
Name: posts, dtype: object

In [38]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [39]:
data["posts"] = data["posts"].apply(lambda x: " ".join(ps.stem(x) for x in x.split()))

In [40]:
data["posts"].head()

0    intj moment sportscent top ten play prankswhat...
1    im find lack post alarmingsex bore posit often...
2    good one cours say know that bless cursedo abs...
3    dear intp enjoy convers day esoter gab natur u...
4    your firedthat anoth silli misconcept approach...
Name: posts, dtype: object

### Building Vacabulary

In [42]:
from collections import Counter
vocab1 = Counter()
vocab2 = Counter()

In [45]:
for sentence in data['posts']:
    vocab1.update(sentence.split())

In [47]:
len(vocab1)

273519

In [48]:
for sentence in data['type']:
    vocab2.update(sentence.split())

In [50]:
len(vocab2)

16

In [129]:
vocab2

Counter({'infp': 1832,
         'infj': 1470,
         'intp': 1304,
         'intj': 1091,
         'entp': 685,
         'enfp': 675,
         'istp': 337,
         'isfp': 271,
         'entj': 231,
         'istj': 205,
         'enfj': 190,
         'isfj': 166,
         'estp': 89,
         'esfp': 48,
         'esfj': 42,
         'estj': 39})

In [68]:
tokens1 = [key for key in vocab1 if vocab1[key] > 800]

In [69]:
len(tokens1)

1025

In [70]:
tokens2 = [key for key in vocab2 ]

In [112]:
len(tokens2)

16

In [73]:
def save_vocabulary(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w', encoding="utf-8")
    file.write(data)
    file.close()

save_vocabulary(tokens1, '../static/model/vocabulary1.txt')

In [74]:
def save_vocabulary(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w', encoding="utf-8")
    file.write(data)
    file.close()

save_vocabulary(tokens2, '../static/model/vocabulary2.txt')

### Divide dataset to train and test the model

In [76]:
x = data["posts"]
y = data["type"]

In [None]:
!pip install scikit-learn

In [81]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [82]:
x_train.shape

(6940,)

In [84]:
x_test.shape

(1735,)

In [132]:
x_train.head()

7894    that experi istp one know high threshold pass ...
6394    like littl social energi anyon wish hold conve...
1263    well consid convers friend month ago cost eat ...
8004    still narrowmind misguid view absolut blow min...
5655    im afraid share know anxiou hard sometim tell ...
Name: posts, dtype: object

In [133]:
y_train.head()

7894    enfp
6394    intp
1263    intp
8004    intp
5655    infj
Name: type, dtype: object

In [86]:
y_test

3117    isfp
7900    infp
1155    isfj
6715    intp
645     intj
        ... 
4629    intp
6104    intp
7781    infp
5502    infj
7335    intj
Name: type, Length: 1735, dtype: object

### Vectoriztion

In [87]:
def vectorizer(ds, vocabulary):
    vectorized_lst = []
    
    for sentence in ds:
        sentence_lst = np.zeros(len(vocabulary))
        
        for i in range(len(vocabulary)):
            if vocabulary[i] in sentence.split():
                sentence_lst[i] = 1
                
        vectorized_lst.append(sentence_lst)
        
    vectorized_lst_new = np.asarray(vectorized_lst, dtype=np.float32)
    
    return vectorized_lst_new

In [92]:
vectorized_x_train = vectorizer(x_train, tokens1)

In [93]:
vectorized_x_test = vectorizer(x_test, tokens1)

In [94]:
vectorized_y_train = vectorizer(y_train, tokens2)

In [113]:
vectorized_y_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]], dtype=float32)

In [95]:
vectorized_y_test = vectorizer(y_test, tokens2)

In [98]:
y_train.value_counts()

type
infp    1465
infj    1167
intp    1040
intj     871
entp     572
enfp     537
istp     273
isfp     223
entj     184
istj     154
enfj     144
isfj     136
estp      73
esfp      37
esfj      36
estj      28
Name: count, dtype: int64

### handle imbalanced dataset

In [99]:
!pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.2-py3-none-any.whl.metadata (8.2 kB)
Downloading imbalanced_learn-0.12.2-py3-none-any.whl (257 kB)
   ---------------------------------------- 0.0/258.0 kB ? eta -:--:--
   ---------------------------------------- 0.0/258.0 kB ? eta -:--:--
   - -------------------------------------- 10.2/258.0 kB ? eta -:--:--
   - -------------------------------------- 10.2/258.0 kB ? eta -:--:--
   ---- ---------------------------------- 30.7/258.0 kB 259.2 kB/s eta 0:00:01
   ------ -------------------------------- 41.0/258.0 kB 245.8 kB/s eta 0:00:01
   --------- ----------------------------- 61.4/258.0 kB 272.3 kB/s eta 0:00:01
   ------------ -------------------------- 81.9/258.0 kB 305.0 kB/s eta 0:00:01
   ------------ -------------------------- 81.9/258.0 kB 305.0 kB/s eta 0:00:01
   ------------ -------------------------- 81.9/258.0 kB 305.0 kB/s eta 0:00:01
   ------------ -------------------------- 81.9/258.0 kB 305.0 kB/s eta

In [100]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
vectorized_x_train_smote, vectorized_y_train_smote = smote.fit_resample(vectorized_x_train, vectorized_y_train)
print(vectorized_x_train_smote.shape, vectorized_y_train_smote.shape)

(23440, 1025) (23440, 16)


## Model Training and Evaluation

In [106]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [107]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def training_scores(y_act, y_pred):
    acc = round(accuracy_score(y_act, y_pred), 3)
    pr = round(precision_score(y_act, y_pred), 3)
    rec = round(recall_score(y_act, y_pred), 3)
    f1 = round(f1_score(y_act, y_pred), 3)
    print(f'Training Scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-Score = {f1}')
    
def validation_scores(y_act, y_pred):
    acc = round(accuracy_score(y_act, y_pred), 3)
    pr = round(precision_score(y_act, y_pred), 3)
    rec = round(recall_score(y_act, y_pred), 3)
    f1 = round(f1_score(y_act, y_pred), 3)
    print(f'Testing Scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-Score = {f1}')

### Logistic Regression 

In [5]:

lr = LogisticRegression()

vectorized_y_train_smote_1d = np.argmax(vectorized_y_train_smote, axis=1)
vectorized_y_train_1d = np.argmax(vectorized_y_train, axis=1)
lr.fit(vectorized_x_train_smote, vectorized_y_train_smote_1d)

y_train_pred = lr.predict(vectorized_x_train_smote)

y_test_pred = lr.predict(vectorized_x_test)



NameError: name 'LogisticRegression' is not defined

In [3]:
y_test_pred

NameError: name 'y_test_pred' is not defined

In [138]:
vectorized_y_test_1d = vectorized_y_train_1d = np.argmax(vectorized_y_test, axis=1)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)