In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('../artifacts/mbti_1.csv')


In [3]:
data.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [4]:
data['type'].head()

0    INFJ
1    ENTP
2    INTP
3    INTJ
4    ENTJ
Name: type, dtype: object

In [5]:
mbti_mapping = {
    'INFJ': 1, 'ENTP': 2, 'INTP': 3, 'INTJ': 4, 'ENTJ': 5, 'ENFJ': 6, 'INFP': 7,
    'ENFP': 8, 'ISFP': 9, 'ISTP': 10, 'ISFJ': 11, 'ISTJ': 12, 'ESTP': 13, 'ESFP': 14,
    'ESTJ': 15, 'ESFJ': 16
}

In [6]:
def replace_mbti_values(data, mapping):
    data['type'] = data['type'].map(mapping)
    return data

In [7]:

data = replace_mbti_values(data, mbti_mapping)

In [8]:
data['type']

0       1
1       2
2       3
3       4
4       5
       ..
8670    9
8671    8
8672    3
8673    7
8674    7
Name: type, Length: 8675, dtype: int64

## Data Preprocessing

In [9]:
data.shape

(8675, 2)

In [10]:
data.duplicated().sum()

0

In [11]:
data.isnull().sum()

type     0
posts    0
dtype: int64

## Text Processing

In [12]:
import re
import string

 Convert uppercase to lowercase

In [13]:
data["posts"] = data["posts"].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [14]:
data["posts"].head(5)

0    'http://www.youtube.com/watch?v=qsxhcwe3krw|||...
1    'i'm finding the lack of me in these posts ver...
2    'good one _____ https://www.youtube.com/watch?...
3    'dear intp, i enjoyed our conversation the oth...
4    'you're fired.|||that's another silly misconce...
Name: posts, dtype: object

In [15]:
data["posts"] = data['posts'].apply(lambda x: " ".join(re.sub(r'https?://\S+', '', x) for x in x.split()))

In [16]:
data["posts"].head(5)

0    ' and intj moments  sportscenter not top ten p...
1    'i'm finding the lack of me in these posts ver...
2    'good one _____  course, to which i say i know...
3    'dear intp, i enjoyed our conversation the oth...
4    'you're fired.|||that's another silly misconce...
Name: posts, dtype: object

Remove punctuations

In [17]:
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

data["posts"] = data["posts"].apply(remove_punctuations)

In [18]:
data["posts"].head(5)

0     and intj moments  sportscenter not top ten pl...
1    im finding the lack of me in these posts very ...
2    good one   course to which i say i know thats ...
3    dear intp i enjoyed our conversation the other...
4    youre firedthats another silly misconception t...
Name: posts, dtype: object

Remove numbers

In [19]:
data["posts"] = data['posts'].str.replace('\\d+', '', regex=True)

In [20]:
data["posts"].head(5)

0     and intj moments  sportscenter not top ten pl...
1    im finding the lack of me in these posts very ...
2    good one   course to which i say i know thats ...
3    dear intp i enjoyed our conversation the other...
4    youre firedthats another silly misconception t...
Name: posts, dtype: object

Remove Stopwords

In [21]:
import nltk

In [22]:
with open('../static/model/corpora/stopwords/english', 'r') as file:
    sw = file.read().splitlines()

In [23]:
data["posts"] = data["posts"].apply(lambda x: " ".join(x for x in x.split() if x not in sw))

In [24]:
data["posts"].head()

0    intj moments sportscenter top ten plays pranks...
1    im finding lack posts alarmingsex boring posit...
2    good one course say know thats blessing cursed...
3    dear intp enjoyed conversation day esoteric ga...
4    youre firedthats another silly misconception a...
Name: posts, dtype: object

Stemming

In [25]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [26]:
data['posts'].shape

(8675,)

In [27]:
data["posts"] = data["posts"].apply(lambda x: " ".join(ps.stem(x) for x in x.split()))

In [28]:
data["posts"].head()

0    intj moment sportscent top ten play prankswhat...
1    im find lack post alarmingsex bore posit often...
2    good one cours say know that bless cursedo abs...
3    dear intp enjoy convers day esoter gab natur u...
4    your firedthat anoth silli misconcept approach...
Name: posts, dtype: object

In [29]:
data['posts'].shape

(8675,)

In [30]:
data['type'].shape

(8675,)

### Building Vacabulary

In [31]:
from collections import Counter
vocab = Counter()

In [33]:
for sentence in data['posts']:
    vocab.update(sentence.split())

In [34]:
len(vocab)

273519

In [43]:
tokens = [key for key in vocab if vocab[key] > 2500]

In [44]:
len(tokens)

743

In [45]:
def save_vocabulary(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w', encoding="utf-8")
    file.write(data)
    file.close()

save_vocabulary(tokens, '../static/model/vocabulary.txt')

### Divide dataset to train and test the model

In [46]:
data['type']

0       1
1       2
2       3
3       4
4       5
       ..
8670    9
8671    8
8672    3
8673    7
8674    7
Name: type, Length: 8675, dtype: int64

In [47]:
x = data["posts"]
y = data["type"]

In [48]:
x

0       intj moment sportscent top ten play prankswhat...
1       im find lack post alarmingsex bore posit often...
2       good one cours say know that bless cursedo abs...
3       dear intp enjoy convers day esoter gab natur u...
4       your firedthat anoth silli misconcept approach...
                              ...                        
8670    alway think cat fi dom reason websit becom neo...
8671    soif thread alreadi exist someplac els heck de...
8672    mani question thing would take purpl pill pick...
8673    conflict right come want children honestli mat...
8674    long sinc personalitycaf although doesnt seem ...
Name: posts, Length: 8675, dtype: object

In [49]:
y

0       1
1       2
2       3
3       4
4       5
       ..
8670    9
8671    8
8672    3
8673    7
8674    7
Name: type, Length: 8675, dtype: int64

In [50]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [51]:
x_train.shape

(6940,)

In [52]:
x_test.shape

(1735,)

### Vectoriztion

In [53]:
def vectorizer(ds, vocabulary):
    vectorized_lst = []
    
    for sentence in ds:
        sentence_lst = np.zeros(len(vocabulary))
        
        for i in range(len(vocabulary)):
            if vocabulary[i] in sentence.split():
                sentence_lst[i] = 1
                
        vectorized_lst.append(sentence_lst)
        
    vectorized_lst_new = np.asarray(vectorized_lst, dtype=np.float32)
    
    return vectorized_lst_new

In [54]:
vectorized_x_train = vectorizer(x_train, tokens)

In [55]:
tokens

['intj',
 'moment',
 'top',
 'play',
 'experi',
 'life',
 'perc',
 'last',
 'thing',
 'infj',
 'friend',
 'post',
 'next',
 'day',
 'rest',
 'enfj',
 'sorri',
 'hear',
 'natur',
 'relationship',
 'perfect',
 'time',
 'everi',
 'exist',
 'tri',
 'figur',
 'hard',
 'welcom',
 'stuff',
 'game',
 'set',
 'least',
 'minut',
 'move',
 'dont',
 'mean',
 'sit',
 'mayb',
 'come',
 'three',
 'youv',
 'type',
 'want',
 'would',
 'like',
 'use',
 'given',
 'cognit',
 'function',
 'left',
 'video',
 'good',
 'one',
 'note',
 'subject',
 'complet',
 'death',
 'enfp',
 'favorit',
 'grow',
 'current',
 'cool',
 'appear',
 'late',
 'someon',
 'thought',
 'confid',
 'within',
 'world',
 'id',
 'enjoy',
 'worri',
 'peopl',
 'alway',
 'around',
 'entp',
 'your',
 'main',
 'social',
 'live',
 'convers',
 'even',
 'realli',
 'part',
 'thread',
 'high',
 'eat',
 'someth',
 'follow',
 'mani',
 'could',
 'think',
 'watch',
 'movi',
 'class',
 'noth',
 'whole',
 'reason',
 'two',
 'right',
 'middl',
 'today',
 

In [56]:
vectorized_x_train

array([[0., 0., 0., ..., 1., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 1., 0., ..., 1., 0., 1.]], dtype=float32)

In [57]:
vectorized_x_test = vectorizer(x_test, tokens)

In [58]:
y_test

5543     3
5339     4
4241     7
8661     2
1689     8
        ..
2221     4
4583     4
1234     2
5434    10
4142     4
Name: type, Length: 1735, dtype: int64

In [59]:
vectorized_x_test.shape

(1735, 743)

In [60]:
y_train.value_counts()

type
7     1478
1     1179
3     1039
4      859
2      557
8      525
10     277
9      216
5      184
12     157
6      157
11     135
13      73
14      36
15      34
16      34
Name: count, dtype: int64

### handle imbalanced dataset

In [61]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
vectorized_x_train_smote, y_train_smote = smote.fit_resample(vectorized_x_train, y_train)
print(vectorized_x_train_smote.shape, y_train_smote.shape)

(23648, 743) (23648,)


In [62]:
y_train_smote.value_counts()

type
7     1478
1     1478
4     1478
8     1478
2     1478
3     1478
11    1478
12    1478
6     1478
10    1478
15    1478
13    1478
9     1478
5     1478
16    1478
14    1478
Name: count, dtype: int64

## Model Training and Evaluation

In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [64]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def training_scores(y_act, y_pred):
    acc = round(accuracy_score(y_act, y_pred), 3)
    pr = round(precision_score(y_act, y_pred, average='macro'), 3)
    rec = round(recall_score(y_act, y_pred, average='macro'), 3)
    f1 = round(f1_score(y_act, y_pred, average='macro'), 3)
    print(f'Training Scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-Score = {f1}')
    
def validation_scores(y_act, y_pred):
    acc = round(accuracy_score(y_act, y_pred), 3)
    pr = round(precision_score(y_act, y_pred, average='macro', zero_division=1), 3)
    rec = round(recall_score(y_act, y_pred, average='macro'), 3)
    f1 = round(f1_score(y_act, y_pred, average='macro'), 3)
    print(f'Testing Scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-Score = {f1}')

### Logistic Regression 

In [65]:
y_train_smote.shape


(23648,)

In [68]:
lr = LogisticRegression(max_iter=1000)
lr.fit(vectorized_x_train_smote, y_train_smote)
actual_iterations_lr = lr.n_iter_[0]
max_iter_lr = actual_iterations_lr + 100
lr = LogisticRegression(max_iter=max_iter_lr)
lr.fit(vectorized_x_train_smote, y_train_smote)
y_train_pred_lr = lr.predict(vectorized_x_train_smote)
y_test_pred_lr = lr.predict(vectorized_x_test)
print("Logistic Regression:")
training_scores(y_train_smote, y_train_pred_lr)
validation_scores(y_test, y_test_pred_lr)

Logistic Regression:
Training Scores:
	Accuracy = 0.957
	Precision = 0.956
	Recall = 0.957
	F1-Score = 0.957
Testing Scores:
	Accuracy = 0.428
	Precision = 0.285
	Recall = 0.295
	F1-Score = 0.286


## Naive Bayes


In [58]:
nb = MultinomialNB()
nb.fit(vectorized_x_train_smote, y_train_smote)
y_train_pred_nb = nb.predict(vectorized_x_train_smote)
y_test_pred_nb = nb.predict(vectorized_x_test)
print("\nMultinomial Naive Bayes:")
training_scores(y_train_smote, y_train_pred_nb)
validation_scores(y_test, y_test_pred_nb)


Multinomial Naive Bayes:
Training Scores:
	Accuracy = 0.731
	Precision = 0.755
	Recall = 0.731
	F1-Score = 0.74
Testing Scores:
	Accuracy = 0.367
	Precision = 0.198
	Recall = 0.204
	F1-Score = 0.199


In [70]:
rf = RandomForestClassifier()
rf.fit(vectorized_x_train_smote, y_train_smote)
y_train_pred_rf = rf.predict(vectorized_x_train_smote)
y_test_pred_rf = rf.predict(vectorized_x_test)
print("\nRandom Forest:")
training_scores(y_train_smote, y_train_pred_rf)
validation_scores(y_test, y_test_pred_rf)


Random Forest:
Training Scores:
	Accuracy = 1.0
	Precision = 1.0
	Recall = 1.0
	F1-Score = 1.0
Testing Scores:
	Accuracy = 0.417
	Precision = 0.618
	Recall = 0.186
	F1-Score = 0.187


## Support Vector Machine

In [69]:
svm = SVC()
svm.fit(vectorized_x_train_smote, y_train_smote)
y_train_pred_svm = svm.predict(vectorized_x_train_smote)
y_test_pred_svm = svm.predict(vectorized_x_test)
print("\nSupport Vector Machine (SVM):")
training_scores(y_train_smote, y_train_pred_svm)
validation_scores(y_test, y_test_pred_svm)


Support Vector Machine (SVM):
Training Scores:
	Accuracy = 0.997
	Precision = 0.997
	Recall = 0.997
	F1-Score = 0.997
Testing Scores:
	Accuracy = 0.497
	Precision = 0.7
	Recall = 0.234
	F1-Score = 0.236


In [71]:
import pickle

with open('../static/model/model.pickle', 'wb') as file:
    pickle.dump(svm, file)