## Simple Models 

### Import Packages and cleaned data 

In [1]:
# regular package 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re 
import os


# nltk 
import nltk 
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
nltk.download('stopwords')

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Doylism/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# import cleaned data
df = pd.read_csv('../../clean_data/clean_comments.csv', index_col=0)

  mask |= (ar1 == a)


In [7]:
df.head(20)

Unnamed: 0,MBTI,comments
0,INFP,lol thats left
1,INTP,post try telling people time im always joking ...
2,INFP,first thought pepsi something probably alcohol
3,ENTP,formula something like every time says add bpm...
4,INTP,imply im five
5,INTP,well wouldnt know think theres lot potential t...
6,INFJ,sine na support directors actors people behind...
7,INFP,use enough vacation days dont lose time rolls
8,INTP,ur angle youre devil
9,INTP,mean dont much influence crow ruby


In [10]:
df.isna().sum()

MBTI            0
comments    23971
dtype: int64

In [11]:
# drop the null values
df = df.dropna()

In [12]:
df.isna().sum()

MBTI        0
comments    0
dtype: int64

In [13]:
df.shape
# now we are left with 2.97 million of comments 

(2976811, 2)

### Simple Models 
#### Split the data

In [14]:
X = df['comments']
y = df['MBTI']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

In [17]:
mbti = ['INFP','INFJ','INTP','INTJ','ENTP','ENFP','ISTP','ISFP','ENTJ','ISTJ','ENFJ','ISFJ','ESTP','ESFP','ESFJ','ESTJ']

#### Logistic Regression

In [15]:
my_logit = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier', LogisticRegression())
])
my_logit.fit(X_train, y_train)



KeyboardInterrupt: 

In [None]:
y_pred = my_logit.predict(X_test)

print('accuracy %s' % my_logit.score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=mbti))

#### Naive Bayes 

In [16]:
# build a pipeline for vectorise, transform and clasify the data 
my_nb = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB())
])
my_nb.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [18]:
y_pred = my_nb.predict(X_test)

print('accuracy %s' % my_nb.score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=mbti))

accuracy 0.1707676217521197
              precision    recall  f1-score   support

        INFP       0.59      0.06      0.11     54044
        INFJ       0.45      0.04      0.08     61673
        INTP       0.24      0.15      0.18     76574
        INTJ       0.35      0.06      0.10     67098
        ENTP       0.95      0.01      0.02     18187
        ENFP       0.79      0.01      0.01     19948
        ISTP       0.76      0.03      0.06     28917
        ISFP       0.63      0.09      0.15     48215
        ENTJ       0.17      0.22      0.19     81090
        ISTJ       0.13      0.58      0.21     92517
        ENFJ       0.53      0.02      0.05     52415
        ISFJ       0.14      0.40      0.21     89301
        ESTP       0.63      0.05      0.09     46995
        ESFP       0.71      0.07      0.14     42366
        ESFJ       0.60      0.03      0.05     46371
        ESTJ       0.45      0.17      0.25     67333

   micro avg       0.17      0.17      0.17    89304

#### LInear SVM 

In [19]:
sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier', SGDClassifier(loss='hinge', penalty='l2', \
                                random_state=42, max_iter=5, tol=None))
])

sgd.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...dom_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [20]:
y_pred = sgd.predict(X_test)

print('accuracy %s' % sgd.score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=mbti))

accuracy 0.16417220204155675
              precision    recall  f1-score   support

        INFP       0.19      0.24      0.21     54044
        INFJ       0.18      0.14      0.16     61673
        INTP       0.18      0.18      0.18     76574
        INTJ       0.16      0.16      0.16     67098
        ENTP       0.22      0.20      0.21     18187
        ENFP       0.20      0.25      0.22     19948
        ISTP       0.26      0.32      0.28     28917
        ISFP       0.24      0.27      0.25     48215
        ENTJ       0.17      0.17      0.17     81090
        ISTJ       0.17      0.12      0.14     92517
        ENFJ       0.15      0.13      0.14     52415
        ISFJ       0.16      0.15      0.15     89301
        ESTP       0.19      0.19      0.19     46995
        ESFP       0.23      0.26      0.24     42366
        ESFJ       0.18      0.18      0.18     46371
        ESTJ       0.25      0.29      0.27     67333

   micro avg       0.19      0.19      0.19    8930