# MBTI Project


To do:
1- Make a combined notebook that is clean and easy to follow 
2- Add background information 
3- Visualize raw data (better plots than what we have right now)
4- Add parts about EDA (research to support it?)
5- Compare the models visually 
6- What's the best model and why?
7- Where can we go from here?

In [3]:
reset -fs

In [27]:
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import word_tokenize
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string 
import re
from collections import Counter
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
# Apply matplotlib's 'fivethirtyeight' style sheet to the figures drawn after this point.
plt.style.use('fivethirtyeight')

## General info:
#### Personality Types:

In [7]:
# Shortcut: read the already-cleaned posts from disk instead of redoing the preprocessing.
# The 'Unnamed: 0' column (presumably the index written out when the file was saved)
# is discarded so only the real columns remain.
processed_post = pd.read_csv('data/mbti_preprocessed_1.csv').drop(columns='Unnamed: 0')
processed_post.head()

Unnamed: 0,type,posts,processed_posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,moment sportscenter top ten play pr...
1,ENTP,'I'm finding the lack of me in these posts ver...,finding lack post alarming sex boring positi...
2,INTP,'Good one _____ https://www.youtube.com/wat...,good one course say know blessing...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",dear enjoyed conversation day esoteric ...
4,ENTJ,'You're fired.|||That's another silly misconce...,fired another silly misconception approachi...


### Vectorize the posts

Create a bag-of-words representation of each user's posts using TF-IDF (and, for comparison, raw term counts).

In [8]:
def _feature_names(vectorizer):
    """Return the fitted vocabulary of ``vectorizer`` in column order.

    scikit-learn 1.0 added ``get_feature_names_out`` and 1.2 removed
    ``get_feature_names``; use whichever one this installation provides so the
    cell runs on both old and new versions.
    """
    getter = getattr(vectorizer, 'get_feature_names_out', None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return list(getter())


# TfidfVectorizer: unigrams and bigrams, keeping terms that occur in at least 5%
# and at most 85% of the documents.
vectorizer_tfidf = TfidfVectorizer(min_df=0.05, max_df=0.85, analyzer='word', ngram_range=(1, 2))
word_tfidf = vectorizer_tfidf.fit_transform(processed_post['processed_posts'])
# Densify the sparse matrix so each vocabulary term becomes a named DataFrame column.
word_tfidf_df = pd.DataFrame(data = word_tfidf.toarray(), columns = _feature_names(vectorizer_tfidf))
# CountVectorizer: raw term counts over unigrams and bigrams, English stop words removed,
# terms limited to those in at least 5 documents and at most 48% of them.
vectorizer_ct = CountVectorizer(stop_words='english',analyzer='word',input='content', 
                                 decode_error='ignore', max_df=0.48,min_df=5,
                                 token_pattern=r'\w{1,}', max_features=1625, ngram_range=(1,2)) # to compare two methods, I limit max_features=1625
word_ct = vectorizer_ct.fit_transform(processed_post['processed_posts'])
word_ct_df = pd.DataFrame(data = word_ct.toarray(), columns = _feature_names(vectorizer_ct))

In [11]:
# Preview the first rows of the TF-IDF feature table (one column per vocabulary term).
word_tfidf_df.head()

Unnamed: 0,ability,able,absolute,absolutely,abstract,accept,according,account,accurate,across,...,year ago,year old,yep,yes,yesterday,yet,young,younger,youtube,yup
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.067997,0.0,0.083075,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.038307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.12246,0.0444,0.0,0.106856,0.0,0.0,0.0,0.0,0.064077,0.0,...,0.0,0.063801,0.0,0.060355,0.0,0.0,0.0,0.0,0.0,0.081823
3,0.0,0.071834,0.066683,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.059121,0.0,0.055929,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Load the second input table used for modelling.
# NOTE(review): "FE" presumably means feature-engineered; the column layout is not shown
# in this notebook. A later cell takes columns from position 6 onward as model features,
# so confirm that offset against the file.
mbti = pd.read_csv("data/mbti_FE.csv")

In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import time
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings raised while fitting the
# models below are hidden too; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [11]:
def model(model, X, target, nsplits=4):
    """Evaluate a classifier on every column of ``target`` and print the mean scores.

    For each target column the estimator is refit on ``nsplits`` stratified
    shuffle splits of ``X`` (fixed seed, test size left at the library default).
    The mean ROC AUC and mean F1 score over the splits are printed per column,
    followed by the total wall-clock time.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict`` plus either ``predict_proba``
        or ``decision_function``. It is refit in place on every split, so it is
        left fitted on the last split of the last column.
    X : pandas.DataFrame
        Feature matrix, one row per sample.
    target : pandas.DataFrame
        One label column per personality axis, row-aligned with ``X``. Each column
        is expected to be binary: the AUC uses the second probability column and
        the F1 score uses its default averaging.
    nsplits : int, default 4
        Number of shuffle splits per target column.

    Returns
    -------
    None
        Results are printed only.
    """
    splitter = StratifiedShuffleSplit(n_splits=nsplits, random_state=420)

    types = {'EorI':'Extroversion vs. Introversion', 'NorS': 'Intuition vs. Sensing',
                 'TorF': 'Thinking vs. Feeling','JorP': 'Judging vs. Perceiving'}
    t = time.time()
    for col in target.columns:
        # Fall back to the raw column name so an unexpected target does not raise KeyError.
        print(f"{types.get(col, col)}:")
        y = target[col]
        all_auc = []
        f_score = []
        for train, test in splitter.split(X, y):
            # split() yields positional row numbers, so select with iloc; label-based
            # lookup only matches when the index happens to be 0..n-1.
            X_train, X_test = X.iloc[train], X.iloc[test]
            y_train, y_test = y.iloc[train], y.iloc[test]
            model.fit(X_train, y_train)
            preds = model.predict(X_test)

            # AUC needs a continuous score: prefer the positive-class probability and
            # fall back to the decision function for estimators without
            # predict_proba (e.g. SVC with probability=False).
            if hasattr(model, 'predict_proba'):
                scores = model.predict_proba(X_test)[:, 1]
            else:
                scores = model.decision_function(X_test)
            all_auc.append(roc_auc_score(y_test, scores))

            # scikit-learn metrics take (y_true, y_pred) in that order.
            f_score.append(f1_score(y_test, preds))
        print(f'Average AUC: {np.mean(all_auc):.3f}; Average fscore: {np.mean(f_score):.3f}')
    print(f"Time use:{time.time()-t:.3f}s")
    

In [40]:
# Assemble the two model input matrices: the columns of `mbti` from position 6 onward,
# placed side by side with either the TF-IDF features or the count features.
# NOTE(review): `target`, which the modelling cells pass to model(), is not defined in
# any cell visible in this notebook; confirm where it is created before a
# Restart & Run All.
_extra_features = mbti.iloc[:, 6:]
X_tf = pd.concat([_extra_features, word_tfidf_df], axis=1)
X_ct = pd.concat([_extra_features, word_ct_df], axis=1)

# Bagging Classifier

### Using TFIDF

In [19]:
# Bagged 10-nearest-neighbour classifiers: each base estimator is trained on a random
# half of the samples and a random half of the features. random_state pins that
# subsampling so the scores are reproducible, matching the seeded LogisticRegression
# and RandomForest cells.
bagging = BaggingClassifier(KNeighborsClassifier(n_neighbors=10),
                            max_samples=0.5, max_features=0.5,
                            random_state=0)
model(bagging, X_tf, target, nsplits=4)

Extroversion vs. Introversion:
Average AUC: 0.579; Average fscore: 0.017
Intuition vs. Sensing:
Average AUC: 0.539; Average fscore: 0.000
Thinking vs. Feeling:
Average AUC: 0.680; Average fscore: 0.568
Judging vs. Perceiving:
Average AUC: 0.501; Average fscore: 0.736
Time use:163.062s


### Using CountVectorizer

In [17]:
# Evaluate the same `bagging` estimator object on the count-based feature matrix.
model(bagging, X_ct, target, nsplits=4)

Extroversion vs. Introversion:
Average AUC: 0.576; Average fscore: 0.045
Intuition vs. Sensing:
Average AUC: 0.567; Average fscore: 0.000
Thinking vs. Feeling:
Average AUC: 0.687; Average fscore: 0.565
Judging vs. Perceiving:
Average AUC: 0.559; Average fscore: 0.719
Time use:550.821s


# Logistic Regression

### Using TFIDF
Highest average AUC on all four axes of any model in this notebook (the F-scores on the E/I and N/S axes are still low).

In [26]:
# Logistic regression with only the seed set explicitly,
# evaluated on the TF-IDF feature matrix.
Logistic = LogisticRegression(random_state=0)
model(Logistic, X_tf, target, nsplits=4)

Extroversion vs. Introversion:
Average AUC: 0.746; Average fscore: 0.261
Intuition vs. Sensing:
Average AUC: 0.746; Average fscore: 0.044
Thinking vs. Feeling:
Average AUC: 0.863; Average fscore: 0.760
Judging vs. Perceiving:
Average AUC: 0.712; Average fscore: 0.760
Time use:26.775s


### Using Count

In [25]:
# Evaluate the same `Logistic` estimator object on the count-based feature matrix.
model(Logistic, X_ct, target, nsplits=4)

Extroversion vs. Introversion:
Average AUC: 0.699; Average fscore: 0.424
Intuition vs. Sensing:
Average AUC: 0.680; Average fscore: 0.296
Thinking vs. Feeling:
Average AUC: 0.806; Average fscore: 0.712
Judging vs. Perceiving:
Average AUC: 0.676; Average fscore: 0.718
Time use:40.883s


# SVM

In [98]:
# model() scores AUC from predict_proba, which SVC only exposes when probability=True.
# Enabling it adds an internal cross-validated calibration step, so fitting is slower;
# random_state fixes the shuffling used by that calibration.
svm = SVC(gamma='auto', probability=True, random_state=0)
model(svm, X_ct, target, nsplits=4)

Extroversion vs. Introversion:
Average AUC: 0.538; Average fscore: 0.216
Intuition vs. Sensing:
Average AUC: 0.530; Average fscore: 0.099
Thinking vs. Feeling:
Average AUC: 0.601; Average fscore: 0.546
Judging vs. Perceiving:
Average AUC: 0.504; Average fscore: 0.633
Time use:185.176s


# Random Forest 

In [51]:
# Random forest of 300 trees capped at depth 7, seeded for repeatability,
# evaluated on the TF-IDF feature matrix.
random_forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=7,
    random_state=0,
)
model(random_forest, X_tf, target, nsplits=4)

Extroversion vs. Introversion:
Average AUC: 0.718; Average fscore: 0.000
Intuition vs. Sensing:
Average AUC: 0.681; Average fscore: 0.000
Thinking vs. Feeling:
Average AUC: 0.825; Average fscore: 0.661
Judging vs. Perceiving:
Average AUC: 0.671; Average fscore: 0.753
Time use:133.140s


In [49]:
# Same forest with the depth cap lowered to 5, to compare against the depth-7 run.
# This rebinds `random_forest`, so later cells see this shallower configuration.
random_forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=5,
    random_state=0,
)
model(random_forest, X_tf, target, nsplits=4)

Extroversion vs. Introversion:
Average AUC: 0.710; Average fscore: 0.000
Intuition vs. Sensing:
Average AUC: 0.673; Average fscore: 0.000
Thinking vs. Feeling:
Average AUC: 0.821; Average fscore: 0.617
Judging vs. Perceiving:
Average AUC: 0.674; Average fscore: 0.753
Time use:97.837s


In [47]:
# Evaluate the current `random_forest` on the count-based feature matrix.
# NOTE(review): read top to bottom this is the max_depth=5 forest from the previous cell,
# but the execution counts are out of order, so the recorded output may come from a
# different configuration. Re-run in order to confirm.
model(random_forest, X_ct, target, nsplits=4)

Extroversion vs. Introversion:
Average AUC: 0.703; Average fscore: 0.000
Intuition vs. Sensing:
Average AUC: 0.679; Average fscore: 0.000
Thinking vs. Feeling:
Average AUC: 0.807; Average fscore: 0.407
Judging vs. Perceiving:
Average AUC: 0.652; Average fscore: 0.753
Time use:25.282s
