# MBTI Project

By Nan Lin

In [3]:
# IPython `reset` magic written without the leading `%`, so it relies on automagic being enabled.
# It clears the interactive namespace; `-f` skips the confirmation prompt.
# NOTE(review): `-s` presumably requests a "soft" reset that keeps history — confirm against the IPython docs.
reset -fs

In [4]:
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import word_tokenize
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string 
import re
from collections import Counter
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.ensemble import BaggingClassifier
plt.style.use('fivethirtyeight')

### Model Tuning 

-  EDA: 
    What can we tell from the violin plot? Should we remove these two plots from the EDA section?
    Similarly, the word clouds suggest that all types share similar dominant words.
    The sentiment score distribution is highly skewed. Should we remove this feature, or is there a way to normalize it?

- Feature Engineering:
    - Count vectors or TF-IDF vectors as features (n-gram-level TF-IDF, with ngram_range=(1, 2))
    - Text/NLP-based features (word count, upper-case count, number count, etc.)

* I vs E and N vs S are highly imbalanced, which will affect the models. Do we need to apply some of the techniques described here? Reference: https://elitedatascience.com/imbalanced-classes

- Modeling:
    - Naive Bayes Classifier (in the FirstModel file)
    
    - KNN
    - Bagging - Random Forest
        - Random Search CV on RF:
            - {'bootstrap': [True, False],
            'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
            'max_features': ['auto', 'sqrt'],
            'min_samples_leaf': [1, 2, 4],
            'min_samples_split': [2, 5, 10],
            'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}    
        - Best Parameters:
        
        - Results: F1-score = , ROC-AUC = 


- Performance Metrics
    - Accuracy (removed because the classes are imbalanced), F1-score, ROC-AUC: which one should we prioritize for our project?
    - From a production perspective, could we add metrics such as computation cost and time complexity?

## General info:
#### Personality Types:

In [7]:
# Shortcut: read the already-cleaned posts from disk instead of re-running the preprocessing.
processed_post = (
    pd.read_csv('data/mbti_preprocessed_1.csv')
    .drop(columns='Unnamed: 0')  # presumably the index column that was written into the CSV
)
processed_post.head()

Unnamed: 0,type,posts,processed_posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,moment sportscenter top ten play pr...
1,ENTP,'I'm finding the lack of me in these posts ver...,finding lack post alarming sex boring positi...
2,INTP,'Good one _____ https://www.youtube.com/wat...,good one course say know blessing...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",dear enjoyed conversation day esoteric ...
4,ENTJ,'You're fired.|||That's another silly misconce...,fired another silly misconception approachi...


### Vectorize the posts

Create a bag of words representation of each user by using tfidf

In [8]:
# --- TF-IDF features: uni- and bi-grams, filtered by document frequency ---
# NOTE(review): get_feature_names() was removed in newer scikit-learn releases in favour of
# get_feature_names_out(); confirm the installed version before upgrading.
vectorizer_tfidf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=0.05,
    max_df=0.85,
)
word_tfidf = vectorizer_tfidf.fit_transform(processed_post['processed_posts'])
word_tfidf_df = pd.DataFrame(
    word_tfidf.toarray(),
    columns=vectorizer_tfidf.get_feature_names(),
)

# --- Raw count features ---
# max_features is capped at 1625 so the two representations can be compared (author's note).
vectorizer_ct = CountVectorizer(
    input='content',
    decode_error='ignore',
    analyzer='word',
    stop_words='english',
    token_pattern=r'\w{1,}',
    ngram_range=(1,2),
    min_df=5,
    max_df=0.48,
    max_features=1625,
)
word_ct = vectorizer_ct.fit_transform(processed_post['processed_posts'])
word_ct_df = pd.DataFrame(
    word_ct.toarray(),
    columns=vectorizer_ct.get_feature_names(),
)

In [11]:
# word_tfidf_df.head()

Unnamed: 0,ability,able,absolute,absolutely,abstract,accept,according,account,accurate,across,...,year ago,year old,yep,yes,yesterday,yet,young,younger,youtube,yup
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.067997,0.0,0.083075,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.038307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.12246,0.0444,0.0,0.106856,0.0,0.0,0.0,0.0,0.064077,0.0,...,0.0,0.063801,0.0,0.060355,0.0,0.0,0.0,0.0,0.0,0.081823
3,0.0,0.071834,0.066683,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.059121,0.0,0.055929,0.0,0.0,0.0,0.0,0.0,0.0


## Models: KNN, Random Forest

In [9]:
# Load the table used for modelling. Later cells take columns 2-5 (by position) as the
# target columns and columns 6 onward as extra features.
# NOTE(review): presumably the output of an earlier feature-engineering step — confirm the column layout.
mbti = pd.read_csv("data/mbti_FE.csv")

In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import time
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
#from random import choice
#import lightgbm as lgb
#import gc
#from xgboost.sklearn import XGBClassifier

In [11]:
def model(model, X, target, nsplits=4):
    """Evaluate one classifier on every target column with stratified shuffle splits.

    For each column of ``target`` the estimator is re-fit on every train split and
    scored on the matching test split. The mean ROC-AUC and mean F1 over the splits
    are printed per column, followed by the total wall-clock time.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``. Inside this
        function the argument shadows the function's own name.
    X : pandas.DataFrame
        Feature matrix. Rows are selected by position, so any index is accepted.
    target : pandas.DataFrame
        One label column per classification task, row-aligned with ``X``. Labels
        are expected to be binary: column 1 of ``predict_proba`` is used as the
        score for ROC-AUC.
    nsplits : int, default 4
        Number of shuffle splits per target column.

    Returns
    -------
    None
        Results are printed only.
    """
    splitter = StratifiedShuffleSplit(n_splits=nsplits, random_state=420)

    types = {'EorI':'Extroversion vs. Introversion', 'NorS': 'Intuition vs. Sensing',
                 'TorF': 'Thinking vs. Feeling','JorP': 'Judging vs. Perceiving'}
    t = time.time()
    for col in target.columns:
        # Fall back to the raw column name so an unexpected target does not abort the run.
        print(f"{types.get(col, col)}:")
        y = target[col]
        all_auc = []
        f_score = []
        for train, test in splitter.split(X, y):
            # split() yields positional row numbers, so select with .iloc; label-based
            # lookup only happens to work when the index is the default 0..n-1 range.
            X_train, X_test = X.iloc[train], X.iloc[test]
            y_train, y_test = y.iloc[train], y.iloc[test]

            model.fit(X_train, y_train)
            preds = model.predict(X_test)
            # Score of the second class (column 1) is what ROC-AUC ranks on.
            preds_act = model.predict_proba(X_test)[:, 1]

            all_auc.append(roc_auc_score(y_test, preds_act))
            # scikit-learn metrics take (y_true, y_pred) in that order.
            f_score.append(f1_score(y_test, preds))
        print(f'Average AUC: {np.mean(all_auc):.3f}; Average fscore: {np.mean(f_score):.3f}')
    print(f"Time use:{time.time()-t:.3f}s")
    

In [12]:
# Estimators evaluated in the cells below.
knn = KNeighborsClassifier(n_neighbors=3)
rf = RandomForestClassifier(
    criterion='gini',
    max_depth=80,
    max_features='auto',
    min_samples_split=5,
    random_state=42,
)

# Columns 2-5 of `mbti` (by position) are the target columns.
target = mbti.iloc[:, 2:6]

# Feature matrices: the remaining `mbti` columns joined column-wise with each bag-of-words table.
# (An earlier variant stacked them with np.column_stack after dropping the 'Sentiment' column.)
X_tf = pd.concat([mbti.iloc[:, 6:], word_tfidf_df], axis='columns')
X_ct = pd.concat([mbti.iloc[:, 6:], word_ct_df], axis='columns')

## Bagging Classifier

### Using TFIDF

In [14]:
# Bagged k-NN ensemble: max_samples=0.5 / max_features=0.5 subsample the rows and columns
# given to each base learner. random_state pins that subsampling so repeated runs of this
# cell give the same scores; without it every run draws different subsets.
bagging = BaggingClassifier(KNeighborsClassifier(),
                            max_samples=0.5, max_features=0.5,
                            random_state=42)
model(bagging, X_tf, target, nsplits=4)

Extroversion vs. Introversion:
Average AUC: 0.561; Average fscore: 0.034
Intuition vs. Sensing:
Average AUC: 0.513; Average fscore: 0.000
Thinking vs. Feeling:
Average AUC: 0.652; Average fscore: 0.534
Judging vs. Perceiving:
Average AUC: 0.503; Average fscore: 0.709
Time use:144.734s


### Using CountVectorizer

In [None]:
# Evaluate the same `bagging` ensemble on the count-vector feature matrix.
# NOTE(review): the recorded output stops after the first heading, so this run appears not to have finished.
model(bagging, X_ct, target, nsplits=4)

Extroversion vs. Introversion:


### Random Forest on tfidf

In [110]:
# Evaluate the random forest `rf` on the TF-IDF feature matrix.
# NOTE(review): this cell's execution count (110) is out of order relative to its neighbours,
# so the recorded numbers may not come from the `rf` configuration defined above it.
# Re-run the notebook top to bottom to confirm.
model(rf, X_tf, target, nsplits=4)

Extroversion vs. Introversion:
Average AUC: 0.567; Average fscore: 0.102
Intuition vs. Sensing:
Average AUC: 0.564; Average fscore: 0.024
Thinking vs. Feeling:
Average AUC: 0.716; Average fscore: 0.591
Judging vs. Perceiving:
Average AUC: 0.552; Average fscore: 0.663
Time use:17.391s


### Random Forest on CountVectorizer features

In [96]:
# Evaluate the random forest `rf` on the count-vector feature matrix.
# Parameter set noted by the author (its origin is not shown here — presumably an earlier search):
#{'n_estimators': 1600, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 80, 'bootstrap': False}
# NOTE(review): the `rf` constructed earlier in this notebook sets only min_samples_split=5,
# max_depth=80 and max_features='auto' from that list — confirm which configuration
# produced the recorded output.
model(rf, X_ct, target, nsplits=4)

Extroversion vs. Introversion:
Average AUC: 0.685; Average fscore: 0.000
Intuition vs. Sensing:
Average AUC: 0.660; Average fscore: 0.000
Thinking vs. Feeling:
Average AUC: 0.833; Average fscore: 0.701
Judging vs. Perceiving:
Average AUC: 0.678; Average fscore: 0.758
Time use:1104.125s


### KNN on tfidf ngram=(1,2)

In [97]:
# Evaluate the 3-nearest-neighbour classifier `knn` on the TF-IDF feature matrix.
# Note: this uses 5 shuffle splits, whereas the bagging and random-forest cells use 4.
model(knn, X_tf, target, nsplits=5)

Extroversion vs. Introversion:
Average AUC: 0.540; Average fscore: 0.222
Intuition vs. Sensing:
Average AUC: 0.508; Average fscore: 0.097
Thinking vs. Feeling:
Average AUC: 0.603; Average fscore: 0.555
Judging vs. Perceiving:
Average AUC: 0.498; Average fscore: 0.629
Time use:60.981s


### KNN on CountVectorizer features

In [98]:
# Evaluate the 3-nearest-neighbour classifier `knn` on the count-vector feature matrix.
# Note: this uses 5 shuffle splits, whereas the bagging and random-forest cells use 4.
model(knn, X_ct, target, nsplits=5)

Extroversion vs. Introversion:
Average AUC: 0.538; Average fscore: 0.216
Intuition vs. Sensing:
Average AUC: 0.530; Average fscore: 0.099
Thinking vs. Feeling:
Average AUC: 0.601; Average fscore: 0.546
Judging vs. Perceiving:
Average AUC: 0.504; Average fscore: 0.633
Time use:185.176s


### Random Search on Random Forest

In [99]:
def model_rs(model, X, target, nsplits=1):
    """Run a hyper-parameter search per target column and print hold-out scores.

    For each column of ``target`` the search object is fit on every train split,
    its ``best_params_`` are printed, and the refit estimator is scored on the
    matching test split. ROC-AUC and F1 are averaged over the splits before being
    printed; with the default single split this is simply that split's score.

    Parameters
    ----------
    model : search estimator
        Object exposing ``fit``, ``predict``, ``predict_proba`` and, after fitting,
        ``best_params_``.
    X : pandas.DataFrame
        Feature matrix. Rows are selected by position, so any index is accepted.
    target : pandas.DataFrame
        One label column per classification task, row-aligned with ``X``. Labels
        are expected to be binary: column 1 of ``predict_proba`` is used as the
        score for ROC-AUC.
    nsplits : int, default 1
        Number of shuffle splits per target column.

    Returns
    -------
    None
        Results are printed only.
    """
    splitter = StratifiedShuffleSplit(n_splits=nsplits, random_state=420)

    types = {'EorI':'Extroversion vs. Introversion', 'NorS': 'Intuition vs. Sensing',
                 'TorF': 'Thinking vs. Feeling','JorP': 'Judging vs. Perceiving'}
    t = time.time()
    for col in target.columns:
        # Fall back to the raw column name so an unexpected target does not abort the run.
        print(f"{types.get(col, col)}:")
        y = target[col]
        all_auc = []
        f_score = []
        for train, test in splitter.split(X, y):
            # split() yields positional row numbers, so select with .iloc; label-based
            # lookup only happens to work when the index is the default 0..n-1 range.
            X_train, X_test = X.iloc[train], X.iloc[test]
            y_train, y_test = y.iloc[train], y.iloc[test]

            model.fit(X_train, y_train)
            print(model.best_params_)
            preds = model.predict(X_test)
            # Score of the second class (column 1) is what ROC-AUC ranks on.
            preds_act = model.predict_proba(X_test)[:, 1]

            all_auc.append(roc_auc_score(y_test, preds_act))
            # scikit-learn metrics take (y_true, y_pred) in that order.
            f_score.append(f1_score(y_test, preds))

        # Report every split, not just the last one the loop happened to leave behind.
        mean_auc = sum(all_auc) / len(all_auc)
        mean_f1 = sum(f_score) / len(f_score)
        print(f'AUC: {mean_auc:.3f}; f1_score: {mean_f1:.3f}')
    print(f"Time use:{time.time()-t:.3f}s")

In [100]:
'''
In contrast to GridSearchCV, not all parameter values are tried out, 
but rather a fixed number of parameter settings is sampled from the specified distributions. 
The number of parameter settings that are tried is given by n_iter.
'''

from sklearn.model_selection import RandomizedSearchCV
# --- Candidate values for the random-forest hyper-parameter search ---

# Number of trees: 5 evenly spaced values from 100 to 2000 -> 100, 575, 1050, 1525, 2000.
n_estimators = np.linspace(100, 2000, num=5).astype(int).tolist()
# Number of features considered at every split.
max_features = ['auto']
# Maximum tree depth: 10, 20, ..., 110, plus None.
max_depth = [*np.linspace(10, 110, num=11).astype(int).tolist(), None]
# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]

# Grid handed to the search: parameter name -> list of candidates defined above.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

# Resulting grid, for reference:
#   bootstrap:         [True, False]
#   max_depth:         [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None]
#   max_features:      ['auto']
#   min_samples_leaf:  [1, 2, 4]
#   min_samples_split: [2, 5, 10]
#   n_estimators:      [100, 575, 1050, 1525, 2000]

In [101]:
# Base estimator to tune. It gets its own name so the `rf` configured earlier in the
# notebook is not silently replaced (cells that call `model(rf, ...)` would otherwise
# depend on execution order), and a fixed seed so the fitted forests are repeatable.
rf_base = RandomForestClassifier(random_state=42)

# Randomised search over `random_grid`:
#   n_iter=20        -> 20 parameter settings are sampled (trades coverage for run time)
#   cv=3             -> each setting is scored with 3-fold cross-validation (60 fits per search)
#   n_jobs=-1        -> use all processors
#   verbose=2        -> the higher, the more progress messages
#   random_state=420 -> fixes which settings get sampled
# More iterations cover a wider search space and more CV folds reduce the chance of
# overfitting, but raising either increases the run time.
# TODO: try n_iter=50?
rf_random = RandomizedSearchCV(estimator=rf_base, param_distributions=random_grid,
                               n_iter=20, cv=3, verbose=2, random_state=420, n_jobs=-1)

# Run the search once per target column on the count-vector features and print
# the best parameters together with the hold-out scores.
model_rs(rf_random, X_ct, target, nsplits=1)

Extroversion vs. Introversion:
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  9.8min finished


{'n_estimators': 1525, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': None, 'bootstrap': False}
AUC: 0.700; f1_score: 0.000
Intuition vs. Sensing:
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 10.3min finished


{'n_estimators': 575, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 80, 'bootstrap': False}
AUC: 0.685; f1_score: 0.000
Thinking vs. Feeling:
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  8.8min finished


{'n_estimators': 1525, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': None, 'bootstrap': False}
AUC: 0.838; f1_score: 0.718
Judging vs. Perceiving:
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  9.9min finished


{'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 40, 'bootstrap': False}
AUC: 0.650; f1_score: 0.750
Time use:2558.058s


### Next Step

- RF and KNN models **DONE**
- Visualization to compare the model performances
- How to explain the EorI results to the audience in an easy way
- PCA 