# Meyers Briggs Type Indicator (MBTI) Project

## By The Introverts 
1. Nan Lin
2. Zack Pan
3. Ben Khuong
4. Tomohiko Ishihara
5. Donya Fozoonmayeh

In [195]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
import pandas as pd

## Load Data

#### TFIDF Feature Set: 

In [17]:
# Load processed text data
mbti_processed = pd.read_csv('data/mbti_preprocessed_1.csv', index_col=0)
mbti_processed.tail()


Unnamed: 0,type,posts,processed_posts
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...,ixfp always think cat fi doms reason esp...
8671,ENFP,'So...if this thread already exists someplace ...,thread already exists someplace else he...
8672,INTP,'So many questions when i do these things. I ...,many question thing would take purple pill ...
8673,INFP,'I am very conflicted right now when it comes ...,conflicted right come wanting child honestl...
8674,INFP,'It has been too long since I have been on per...,long since personalitycafe although seem ch...


In [262]:
# Extract TFIDF scores from mbti_processed
vectorizer = TfidfVectorizer(min_df=0.05, max_df=0.85, analyzer='word', ngram_range=(1, 2))
word_count = vectorizer.fit_transform(mbti_processed['processed_posts'])
mbti_tfidf = pd.DataFrame(data=word_count.toarray(),
                          columns=vectorizer.get_feature_names())
mbti_tfidf.head()

Unnamed: 0,ability,able,absolute,absolutely,abstract,accept,according,account,accurate,across,...,year ago,year old,yep,yes,yesterday,yet,young,younger,youtube,yup
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.067997,0.0,0.083075,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.038307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.12246,0.0444,0.0,0.106856,0.0,0.0,0.0,0.0,0.064077,0.0,...,0.0,0.063801,0.0,0.060355,0.0,0.0,0.0,0.0,0.0,0.081823
3,0.0,0.071834,0.066683,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.059121,0.0,0.055929,0.0,0.0,0.0,0.0,0.0,0.0


#### Nominalizing Types:

In [254]:
# Types per category
# analyze types by sub-category may be easier
mbti_processed['EorI']= mbti_processed['type'].apply(lambda x:x[0])
mbti_processed['NorS']= mbti_processed['type'].apply(lambda x:x[1])
mbti_processed['TorF']= mbti_processed['type'].apply(lambda x:x[2])
mbti_processed['JorP']= mbti_processed['type'].apply(lambda x:x[3])

In [255]:
# A type consists of 4 capitals and each capital corresponds to 2 possible characteristics.
# For later encoding and modeling issues, let's tranfer them into separate binary code.
type_map = {'I':0, 'E':1, 'N':0, 'S':1, 'F':0, 'T':1, 'J':0, 'P':1}
# transfer column 3-6 into binary code.
def type_preprocess(df):
    for i in range(3,7):
        df.iloc[:,i] = df.iloc[:,i].map(type_map)
    return df

In [256]:
mbti_processed = type_preprocess(mbti_processed)

In [257]:
mbti_processed.head()

Unnamed: 0,type,posts,processed_posts,EorI,NorS,TorF,JorP
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,moment sportscenter top ten play pr...,0,0,0,0
1,ENTP,'I'm finding the lack of me in these posts ver...,finding lack post alarming sex boring positi...,1,0,1,1
2,INTP,'Good one _____ https://www.youtube.com/wat...,good one course say know blessing...,0,0,1,1
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",dear enjoyed conversation day esoteric ...,0,0,1,0
4,ENTJ,'You're fired.|||That's another silly misconce...,fired another silly misconception approachi...,1,0,1,0


## Fit scikit-learn model

In [258]:
from numpy import mean
from helperfunctions.PrettyConfusionMatrix import print_cm
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [259]:
def multinomialnb_model(model, X, target, nsplits=4):
    kf = StratifiedShuffleSplit(n_splits=nsplits, random_state=420)
    
    types = {'EorI':'Extroversion vs. Introversion', 'NorS': 'Intuition vs. Sensing',
                 'TorF': 'Thinking vs. Feeling','JorP': 'Judging vs. Perceiving'}

    for col in target.columns:
        print(f"This is the training for {types[col]}:")
        y = target[col]
        all_fscores = []
        all_accuracies = []
        for train, test in kf.split(X,y):
            X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
            probs = np.ones((len(y_test), 16))
            model.fit(X_train, y_train)
            preds = model.predict(X_test)
            fscore = f1_score(y_test, preds, average='weighted')
            all_fscores.append(fscore)
            accuracy = accuracy_score(preds, y_test)
            all_accuracies.append(accuracy)
            model_name = str(model).split('(')[0]
        print(f'Average F1-score: {mean(all_fscores):.3f}; Average Accuracy: {mean(all_accuracies):.3f}')
        print_cm(confusion_matrix(y_test, preds,[1,0]),['1','0'])
        

In [260]:
# initialize
MNB = MultinomialNB()
target = mbti_processed.iloc[:,3:7]
X = np.array(mbti_tfidf)

## Evaluation Metric

In [261]:
multinomialnb_model(MNB, X, target, nsplits=5)

This is the training for Extroversion vs. Introversion:
Average F1-score: 0.669; Average Accuracy: 0.770
              1     0 
        1   0.0 200.0 
        0   0.0 668.0 
This is the training for Intuition vs. Sensing:
Average F1-score: 0.798; Average Accuracy: 0.862
              1     0 
        1   0.0 120.0 
        0   0.0 748.0 
This is the training for Thinking vs. Feeling:
Average F1-score: 0.744; Average Accuracy: 0.747
              1     0 
        1 252.0 146.0 
        0  72.0 398.0 
This is the training for Judging vs. Perceiving:
Average F1-score: 0.518; Average Accuracy: 0.626
              1     0 
        1 514.0  10.0 
        0 312.0  32.0 
