In [4]:
import ast

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

# Load Data

In [30]:
# Load the preprocessed posts and the engineered feature table. Both CSVs carry
# a leftover 'Unnamed: 0' column (presumably the index saved by an earlier
# to_csv call - confirm), which is dropped right after reading.
mbti_processed = pd.read_csv('data/mbti_preprocessed_ben2.csv').drop(columns=['Unnamed: 0'])
mbti_features = pd.read_csv('data/mbti_FE_ben.csv').drop(columns=['Unnamed: 0'])
# 'split_posts' is stored as the text form of a Python list. Parse it with
# ast.literal_eval instead of eval: it only accepts literals, so nothing in the
# CSV can execute code while the notebook loads.
mbti_processed['split_posts'] = mbti_processed['split_posts'].apply(ast.literal_eval)

In [31]:
# Preview the first rows of the preprocessed posts table.
mbti_processed.head()

Unnamed: 0,type,posts,processed_posts,split_posts,processed_post_type
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,moment sportscenter top ten play pr...,"[moment sportscenter top ten play prank, life ...",enfp intj moment sportscenter top ten pl...
1,ENTP,'I'm finding the lack of me in these posts ver...,finding lack post alarming sex boring positi...,"[finding lack post alarming, sex boring positi...",finding lack post alarming sex boring positi...
2,INTP,'Good one _____ https://www.youtube.com/wat...,good one course say know blessing...,"[good one course say know blessing curse, abso...",good one course say know blessing...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",dear enjoyed conversation day esoteric ...,[dear enjoyed conversation day esoteric gabbin...,dear intp enjoyed conversation day esote...
4,ENTJ,'You're fired.|||That's another silly misconce...,fired another silly misconception approachi...,"[fired, another silly misconception approachin...",fired another silly misconception approachi...


In [7]:
# Preview the first rows of the engineered feature table.
mbti_features.head()

Unnamed: 0,type,posts,EorI,NorS,TorF,JorP,avg_comment_length,comment_length_var,Sentiment,ellipses,...,upper,num_posts,Avg_Sentiment,avg_ellipses,avg_exclamation,avg_question,avg_links,avg_picture,avg_emojies,avg_upper
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,0,0,0,0,11.12,135.29,0.9924,8,...,13,32,0.205894,0.25,0.09375,0.5625,0.75,0.1875,0.125,0.40625
1,ENTP,'I'm finding the lack of me in these posts ver...,1,0,1,1,23.4,187.4756,0.9987,18,...,82,45,0.276524,0.4,0.0,0.111111,0.2,0.177778,0.311111,1.822222
2,INTP,'Good one _____ https://www.youtube.com/wat...,0,0,1,1,16.72,180.69,0.9985,13,...,26,38,0.238034,0.342105,0.105263,0.315789,0.105263,0.0,0.263158,0.684211
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",0,0,1,0,21.28,181.8324,0.9985,23,...,57,48,0.246535,0.479167,0.0625,0.229167,0.041667,0.0,0.0,1.1875
4,ENTJ,'You're fired.|||That's another silly misconce...,1,0,1,0,19.34,196.4576,0.9914,20,...,35,45,0.117564,0.444444,0.022222,0.222222,0.133333,0.044444,0.022222,0.777778


In [8]:
# TF-IDF over the cleaned posts (one document per row of 'processed_posts').
# * min_df=25 drops terms found in fewer than 25 documents, which removes odd
#   one-off tokens such as 'aaaaa'.
# * max_df=.8 drops terms found in more than 80% of the documents (too common).
# * Tokens shorter than 3 characters are not filtered out on purpose: acronyms
#   and words like 'aha' may still carry useful signal.
# NOTE(review): an earlier version of this comment mentioned a 5000-term cap
# (vs. Ben's 7857), but no max_features is passed here, so the whole filtered
# vocabulary is kept - confirm that this is the intended baseline.
vectorizer = TfidfVectorizer(max_df=.8, min_df=25)
word_count = vectorizer.fit_transform(mbti_processed['processed_posts'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old method on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    tfidf_terms = vectorizer.get_feature_names_out()
else:
    tfidf_terms = vectorizer.get_feature_names()
# Dense document-term frame with one column per vocabulary term
mbti_tfidf = pd.DataFrame(data=word_count.toarray(), columns=tfidf_terms)
mbti_tfidf.head()

Unnamed: 0,aa,ab,aback,abandon,abandoned,abandoning,abandonment,abbey,abbreviation,abhor,...,zelda,zen,zero,zodiac,zombie,zone,zoned,zoning,zoo,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.06229,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Additional Cleaning

* Multinomial Naive Bayes does not accept negative feature values.
* The sentiment scores contain NaNs.

In [9]:
# The feature table contains NaNs; replace every missing value with 0.
mbti_features = mbti_features.fillna(0)

In [12]:
# Multinomial Naive Bayes rejects negative feature values, and Avg_Sentiment
# presumably contains negative scores, so add a min-max scaled copy of it
# (MinMaxScaler's default output range is [0, 1]).
min_max_scaler = MinMaxScaler()
# Double brackets keep the column 2-D, shape (n_samples, 1), as the scaler expects.
avg_sentiment_scaled = min_max_scaler.fit_transform(mbti_features[['Avg_Sentiment']].values)
# Flatten back to 1-D before storing the result as a single column.
mbti_features['Avg_Sentiment_Scaled'] = avg_sentiment_scaled.ravel()

# Fit Model

In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')

In [18]:
def base_model(model, X, target, nsplits=4):
    """Evaluate ``model`` on every target column and print the mean scores.

    For each column of ``target`` the rows are split ``nsplits`` times with a
    seeded ``StratifiedShuffleSplit`` (scikit-learn's default hold-out size).
    ``model`` is refit on each training part and scored on the held-out part
    with weighted F1 and accuracy; the mean of both scores over the splits is
    printed for the column.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. The same instance is
        refit in place on every split.
    X : array-like of shape (n_samples, n_features)
        Feature matrix. Anything that supports ``X[index_array]`` row selection
        works, and a pandas DataFrame is converted to a NumPy array first.
    target : pandas.DataFrame
        One label column per classification task, rows aligned with ``X``.
    nsplits : int, default 4
        Number of shuffle splits evaluated per target column.

    Returns
    -------
    None
        The scores are only printed.
    """
    splitter = StratifiedShuffleSplit(n_splits=nsplits, random_state=420)
    # The splitter yields positional indices, so make sure rows are selected by
    # position: a DataFrame would interpret X[train] as column labels.
    features = X.to_numpy() if hasattr(X, 'to_numpy') else X

    for col in target.columns:
        print(f"This is the training for {col}:")
        # Plain array for the same reason: Series[int_array] is a label lookup
        # and breaks as soon as the index is not the default 0..n-1 range.
        labels = np.asarray(target[col])
        all_fscores = []
        all_accuracies = []
        for train, test in splitter.split(features, labels):
            model.fit(features[train], labels[train])
            preds = model.predict(features[test])
            all_fscores.append(f1_score(labels[test], preds, average='weighted'))
            all_accuracies.append(accuracy_score(labels[test], preds))
        print(f'Average F1-score: {np.mean(all_fscores)}; Average Accuracy: {np.mean(all_accuracies)}')

In [20]:
# Targets: the four MBTI axis indicator columns, selected by name so the choice
# does not silently change if the column order of the feature file changes.
target = mbti_features[['EorI', 'NorS', 'TorF', 'JorP']]

List of features and index position

In [210]:
# Column names of the feature table, each paired with its positional index
features = mbti_features.columns.tolist()
[(name, position) for position, name in enumerate(features)]

[('type', 0),
 ('posts', 1),
 ('EorI', 2),
 ('NorS', 3),
 ('TorF', 4),
 ('JorP', 5),
 ('avg_comment_length', 6),
 ('comment_length_var', 7),
 ('Sentiment', 8),
 ('ellipses', 9),
 ('exclamation', 10),
 ('question', 11),
 ('links', 12),
 ('picture', 13),
 ('emojies', 14),
 ('upper', 15),
 ('num_posts', 16),
 ('Avg_Sentiment', 17),
 ('avg_ellipses', 18),
 ('avg_exclamation', 19),
 ('avg_question', 20),
 ('avg_links', 21),
 ('avg_picture', 22),
 ('avg_emojies', 23),
 ('avg_upper', 24),
 ('Avg_Sentiment_Scaled', 25)]

## NB: Base Model (tfidf only)

In [None]:
# Baseline input: the TF-IDF frame as a plain array, fed to a default MultinomialNB
X = np.array(mbti_tfidf)
MNB = MultinomialNB()

In [218]:
# Score the TF-IDF-only Naive Bayes baseline over 5 shuffle splits
base_model(model=MNB, X=X, target=target, nsplits=5)

This is the training for EorI:
Average F1-score: 0.6699245925388893; Average Accuracy: 0.7698156682027649
This is the training for NorS:
Average F1-score: 0.7977597298900396; Average Accuracy: 0.8617511520737328
This is the training for TorF:
Average F1-score: 0.7527058169378001; Average Accuracy: 0.758294930875576
This is the training for JorP:
Average F1-score: 0.511502409676997; Average Accuracy: 0.6205069124423963


## NB: Testing Individual Features

In [235]:
# Candidate extra features: every engineered column except
#   * the identifier / label columns, and
#   * 'Sentiment' and 'Avg_Sentiment' (presumably because they can be negative;
#     'Avg_Sentiment_Scaled' is kept instead - confirm), and
#   * 'avg_question', which has a known issue (not investigated yet).
# Columns are excluded by name rather than by positional slices, so the
# selection does not depend on the column order of the feature file.
non_feature_columns = {'type', 'posts', 'EorI', 'NorS', 'TorF', 'JorP'}
excluded_features = {'Sentiment', 'Avg_Sentiment', 'avg_question'}
skipped_columns = non_feature_columns | excluded_features
test_features = [name for name in features if name not in skipped_columns]
test_features

['avg_comment_length',
 'comment_length_var',
 'ellipses',
 'exclamation',
 'question',
 'links',
 'picture',
 'emojies',
 'upper',
 'num_posts',
 'avg_ellipses',
 'avg_exclamation',
 'avg_links',
 'avg_picture',
 'avg_emojies',
 'avg_upper',
 'Avg_Sentiment_Scaled']

In [236]:
# Evaluate Naive Bayes on TF-IDF plus one extra engineered feature at a time.
for feature in test_features:
    print("Tfidf + %s" % feature)
    # The extra feature becomes the first column, followed by the TF-IDF terms.
    X = np.column_stack([mbti_features[feature], mbti_tfidf])
    base_model(model=MNB, X=X, target=target, nsplits=5)
    print("\n")

Tfidf + avg_comment_length
This is the training for EorI:
Average F1-score: 0.6702436890230993; Average Accuracy: 0.7695852534562212
This is the training for NorS:
Average F1-score: 0.7977597298900396; Average Accuracy: 0.8617511520737328
This is the training for TorF:
Average F1-score: 0.7360953140771856; Average Accuracy: 0.7451612903225807
This is the training for JorP:
Average F1-score: 0.49428918900597446; Average Accuracy: 0.6158986175115209


Tfidf + comment_length_var
This is the training for EorI:
Average F1-score: 0.6790147539423866; Average Accuracy: 0.7571428571428571
This is the training for NorS:
Average F1-score: 0.7955691435293819; Average Accuracy: 0.846774193548387
This is the training for TorF:
Average F1-score: 0.7324921041535697; Average Accuracy: 0.7410138248847926
This is the training for JorP:
Average F1-score: 0.5651051611643234; Average Accuracy: 0.6138248847926266


Tfidf + ellipses
This is the training for EorI:
Average F1-score: 0.6730895862486614; Average 

## NB: Smaller TFIDF

In [238]:
# Sweep the vocabulary cap: refit TF-IDF with max_features = 1000, 2000, ..., 9000
# and re-run the Naive Bayes baseline for each setting.
# NOTE: the loop rebinds the notebook-level `vectorizer`, `word_count`,
# `mbti_tfidf` and `X`; once it finishes they hold the 9000-term version.
for max_tfidf in range(1000,10000,1000):
    vectorizer = TfidfVectorizer(max_df=.8, min_df=25, max_features=max_tfidf)
    word_count = vectorizer.fit_transform(mbti_processed['processed_posts'])
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when it exists and fall back to the old method on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        tfidf_terms = vectorizer.get_feature_names_out()
    else:
        tfidf_terms = vectorizer.get_feature_names()
    # Create word_count dataframe
    mbti_tfidf = pd.DataFrame(data=word_count.toarray(), columns=tfidf_terms)
    print("Tfidf with max %s" % max_tfidf)
    X = np.array(mbti_tfidf)
    base_model(MNB, X, target, nsplits=5)
    print("\n")

Tfidf with max 1000
This is the training for EorI:
Average F1-score: 0.6693788402457758; Average Accuracy: 0.7695852534562212
This is the training for NorS:
Average F1-score: 0.7977597298900396; Average Accuracy: 0.8617511520737328
This is the training for TorF:
Average F1-score: 0.7314441999082126; Average Accuracy: 0.7364055299539171
This is the training for JorP:
Average F1-score: 0.4998581845798852; Average Accuracy: 0.6195852534562212


Tfidf with max 2000
This is the training for EorI:
Average F1-score: 0.6693788402457758; Average Accuracy: 0.7695852534562212
This is the training for NorS:
Average F1-score: 0.7977597298900396; Average Accuracy: 0.8617511520737328
This is the training for TorF:
Average F1-score: 0.7476428626336877; Average Accuracy: 0.7513824884792626
This is the training for JorP:
Average F1-score: 0.5255250278294292; Average Accuracy: 0.626958525345622


Tfidf with max 3000
This is the training for EorI:
Average F1-score: 0.6693788402457758; Average Accuracy: 0.

## SVM: Base Model (tfidf only)

In [23]:
# TF-IDF restricted to 1000 terms (max_features=1000) for the SVM experiments.
vectorizer = TfidfVectorizer(max_df=.8, min_df=25, max_features=1000)
word_count = vectorizer.fit_transform(mbti_processed['processed_posts'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old method on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    tfidf_terms = vectorizer.get_feature_names_out()
else:
    tfidf_terms = vectorizer.get_feature_names()
# Create word_count dataframe
mbti_tfidf = pd.DataFrame(data=word_count.toarray(), columns=tfidf_terms)

In [26]:
# Linear-kernel SVM on the TF-IDF matrix, scored on a single shuffle split
X = np.array(mbti_tfidf)
SVM = SVC(kernel='linear')
base_model(model=SVM, X=X, target=target, nsplits=1)

This is the training for EorI:
Average F1-score: 0.7120901067412595; Average Accuracy: 0.7695852534562212
This is the training for NorS:
Average F1-score: 0.7977597298900396; Average Accuracy: 0.8617511520737328
This is the training for TorF:
Average F1-score: 0.7742755720981527; Average Accuracy: 0.7741935483870968
This is the training for JorP:
Average F1-score: 0.636067507035249; Average Accuracy: 0.6497695852534562


In [27]:
# RBF-kernel SVM on the TF-IDF matrix, scored on a single shuffle split
# NOTE(review): the recorded scores for this run are identical to the 'poly'
# run - worth checking whether both end up predicting a single class.
X = np.array(mbti_tfidf)
SVM = SVC(kernel='rbf')
base_model(model=SVM, X=X, target=target, nsplits=1)

This is the training for EorI:
Average F1-score: 0.6693788402457758; Average Accuracy: 0.7695852534562212
This is the training for NorS:
Average F1-score: 0.7977597298900396; Average Accuracy: 0.8617511520737328
This is the training for TorF:
Average F1-score: 0.38040820262721026; Average Accuracy: 0.5414746543778802
This is the training for JorP:
Average F1-score: 0.45449970867101014; Average Accuracy: 0.6036866359447005


In [28]:
# Polynomial-kernel SVM on the TF-IDF matrix, scored on a single shuffle split
# NOTE(review): the recorded scores for this run are identical to the 'rbf'
# run - worth checking whether both end up predicting a single class.
X = np.array(mbti_tfidf)
SVM = SVC(kernel='poly')
base_model(model=SVM, X=X, target=target, nsplits=1)

This is the training for EorI:
Average F1-score: 0.6693788402457758; Average Accuracy: 0.7695852534562212
This is the training for NorS:
Average F1-score: 0.7977597298900396; Average Accuracy: 0.8617511520737328
This is the training for TorF:
Average F1-score: 0.38040820262721026; Average Accuracy: 0.5414746543778802
This is the training for JorP:
Average F1-score: 0.45449970867101014; Average Accuracy: 0.6036866359447005


## NB: TFIDF with Types

In [32]:
# Same 1000-term TF-IDF, but fitted on 'processed_post_type' instead of
# 'processed_posts'; the result is kept in its own frame, mbti_tfidf_type.
vectorizer = TfidfVectorizer(max_df=.8, min_df=25, max_features=1000)
word_count = vectorizer.fit_transform(mbti_processed['processed_post_type'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old method on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    tfidf_terms = vectorizer.get_feature_names_out()
else:
    tfidf_terms = vectorizer.get_feature_names()
# Create word_count dataframe
mbti_tfidf_type = pd.DataFrame(data=word_count.toarray(), columns=tfidf_terms)

In [33]:
# Input for the "with types" run: the type-aware TF-IDF frame as a plain array,
# paired with a fresh default MultinomialNB
X = np.array(mbti_tfidf_type)
MNB = MultinomialNB()

In [34]:
# Score Naive Bayes on the type-aware TF-IDF features over 5 shuffle splits
base_model(model=MNB, X=X, target=target, nsplits=5)

This is the training for EorI:
Average F1-score: 0.676763598801341; Average Accuracy: 0.7716589861751153
This is the training for NorS:
Average F1-score: 0.7977597298900396; Average Accuracy: 0.8617511520737328
This is the training for TorF:
Average F1-score: 0.8077215240265397; Average Accuracy: 0.8099078341013826
This is the training for JorP:
Average F1-score: 0.5952533853614124; Average Accuracy: 0.6695852534562212
