In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import itertools
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

import warnings
warnings.filterwarnings("ignore")


In [2]:
df = pd.read_csv('mbti_short.csv')
df.type = pd.Categorical(pd.factorize(df.type)[0] + 1)
df = df[['post', 'type']]
df.head()

Unnamed: 0,post,type
0,welcome welcome welcome!! ^^ your type rocks.....,1
1,Just realized I completely missed a letter a t...,1
2,Enough to fit a suitcase full of drugs.,1
3,Thanks! I've been working on opening up a bit ...,1
4,Ok...so im fully aware of my power of persuasi...,1


## Classification with 16 classes ##

In [3]:
X_train, X_test, y_train, y_test = train_test_split(df['post'],
                                                    df['type'],
                                                    test_size=0.2,
                                                    random_state=77)

In [4]:
vectorizer = TfidfVectorizer(stop_words='english',
                             ngram_range=(1,3),
                             min_df=1)

In [5]:
X_train_vectorized = vectorizer.fit_transform(X_train.apply(lambda x: np.str_(x)))
X_test_vectorized = vectorizer.transform(X_test.apply(lambda x: np.str_(x)))
X_train_vectorized
len(vectorizer.get_feature_names())

723155

### Logistic Model ###

In [6]:
scoring = {'acc': 'accuracy',
           'f1_micro': 'f1_micro'}

#classify(classifier, vectorizer)

pipe = Pipeline([('tfidf1', vectorizer), ('lr', LogisticRegression(class_weight="balanced", C=0.005))])

pipe.fit(X_train.apply(lambda x: np.str_(x)), y_train)


Pipeline(steps=[('tfidf1',
                 TfidfVectorizer(ngram_range=(1, 3), stop_words='english')),
                ('lr', LogisticRegression(C=0.005, class_weight='balanced'))])

In [7]:
y_pred = pipe.predict(X_test.apply(lambda x: np.str_(x)))

In [8]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

print("Raport klasyfikacji Logistic: ")
print(classification_report( y_pred, y_test))

Raport klasyfikacji Logistic: 
              precision    recall  f1-score   support

           1       0.20      0.21      0.21       341
           2       0.22      0.14      0.17       551
           3       0.21      0.26      0.23       315
           4       0.14      0.16      0.15       291
           5       0.29      0.27      0.28       422
           6       0.12      0.21      0.16       223
           7       0.22      0.20      0.21       420
           8       0.14      0.31      0.20       166
           9       0.27      0.13      0.18       827
          10       0.15      0.15      0.15       396
          11       0.23      0.13      0.16       710
          12       0.19      0.17      0.18       447
          13       0.12      0.26      0.16       172
          14       0.15      0.19      0.17       320
          15       0.15      0.22      0.18       251
          16       0.12      0.19      0.15       251

    accuracy                           0.18      

### XGB ###

In [9]:
clf = XGBClassifier(eval_metric="logloss")
clf.fit(X_train_vectorized, y_train)
y_pred = clf.predict(X_test_vectorized)

In [10]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

print("Raport klasyfikacji XGB: ")
print(classification_report( y_pred, y_test))

Raport klasyfikacji XGB: 
              precision    recall  f1-score   support

           1       0.20      0.23      0.21       322
           2       0.16      0.17      0.16       328
           3       0.17      0.17      0.17       390
           4       0.12      0.10      0.11       438
           5       0.24      0.26      0.25       376
           6       0.21      0.11      0.14       700
           7       0.19      0.23      0.21       318
           8       0.17      0.17      0.17       358
           9       0.14      0.16      0.15       334
          10       0.16      0.18      0.17       369
          11       0.10      0.13      0.12       315
          12       0.16      0.14      0.15       432
          13       0.16      0.18      0.17       356
          14       0.16      0.17      0.16       379
          15       0.17      0.20      0.19       308
          16       0.14      0.14      0.14       380

    accuracy                           0.17      6103


In [11]:
pickle.dump(vectorizer, open("vectorizer_xgb_mbti.sav", 'wb'))
pickle.dump(clf, open("model_XGB.sav", 'wb'))

In [12]:
def analyse_tweet(tweet):
    if type(tweet) != list:
        tweet = [tweet]
        
    loaded_vectorizer = pickle.load(open("vectorizer_xgb_mbti.sav", 'rb'))

    loaded_model= pickle.load(open("model_XGB.sav", 'rb'))
    input_vect = vectorizer.transform(tweet)

    result_type = loaded_model.predict(input_vect)
    
    print("The result is: ", result_type)
    
    return result_type


analyse_tweet("Kanye is a typical example of ‘most’ men always realizing when the person they took for granted moved on! Rooting for KimYe 💯 but Kim Kardashian deserves to be happy as well 📌😍 she loved Ye and fought for that marriage ")
analyse_tweet("We also passed the Bipartisan Infrastructure Law, which is going to make it easier for businesses to move goods and reach more customers than ever before.My Build Back Better Act will go even further.I’m looking forward to shopping small tomorrow, and I hope you are too.")
analyse_tweet("Mike Pence didn’t have the courage to do what should have been done to protect our Country and our Constitution, giving States a chance to certify a corrected set of facts, not the fraudulent or inaccurate ones which they were asked to previously certify. USA demands the truth!")

The result is:  [10]
The result is:  [3]
The result is:  [4]


array([4], dtype=int64)

##  Classification 4 x binary instead of 16 classes ##

In [19]:
df = pd.read_csv('mbti_short.csv')
df = df[['post', 'type']]


print(df['type'].unique())

['ENFJ' 'ENFP' 'ENTJ' 'ENTP' 'ESFJ' 'ESFP' 'ESTJ' 'ESTP' 'INFJ' 'INFP'
 'INTJ' 'INTP' 'ISFJ' 'ISFP' 'ISTJ' 'ISTP']


In [20]:
train, test = train_test_split(df, test_size=0.2,random_state=0)

In [21]:
vectorizer = TfidfVectorizer(stop_words='english',
                             ngram_range=(1,1),
                             min_df=2)

In [22]:
X_train_vectorized = vectorizer.fit_transform(train['post'].apply(lambda x: np.str_(x)))
X_test_vectorized = vectorizer.transform(test['post'].apply(lambda x: np.str_(x)))

In [23]:
type_indicators = [ "IE: Introversion (I) / Extroversion (E)", "NS: Intuition (N) – Sensing (S)", 
                   "FT: Feeling (F) - Thinking (T)", "JP: Judging (J) – Perceiving (P)"  ]

for l in range(len(type_indicators)):
    print(type_indicators[l])

IE: Introversion (I) / Extroversion (E)
NS: Intuition (N) – Sensing (S)
FT: Feeling (F) - Thinking (T)
JP: Judging (J) – Perceiving (P)


In [24]:
b_Pers = {'I':0, 'E':1, 'N':0, 'S':1, 'F':0, 'T':1, 'J':0, 'P':1}
b_Pers_list = [{0:'I', 1:'E'}, {0:'N', 1:'S'}, {0:'F', 1:'T'}, {0:'J', 1:'P'}]

def translate_personality(personality):
    # transform mbti to binary vector
    
    return [b_Pers[l] for l in personality]

def translate_back(personality):
    # transform binary vector to mbti personality
    
    s = ""
    for i, l in enumerate(personality):
        s += b_Pers_list[i][l]
    return s


# Posts in tf-idf representation
X_train = X_train_vectorized
classes_train = np.array([translate_personality(p) for p in train.type])

X_test = X_test_vectorized
classes_test = np.array([translate_personality(p) for p in test.type])

In [25]:
import warnings
warnings.filterwarnings("ignore")
import pickle
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB

result=[]
# Let's train type indicator individually
for l in range(len(type_indicators)):
    print("%s ..." % (type_indicators[l]))
    
    # Let's train type indicator individually
    y_train = classes_train[:,l]
    y_test = classes_test[:,l]

    # fit model on training data
    model =  XGBClassifier()
    model.fit(X_train, y_train)

    # make predictions for test data
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]
    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    
    filename = 'model_' +type_indicators[l][0:2]+'.sav'
    pickle.dump(model, open(filename, 'wb'))

    print(" \n %s Accuracy: %.2f \n" % (type_indicators[l], accuracy * 100.0))

IE: Introversion (I) / Extroversion (E) ...
 
 IE: Introversion (I) / Extroversion (E) Accuracy: 58.02 

NS: Intuition (N) – Sensing (S) ...
 
 NS: Intuition (N) – Sensing (S) Accuracy: 61.28 

FT: Feeling (F) - Thinking (T) ...
 
 FT: Feeling (F) - Thinking (T) Accuracy: 60.81 

JP: Judging (J) – Perceiving (P) ...
 
 JP: Judging (J) – Perceiving (P) Accuracy: 56.45 



In [26]:
pickle.dump(vectorizer, open("vectorizer_mbti.sav", 'wb'))

In [27]:

def analyse_tweet(tweet):
    if type(tweet) != list:
        tweet = [tweet]
        
    loaded_vectorizer = pickle.load(open("vectorizer_mbti.sav", 'rb'))

    loaded_model_IE = pickle.load(open("model_IE.sav", 'rb'))
    loaded_model_NS = pickle.load(open("model_NS.sav", 'rb'))
    loaded_model_FT = pickle.load(open("model_FT.sav", 'rb'))
    loaded_model_JP = pickle.load(open("model_JP.sav", 'rb'))

    input_vect = vectorizer.transform(tweet)
    
    result_bin = []
    result_bin.append(loaded_model_IE.predict(input_vect)[0])
    result_bin.append(loaded_model_NS.predict(input_vect)[0])
    result_bin.append(loaded_model_FT.predict(input_vect)[0])
    result_bin.append(loaded_model_JP.predict(input_vect)[0])

    result_type = translate_back(result_bin)
    
    print("The result is: ", result_type)
    
    return result_type


analyse_tweet("Kanye is a typical example of ‘most’ men always realizing when the person they took for granted moved on! Rooting for KimYe 💯 but Kim Kardashian deserves to be happy as well 📌😍 she loved Ye and fought for that marriage ")
analyse_tweet("We also passed the Bipartisan Infrastructure Law, which is going to make it easier for businesses to move goods and reach more customers than ever before.My Build Back Better Act will go even further.I’m looking forward to shopping small tomorrow, and I hope you are too.")
analyse_tweet("Mike Pence didn’t have the courage to do what should have been done to protect our Country and our Constitution, giving States a chance to certify a corrected set of facts, not the fraudulent or inaccurate ones which they were asked to previously certify. USA demands the truth!")

The result is:  ENFP
The result is:  INFJ
The result is:  ISTP


'ISTP'