In [1]:
import os, sys, time
import multiprocessing
import pickle
import re, string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

## Preprocess

In [2]:
# Load the MBTI dataset; each row carries a "type" label and a "posts" text field.
data = pd.read_csv("mbti_1.csv")
n_users = len(data)
posts = data["posts"]

# Class names in order of first appearance; a name's position is its integer class id.
labels = data["type"].unique()
n_class = len(labels)
type2num = dict(zip(labels, range(n_class)))

# Integer-encoded target vector, aligned row-for-row with `data`.
Y = np.array([type2num[mbti_type] for mbti_type in data["type"].to_numpy()])

In [3]:
def plot_distribution():
    """Show a bar chart of how many rows of ``data`` carry each MBTI type.

    Reads the module-level ``data`` frame, counts the values of its ``"type"``
    column (most frequent first, as returned by ``value_counts``) and displays
    the figure with ``plt.show()``. Returns nothing.
    """
    fig, ax = plt.subplots(figsize=(10,4))
    type_counts = data["type"].value_counts()
    type_names = type_counts.index
    x = np.arange(len(type_names))
    ax.bar(x, type_counts.values)
    ax.set_title("MBTI type distribution")
    ax.set_xlabel("MBTI type")
    ax.set_ylabel("# of people")
    ax.set_xticks(x)
    # The rotation must be a number: the string '45' is rejected by newer
    # Matplotlib releases, which only accept 'horizontal'/'vertical' as strings.
    ax.set_xticklabels(type_names,rotation=45)
    # Draw the grid behind the bars so it does not cut through them.
    ax.set_axisbelow(True)
    ax.yaxis.grid(color='gray', linestyle='dashed')
    fig.tight_layout()
    plt.show()

In [4]:
def generate_posts(path=""):
    """Tokenise every user's posts, caching the result as ``posts.pkl`` under ``path``.

    When the cache file exists it is loaded and returned unchanged. Otherwise
    each entry of the module-level ``posts`` (indexed ``0 .. n_users-1``) is
    cleaned: URLs, digits and ASCII punctuation are removed, ``:)`` / ``:(``
    become the words "smile" / "sad", the text is lower-cased, and
    single-character tokens and stop words (read from ``stopwords.csv``) are
    dropped.

    Side effects when the cache is rebuilt: writes ``posts.pkl`` under ``path``
    and ``posts.corpus`` (one user per line, tokens separated by spaces) in the
    current working directory, which is where ``train_w2v`` looks for it.

    Args:
        path: directory holding the ``posts.pkl`` cache.

    Returns:
        list[list[str]]: one token list per user.
    """
    filename = os.path.join(path,"posts.pkl")
    if os.path.isfile(filename):
        # NOTE: unpickling can execute arbitrary code -- only load caches
        # that were written by this notebook.
        with open(filename,"rb") as cache:
            user_posts = pickle.load(cache)
        print("Loaded user posts")
        return user_posts

    raw_stopwords = pd.read_csv("stopwords.csv").to_numpy().reshape(-1)
    # Apostrophes are stripped because punctuation is removed from the posts
    # before matching. A set gives O(1) membership tests per token.
    stopwords = {s.replace("'","") for s in raw_stopwords}

    # A URL ends at the first space or quote, or at the end of the text, so a
    # trailing link is removed as well.
    url_pattern = re.compile(r"""https?://.*?(?:[ '"]|$)""")
    digits_pattern = re.compile(r"[0-9]+")
    spaces_pattern = re.compile(" +")
    drop_punctuation = str.maketrans('', '', string.punctuation)

    user_posts = []
    for uid in range(n_users):
        # add empty space first (better used for regex parsing)
        text = posts[uid].replace("|||"," ||| ")
        text = text.replace(",",", ")
        # remove url links
        text = url_pattern.sub("",text)
        # avoid words in two sentences merged together after removing spaces
        text = text.replace(".",". ")
        # change emoji to word
        text = text.replace(":)"," smile ")
        text = text.replace(":("," sad ")
        # remove useless numbers and punctuations
        text = digits_pattern.sub("", text)
        text = text.translate(drop_punctuation)
        # remove redundant empty spaces, then lower-case everything
        text = spaces_pattern.sub(" ",text).strip().lower()
        # drop single characters and stop words
        user_posts.append([word for word in text.split()
                           if len(word) != 1 and word not in stopwords])
        if uid * 100 % n_users == 0:
            print("Done {}/{} = {}%".format(uid,n_users,uid*100/n_users))
    print("Finished generating word list")

    with open(filename,"wb") as cache:
        pickle.dump(user_posts,cache)
    # Explicit UTF-8 so the corpus does not depend on the platform's default encoding.
    with open("posts.corpus","w",encoding="utf-8") as corpus:
        for post in user_posts:
            corpus.write(" ".join(post) + "\n")
    print("Saved to posts.corpus!")
    return user_posts

# Build the tokenised posts, or load them from the posts.pkl cache when it already exists.
user_posts = generate_posts()

Loaded user posts


## Generate TF-IDF model

In [5]:
def generate_tfidf(user_posts, path="", retrain=False, min_count=6):
    """Build (or load from cache) the per-user TF-IDF matrix.

    The vocabulary keeps every word whose total count over all posts is at
    least ``min_count``, ordered by descending count. For word ``w`` and
    user ``u``::

        tf(u, w)  = count of w in u's tokens / number of tokens of u
        idf(w)    = log(n_docs / (number of users using w + 1))

    Side effects when the matrix is rebuilt: writes ``tfidf.npy``,
    ``idf.npy`` and ``int2word.npy`` (an index -> word dict) under ``path``.

    Args:
        user_posts: list of token lists, one per user.
        path: directory holding the cache files.
        retrain: rebuild even if ``tfidf.npy`` already exists.
        min_count: minimum corpus-wide count for a word to enter the vocabulary.

    Returns:
        np.ndarray of shape ``(len(user_posts), vocabulary size)``.
    """
    filename = os.path.join(path,"tfidf.npy")
    if os.path.isfile(filename) and not retrain:
        # The cache is a plain float array, so pickle support is not needed to load it.
        tfidf_values = np.load(filename)
        print("Loaded {}".format(filename))
    else:
        n_docs = len(user_posts)

        # make dictionary (used for TF-IDF)
        word_counts = Counter()
        for post in user_posts:
            word_counts.update(post)

        # remove words that don't occur frequently enough
        print("# of words before:",len(word_counts))
        vocab_counts = {word: count for word, count in word_counts.items()
                        if count >= min_count}
        print("# of words after:",len(vocab_counts))

        sorted_vocab = sorted(vocab_counts, key=vocab_counts.get, reverse=True)
        int_to_word = dict(enumerate(sorted_vocab))
        word_to_int = {w: k for k, w in int_to_word.items()}
        np.save(os.path.join(path,"int2word.npy"),int_to_word)

        # generate IDF value: count in how many posts each vocabulary word occurs
        doc_freq = np.zeros((len(sorted_vocab),))
        for uid, post in enumerate(user_posts):
            for word in set(post): # count each word once per user
                # dict lookup instead of scanning the vocabulary list for every word
                idx = word_to_int.get(word)
                if idx is not None:
                    doc_freq[idx] += 1
            if uid * 100 % n_docs == 0:
                print("Done {}/{} = {}%".format(uid,n_docs,uid*100/n_docs))
        idf = np.log(n_docs / (doc_freq + 1)) # +1 avoids dividing by 0
        print("Finished generating IDF values")
        np.save(os.path.join(path,"idf.npy"),idf)

        # generate TF value
        tfidf_values = np.zeros((n_docs,len(idf)))
        for i, post in enumerate(user_posts):
            if len(post) == 0:
                continue # leave an all-zero row for users without tokens
            for post_word in post:
                idx = word_to_int.get(post_word)
                if idx is not None:
                    tfidf_values[i][idx] += 1
            tfidf_values[i] /= len(post)
        print("Finished generating TF values")
        tfidf_values *= idf
        print(tfidf_values.shape)
        np.save(filename,tfidf_values)
        print("Saved to {}!".format(filename))
    n_words = tfidf_values.shape[1]
    print('Vocabulary size:', n_words)
    return tfidf_values

# Compute the TF-IDF matrix, or load it from the tfidf.npy cache when it already exists.
tfidf = generate_tfidf(user_posts)

Loaded tfidf.npy
Vocabulary size: 27127


## Generate Word2Vec model

In [6]:
import multiprocessing
from gensim.models import Word2Vec
# Word-vector dimensionality: passed to Word2Vec in train_w2v() and used as the
# width of the per-user matrix built by aggregate_wv().
EMBEDDING_SIZE=1024

In [7]:
def train_w2v(RETRAIN=False):
    """Return a Word2Vec model, training it only when necessary.

    If ``posts.bin`` exists and ``RETRAIN`` is false, the saved model is
    loaded. Otherwise a new model is trained on ``posts.corpus`` with
    ``EMBEDDING_SIZE``-dimensional vectors, then stored both as ``posts.bin``
    and, in text word2vec format, as ``posts.vec``.

    Args:
        RETRAIN: force training even when ``posts.bin`` is present.

    Returns:
        The trained or loaded ``Word2Vec`` model (the full model, not just
        its keyed vectors).
    """
    must_train = not os.path.isfile("posts.bin") or RETRAIN
    if not must_train:
        model = Word2Vec.load("posts.bin")
        print("Loaded pretrained Word2Vec Model")
        return model

    print("Training Word2Vec Model ...")
    started = time.time()
    model = Word2Vec(
        corpus_file="posts.corpus",
        size=EMBEDDING_SIZE,
        window=5,
        min_count=1,
        iter=40,
        workers=multiprocessing.cpu_count(),
    )
    finished = time.time()
    print("Word2Vec Time: {:.2f}s".format(finished - started))
    model.save("posts.bin")
    model.wv.save_word2vec_format("posts.vec",binary=False)
    return model

# NOTE: despite its name, `wv` holds the full Word2Vec model returned by train_w2v();
# aggregate_wv() reads the keyed vectors through its `.wv` attribute.
wv = train_w2v()

Loaded pretrained Word2Vec Model


In [8]:
def aggregate_wv(n_users,user_posts,w2v,filename,RETRAIN=False):
    """Average the word vectors of each user's tokens into one vector per user.

    Row ``uid`` of the result is the mean of ``w2v.wv[word]`` over the words
    of ``user_posts[uid]`` that are present in the model. Users with no known
    word keep an all-zero row. The matrix is cached at exactly ``filename``
    and reloaded from there unless ``RETRAIN`` is set.

    Args:
        n_users: number of rows to allocate (expected to be ``len(user_posts)``).
        user_posts: list of token lists, one per user.
        w2v: object whose ``.wv`` maps a word to its vector and raises
            ``KeyError`` for unknown words.
        filename: cache file path, used verbatim.
        RETRAIN: recompute even if the cache file exists.

    Returns:
        np.ndarray of shape ``(n_users, EMBEDDING_SIZE)``.
    """
    if os.path.isfile(filename) and not RETRAIN:
        user_vec = np.load(filename)
        print("Loaded aggregated sentence vectors ({})".format(filename))
        return user_vec

    print("Generating sentence vectors ...")
    begin_time = time.time()
    user_vec = np.zeros((n_users,EMBEDDING_SIZE))
    word_vectors = w2v.wv
    for uid, post in enumerate(user_posts):
        cnt = 0
        # The inner loop must not reuse `uid`; doing so accumulates vectors
        # into the row of the word's position instead of the user's row.
        for word in post:
            try:
                user_vec[uid] += word_vectors[word]
            except KeyError:
                continue # word is not in the Word2Vec vocabulary
            cnt += 1
        if cnt != 0: # avoid divided by 0
            user_vec[uid] /= cnt
        if uid * 10 % n_users == 0:
            print("Done {}%={}/{}".format(uid*100//n_users, uid, n_users),flush=True)
    end_time = time.time()
    # np.save() appends ".npy" to path arguments lacking that suffix, which
    # would make the cache check above miss forever; writing through an open
    # handle stores the array at exactly `filename`.
    with open(filename,"wb") as cache:
        np.save(cache,user_vec)
    print("Finished! Time: {:.2f}s".format(end_time - begin_time))
    return user_vec

# Averaged word-vector features, one row per user (cached under the name "w2v.feat").
X_vec = aggregate_wv(n_users,user_posts,wv,"w2v.feat")
print(X_vec.shape)

Generating sentence vectors ...
Finished! Time: 17.10s
(8675, 1024)


## Random Forest and SVM models

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.utils import resample
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

In [13]:
def split_balanced(data, target, test_size=0.2, random_state=None):
    """Split into train/test sets holding the same number of samples per class.

    The per-class quotas are ``(len(target) - n_test) // n_classes`` training
    samples and ``n_test // n_classes`` test samples (each at least 1), with
    ``n_test = round(len(target) * test_size)``.

    * A class with enough samples is sampled without replacement, so its
      train and test rows are disjoint.
    * A class with too few samples is first cut into a train pool and a test
      pool, then each pool is upsampled with replacement. Rows may repeat
      inside a set, but are never shared between train and test.

    Args:
        data: 2-D array of features, indexed as ``data[idx, :]``.
        target: 1-D array of class labels aligned with ``data``.
        test_size: fraction of the samples used to size the test quota.
        random_state: optional seed for a reproducible split; ``None`` uses
            NumPy's global random state.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: if a class that needs upsampling has a single sample, so
            it cannot appear on both sides of the split.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    target = np.asarray(target)
    classes = np.unique(target)

    n_test = int(np.round(len(target) * test_size))
    n_train = max(0, len(target) - n_test)
    # Integer quotas shared by every class. They must stay integers, because
    # they are used as sample sizes and slice bounds.
    n_train_per_class = max(1, n_train // len(classes))
    n_test_per_class = max(1, n_test // len(classes))
    n_per_class = n_train_per_class + n_test_per_class

    train_parts, test_parts = [], []
    for cl in classes:
        members = np.nonzero(target == cl)[0]
        if n_per_class > len(members):
            # Too few samples: split the class first, then upsample each side
            # so no data point is shared between training and test data.
            if len(members) < 2:
                raise ValueError(
                    "class {!r} has a single sample and cannot be split".format(cl))
            splitidx = int(np.ceil(n_train_per_class / n_per_class * len(members)))
            splitidx = min(splitidx, len(members) - 1) # keep the test pool non-empty
            train_parts.append(rng.choice(members[:splitidx], n_train_per_class))
            test_parts.append(rng.choice(members[splitidx:], n_test_per_class))
        else:
            picked = rng.choice(members, n_per_class, replace=False)
            train_parts.append(picked[:n_train_per_class])
            test_parts.append(picked[n_train_per_class:])

    # same number of samples from all classes
    idx_train = np.concatenate(train_parts)
    idx_test = np.concatenate(test_parts)

    X_train = data[idx_train,:]
    X_test = data[idx_test,:]
    y_train = target[idx_train]
    y_test = target[idx_test]

    return X_train, X_test, y_train, y_test

In [14]:
# Feature matrix used by the classifiers below: TF-IDF is active, and the
# averaged Word2Vec features are the commented-out alternative.
X = tfidf
# X = X_vec
# Alternative splitting strategies that were tried, kept for reference:
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
# sss = StratifiedShuffleSplit(test_size=0.3)
# Active split: split_balanced() with its default test_size of 0.2.
X_train, X_test, y_train, y_test = split_balanced(X, Y)

In [20]:
def random_forest(X_train, y_train, X_test, y_test):
    """Fit a random forest, print its test accuracy and per-class report.

    Class names for the report come from the module-level ``labels``.

    Args:
        X_train, y_train: training features and integer class labels.
        X_test, y_test: held-out features and integer class labels.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    clf = RandomForestClassifier(n_jobs=multiprocessing.cpu_count()) # use all processors
    clf.fit(X_train, y_train)
    # Predict once and derive the accuracy from it; calling clf.score() as
    # well would run inference over the test set a second time.
    y_pred = clf.predict(X_test)
    acc = np.sum(y_pred == y_test) / len(y_pred)
    print("Random Forest acc: {:.2f}%".format(acc * 100))
    print(classification_report(y_test,y_pred,target_names=labels))
    return clf

In [21]:
# Train and evaluate the random forest on the split produced above.
rf = random_forest(X_train,y_train,X_test,y_test)

Random Forest acc: 50.06%
              precision    recall  f1-score   support

        INFJ       0.37      0.71      0.49       108
        ENTP       0.40      0.64      0.49       108
        INTP       0.37      0.69      0.48       108
        INTJ       0.38      0.66      0.48       108
        ENTJ       0.51      0.33      0.40       108
        ENFJ       0.87      0.56      0.69       108
        INFP       0.28      0.53      0.37       108
        ENFP       0.41      0.60      0.49       108
        ISFP       0.69      0.63      0.66       108
        ISTP       0.64      0.56      0.60       108
        ISFJ       0.70      0.57      0.63       108
        ISTJ       0.86      0.56      0.68       108
        ESTP       0.94      0.42      0.58       108
        ESFP       0.94      0.14      0.24       108
        ESTJ       1.00      0.18      0.30       108
        ESFJ       1.00      0.22      0.36       108

    accuracy                           0.50      1728


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=48, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [16]:
def SVMClassifier(X_train, y_train, X_test, y_test):
    """Fit a one-vs-rest linear SVM, print its test accuracy and per-class report.

    Class names for the report come from the module-level ``labels``.

    Args:
        X_train, y_train: training features and integer class labels.
        X_test, y_test: held-out features and integer class labels.

    Returns:
        The fitted ``OneVsRestClassifier``.
    """
    # Imported locally because the notebook later rebinds the module-level
    # name `svm` to the fitted classifier, after which `svm.SVC` no longer
    # resolves and re-running this function would fail.
    from sklearn.svm import SVC

    clf = OneVsRestClassifier(SVC(kernel="linear"),n_jobs=multiprocessing.cpu_count())
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = np.sum(y_pred == y_test) / len(y_pred)
    print("Support Vector Machine (SVM) acc: {:.2f}%".format(acc * 100))
    print(classification_report(y_test,y_pred,target_names=labels))
    return clf

In [17]:
# NOTE(review): this rebinds `svm`, shadowing the `sklearn.svm` module imported
# earlier in the notebook; from here on `svm` is the fitted classifier.
svm = SVMClassifier(X_train, y_train, X_test, y_test)

Support Vector Machine (SVM) acc: 60.88%
              precision    recall  f1-score   support

        INFJ       0.40      0.65      0.49        66
        ENTP       0.54      0.65      0.59        89
        INTP       0.47      0.77      0.59        66
        INTJ       0.50      0.82      0.62        66
        ENTJ       0.60      0.62      0.61       104
        ENFJ       0.74      0.63      0.68       126
        INFP       0.30      0.62      0.40        52
        ENFP       0.53      0.68      0.59        84
        ISFP       0.71      0.60      0.65       128
        ISTP       0.61      0.53      0.57       125
        ISFJ       0.66      0.59      0.62       121
        ISTJ       0.76      0.66      0.71       124
        ESTP       0.83      0.60      0.70       150
        ESFP       0.58      0.51      0.55       123
        ESTJ       0.79      0.55      0.65       154
        ESFJ       0.72      0.52      0.60       150

    accuracy                           

OneVsRestClassifier(estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                                  class_weight=None, coef0=0.0,
                                  decision_function_shape='ovr', degree=3,
                                  gamma='scale', kernel='linear', max_iter=-1,
                                  probability=False, random_state=None,
                                  shrinking=True, tol=0.001, verbose=False),
                    n_jobs=48)

In [None]:
# for train_idx, test_idx in sss.split(X, Y):
#     X_train, X_test = X[train_idx], X[test_idx]
#     y_train, y_test = Y[train_idx], Y[test_idx]
#     random_forest(X_train, y_train, X_test, y_test)

In [25]:
# Persist the fitted SVM classifier; the context manager closes the file
# handle, so the pickle is fully flushed to disk.
with open("svm.pkl","wb") as model_file:
    pickle.dump(svm,model_file)

In [31]:
# Re-evaluate the fitted SVM on the held-out split. `y_pred` has to be
# computed here: it is otherwise only a local variable of the classifier
# helpers, so this cell would raise NameError on a fresh kernel.
y_pred = svm.predict(X_test)
acc = np.sum(y_pred == y_test) / len(y_pred)
print("Support Vector Machine (SVM) acc: {:.2f}%".format(acc * 100))
print(classification_report(y_test,y_pred,target_names=labels))

Support Vector Machine (SVM) acc: 60.88%
              precision    recall  f1-score   support

        INFJ       0.65      0.40      0.49       108
        ENTP       0.65      0.54      0.59       108
        INTP       0.77      0.47      0.59       108
        INTJ       0.82      0.50      0.62       108
        ENTJ       0.62      0.60      0.61       108
        ENFJ       0.63      0.74      0.68       108
        INFP       0.62      0.30      0.40       108
        ENFP       0.68      0.53      0.59       108
        ISFP       0.60      0.71      0.65       108
        ISTP       0.53      0.61      0.57       108
        ISFJ       0.59      0.66      0.62       108
        ISTJ       0.66      0.76      0.71       108
        ESTP       0.60      0.83      0.70       108
        ESFP       0.51      0.58      0.55       108
        ESTJ       0.55      0.79      0.65       108
        ESFJ       0.52      0.72      0.60       108

    accuracy                           

In [26]:
# Number of rows per MBTI type, most frequent first.
data["type"].value_counts()

INFP    1832
INFJ    1470
INTP    1304
INTJ    1091
ENTP     685
ENFP     675
ISTP     337
ISFP     271
ENTJ     231
ISTJ     205
ENFJ     190
ISFJ     166
ESTP      89
ESFP      48
ESFJ      42
ESTJ      39
Name: type, dtype: int64