# Task 7 - Personality Prediction

*by Lukas Dötlinger*


In [24]:
import pandas as pd
import numpy as np
import time

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score


data = pd.read_csv('res/mbti_1.csv')

def filter_text(df):
    start_time = time.perf_counter()

    labels = ['INFP' ,'INFJ', 'INTP', 'INTJ', 'ENTP', 'ENFP', 'ISTP' ,'ISFP' ,'ENTJ', 'ISTJ','ENFJ', 'ISFJ' ,'ESTP', 'ESFP' ,'ESFJ' ,'ESTJ']
    lower_labels = [ l.lower() for l in labels ]

    # Convert posts to lowercase.
    df['posts'] = df['posts'].apply(lambda s: s.lower())

    stop_time = time.perf_counter()
    print(f"Lowering took {stop_time - start_time:0.4f} seconds")
    start_time = stop_time

    # Word tokenize posts.
    df['posts'] = df['posts'].apply(lambda s: word_tokenize(s))

    stop_time = time.perf_counter()
    print(f"Tokenizing took {stop_time - start_time:0.4f} seconds")
    start_time = stop_time

    # Remove non-alpha words and labels from posts.
    df['posts'] = df['posts'].apply(lambda s: [ w for w in s if w.isalpha() and w not in lower_labels ])
    #Remove very short or long words
    df['posts'] = df['posts'].apply(lambda s: [ w for w in s if len(w) > 3 ]) 
    df['posts'] = df['posts'].apply(lambda s: [ w for w in s if len(w) < 30 ])

    stop_time = time.perf_counter()
    print(f"Filtering took {stop_time - start_time:0.4f} seconds")
    start_time = stop_time

    # Join words to one string.
    df['posts'] = df['posts'].apply(lambda s: ' '.join(s))

    stop_time = time.perf_counter()
    print(f"Joining to string took {stop_time - start_time:0.4f} seconds")

    return df

new_df = filter_text(data)
new_df.head()

Lowering took 0.0950 seconds
Tokenizing took 58.0212 seconds
Filtering took 3.3629 seconds
Joining to string took 0.2256 seconds


Unnamed: 0,type,posts
0,INFJ,moments https sportscenter plays https been mo...
1,ENTP,finding lack these posts very boring same posi...
2,INTP,https course which know that blessing being ab...
3,INTJ,enjoyed conversation other esoteric gabbing ab...
4,ENTJ,another silly misconception that approaching l...


In [33]:
encoder = LabelEncoder()
new_df['type of encoding'] = encoder.fit_transform(new_df['type'])
target = new_df['type of encoding']

# Filter stopwords from nltk in vectorization step.
vectorizer = CountVectorizer(stop_words='english') 
train = vectorizer.fit_transform(new_df['posts'])

train.shape

(8675, 84013)

In [32]:
kf = KFold(n_splits=10)
for train_index, test_index in kf.split(train):
    accuracies = {}

    x_train, x_test = train[train_index], train[test_index]
    y_train, y_test = target[train_index], target[test_index]

    # Logistic Regression
    logreg = LogisticRegression(solver='sag', max_iter=100)
    logreg.fit(x_train, y_train)

    Y_pred = logreg.predict(x_test)
    predictions = [round(value) for value in Y_pred]

    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    accuracies['Logistic Regression'] = accuracy* 100.0
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print("F1 (macro): %.2f%%" % f1_score(y_test, Y_pred, average='macro'))
    print("F1 (micro): %.2f%%" % f1_score(y_test, Y_pred, average='micro'))



Accuracy: 45.16%
F1 (macro): 0.24%
F1 (micro): 0.45%
Accuracy: 46.31%
F1 (macro): 0.27%
F1 (micro): 0.46%
Accuracy: 43.78%
F1 (macro): 0.29%
F1 (micro): 0.44%
Accuracy: 46.66%
F1 (macro): 0.27%
F1 (micro): 0.47%
Accuracy: 43.66%
F1 (macro): 0.28%
F1 (micro): 0.44%
Accuracy: 45.91%
F1 (macro): 0.31%
F1 (micro): 0.46%
Accuracy: 44.87%
F1 (macro): 0.31%
F1 (micro): 0.45%
Accuracy: 46.02%
F1 (macro): 0.31%
F1 (micro): 0.46%
Accuracy: 46.94%
F1 (macro): 0.31%
F1 (micro): 0.47%
Accuracy: 43.14%
F1 (macro): 0.23%
F1 (micro): 0.43%
