# Task 8 - Static Word Embedding

*by Lukas Dötlinger*

We will revisit the example from Task 7, where we are working with personality prediction.

In [52]:
import pandas as pd
import numpy as np
import time
import io

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from gensim.models import Word2Vec, KeyedVectors

import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score


data = pd.read_csv('res/mbti_1.csv')

def filter_text(df, remove_labels=False):
    start_time = time.perf_counter()

    labels = ['INFP' ,'INFJ', 'INTP', 'INTJ', 'ENTP', 'ENFP', 'ISTP' ,'ISFP' ,'ENTJ', 'ISTJ','ENFJ', 'ISFJ' ,'ESTP', 'ESFP' ,'ESFJ' ,'ESTJ']
    lower_labels = [ l.lower() for l in labels ]
    stopword_set = set(stopwords.words('english'))

    # Convert posts to lowercase.
    df['posts'] = df['posts'].apply(lambda s: s.lower())

    stop_time = time.perf_counter()
    print(f"Lowering took {stop_time - start_time:0.4f} seconds")
    start_time = stop_time

    # Word tokenize posts.
    df['posts'] = df['posts'].apply(lambda s: word_tokenize(s))

    stop_time = time.perf_counter()
    print(f"Tokenizing took {stop_time - start_time:0.4f} seconds")
    start_time = stop_time

    # Remove non-alpha words.
    df['posts'] = df['posts'].apply(lambda s: [ w for w in s if w.isalpha() ])
    # Remove personality labels.
    if remove_labels:
        df['posts'] = df['posts'].apply(lambda s: [ w for w in s if w not in lower_labels ])
    #Remove very short or long words.
    df['posts'] = df['posts'].apply(lambda s: [ w for w in s if len(w) > 3 ]) 
    df['posts'] = df['posts'].apply(lambda s: [ w for w in s if len(w) < 30 ])
    #Remove stopwords.
    df['posts'] = df['posts'].apply(lambda s: [ w for w in s if w not in stopword_set ])

    stop_time = time.perf_counter()
    print(f"Filtering took {stop_time - start_time:0.4f} seconds")
    start_time = stop_time

    # Join words to one string.
    df['posts'] = df['posts'].apply(lambda s: ' '.join(s))

    stop_time = time.perf_counter()
    print(f"Joining to string took {stop_time - start_time:0.4f} seconds")

    return df

processed_df = filter_text(data)
processed_df.head()

Lowering took 0.1157 seconds
Tokenizing took 58.2224 seconds
Filtering took 3.6057 seconds
Joining to string took 0.2370 seconds


Unnamed: 0,type,posts
0,INFJ,intj moments https sportscenter plays https ex...
1,ENTP,finding lack posts boring position often examp...
2,INTP,https course know blessing absolutely positive...
3,INTJ,intp enjoyed conversation esoteric gabbing nat...
4,ENTJ,another silly misconception approaching logica...


In [53]:
encoder = LabelEncoder()
processed_df['encoding'] = encoder.fit_transform(processed_df['type'])

target = processed_df['encoding']

# Filter stopwords from nltk in vectorization step.
vectorizer = CountVectorizer() 
source = vectorizer.fit_transform(processed_df['posts'])
source.shape

(8675, 84182)

In [54]:
def split_posts(df):
    df['posts'] = df['posts'].apply(lambda s: word_tokenize(s))
    return df

processed_df = split_posts(processed_df)
processed_df.head()

Unnamed: 0,type,posts,encoding
0,INFJ,"[intj, moments, https, sportscenter, plays, ht...",8
1,ENTP,"[finding, lack, posts, boring, position, often...",3
2,INTP,"[https, course, know, blessing, absolutely, po...",11
3,INTJ,"[intp, enjoyed, conversation, esoteric, gabbin...",10
4,ENTJ,"[another, silly, misconception, approaching, l...",2


The *English Wikipedia Dump of February 2017* pre-trained word embedding model was used for this task. It can be found [here](http://vectors.nlpl.eu/repository/).


In [50]:
wv = KeyedVectors.load_word2vec_format('res/bnt/model.bin', binary=True)

def document_vector(doc):
    doc = [ word for word in doc if wv.has_index_for(word) ]
    if not doc:
        doc.append('empty')
    mean_f = np.mean(wv[doc], axis=0)
    max_f = np.max(wv[doc], axis=0)
    min_f = np.min(wv[doc], axis=0)
    return np.concatenate((max_f, min_f, mean_f))

source_data = np.matrix([document_vector(words) for words in processed_df['posts']])
source_data.shape

(8675, 900)

In [55]:
from scipy.sparse import hstack

combined = hstack([source, source_data], format="csr")
combined.shape

(8675, 85082)

In [58]:
kf = KFold(n_splits=10)
results = {
    'accuracies': [],
    'f1-macro': [],
    'f1-micro': []
}

for train_index, test_index in kf.split(combined):
    x_train, x_test = combined[train_index], combined[test_index]
    y_train, y_test = target[train_index], target[test_index]

    # Logistic Regression
    logreg = LogisticRegression(solver='saga', max_iter=100, C=100)
    logreg.fit(x_train, y_train)

    y_pred = logreg.predict(x_test)
    predictions = [round(value) for value in y_pred]

    # evaluate predictions
    results['accuracies'].append(accuracy_score(y_test, predictions))
    results['f1-macro'].append(f1_score(y_test, y_pred, average='macro'))
    results['f1-micro'].append(f1_score(y_test, y_pred, average='micro'))
    print('----------------------------------------------------------------')
    print('Accuracy: %.4f%%' % (sum(float(a) for a in results['accuracies']) / float(len(results['accuracies']))))
    print('F1-macro: %.4f%%' % (sum(float(f) for f in results['f1-macro']) / float(len(results['f1-macro']))))
    print('F1-micro: %.4f%%' % (sum(float(f) for f in results['f1-micro']) / float(len(results['f1-micro']))))

print('----------------------------------------------------------------')
print('Accuracy: %.4f%%' % (sum(float(a) for a in results['accuracies']) / float(len(results['accuracies']))))
print('F1-macro: %.4f%%' % (sum(float(f) for f in results['f1-macro']) / float(len(results['f1-macro']))))
print('F1-micro: %.4f%%' % (sum(float(f) for f in results['f1-micro']) / float(len(results['f1-micro']))))


----------------------------------------------------------------
Accuracy: 0.5876%
F1-macro: 0.4230%
F1-micro: 0.5876%
----------------------------------------------------------------
Accuracy: 0.5991%
F1-macro: 0.4286%
F1-micro: 0.5991%
----------------------------------------------------------------
Accuracy: 0.5972%
F1-macro: 0.4404%
F1-micro: 0.5972%
----------------------------------------------------------------
Accuracy: 0.6066%
F1-macro: 0.4470%
F1-micro: 0.6066%
----------------------------------------------------------------
Accuracy: 0.6069%
F1-macro: 0.4581%
F1-micro: 0.6069%
----------------------------------------------------------------
Accuracy: 0.6084%
F1-macro: 0.4621%
F1-micro: 0.6084%
----------------------------------------------------------------
Accuracy: 0.6100%
F1-macro: 0.4700%
F1-micro: 0.6100%
----------------------------------------------------------------
Accuracy: 0.6094%
F1-macro: 0.4652%
F1-micro: 0.6094%
------------------------------------------------

Using 10-Fold cross validation, we can observe the following results:

- Accuracy: 0.6075%
- F1-macro: 0.4719%
- F1-micro: 0.6075%

The results look very good overall if we take into account that there are 16 different MBTI types.
Compared to the previous task, we can only see a very small improvement in the F1-macro score when using word embedding.