In [31]:
### General imports ###
import wget
import numpy as np
import pandas as pd
import re
import datetime
from operator import itemgetter
from random import randint
import seaborn as sns
import matplotlib.pyplot as plt

import os
import time
import string
import dill
import pickle
import gzip

from nltk.corpus import movie_reviews as reviews
from sklearn.datasets import fetch_20newsgroups
from gensim.models import KeyedVectors
from gensim.models import word2vec

### Natural language Toolkit ###
from nltk import *
from nltk import wordpunct_tokenize, WordNetLemmatizer, sent_tokenize, pos_tag
from nltk.corpus import stopwords as sw, wordnet as wn
from nltk.stem.snowball import SnowballStemmer

### Scikit-Learn ###
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import precision_score, accuracy_score, confusion_matrix, classification_report as clsr
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split as tts
from sklearn.manifold import TSNE
from sklearn.multiclass import OneVsRestClassifier

### Tensorflow ###
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model, model_from_json
from tensorflow.keras.layers import (Dense, LSTM, SpatialDropout1D, Activation, Conv1D, MaxPooling1D, 
                                     Input, concatenate, Embedding, BatchNormalization)
from tensorflow.keras.utils import to_categorical

ImportError: cannot import name 'NLTKPreprocessor' from 'nltk' (C:\ProgramData\Anaconda3\lib\site-packages\nltk\__init__.py)

# 1. Essays (Stream-of-consciousness Essays)

### 1.1 Dataset information
Essays is a dataset created by J. W. Pennebaker and L. A. King for their paper “Linguistic styles: Language use as an individual difference”, in which they explore daily diaries from 15 people with substance abuse, 35 from students, and journal abstracts from 40 social psychologists to determine whether or not the text written by a person reflects their personality (https://paperswithcode.com/dataset/essays).

The columns of the dataset CSV that we use are:
- **id** (id of the entry)
- **text** (the text associated with the entry)
- **extraversion** (y = extrovert, n = introvert)
- **neuroticism** (y = neurotic, n = tranquil)
- **agreeableness** (y = agreeable, n = disagreeable)
- **conscientiousness** (y = conscientious, n = casual)
- **openness** (y = open, n = closed)

### 1.2 Dataset exploration

In [4]:
# Location of the essays CSV; the relative path is resolved against the
# notebook's current working directory.
file_path = "../Data/essays.csv"

In [5]:
# Read the essays CSV into a DataFrame, decoding the file as ISO-8859-1
data_essays = pd.read_csv(
    file_path,
    encoding="ISO-8859-1",
)

In [6]:
# Show the loaded frame as this cell's output
data_essays

Unnamed: 0,id,text,extraversion,neuroticism,agreeableness,conscientiousness,openness
0,1997_504851.txt,"Well, right now I just woke up from a mid-day ...",n,y,y,n,y
1,1997_605191.txt,"Well, here we go with the stream of consciousn...",n,n,y,n,n
2,1997_687252.txt,An open keyboard and buttons to push. The thin...,n,y,n,y,y
3,1997_568848.txt,I can't believe it! It's really happening! M...,y,n,y,y,n
4,1997_688160.txt,"Well, here I go with the good old stream of co...",y,n,y,n,y
...,...,...,...,...,...,...,...
2462,2004_493.txt,I'm home. wanted to go to bed but remembe...,n,y,n,y,n
2463,2004_494.txt,Stream of consiousnesssskdj. How do you s...,y,y,n,n,y
2464,2004_497.txt,"It is Wednesday, December 8th and a lot has be...",n,n,y,n,n
2465,2004_498.txt,"Man this week has been hellish. Anyways, now i...",n,y,n,n,y


In [None]:
# Necessary download for nltk in case it's not already downloaded.
# The import cell only uses `from nltk import ...` forms, which are not
# guaranteed to bind the name `nltk` itself, so it is imported explicitly
# here to keep this setup cell runnable on a fresh kernel.
# NOTE(review): recent NLTK releases may additionally require the
# 'punkt_tab' resource for word tokenization — confirm against the
# installed NLTK version.
import nltk

nltk.download('punkt')

In [14]:
# Some data processing
# The five personality-trait columns; each holds 'y'/'n' in the raw CSV.
labels = ['extraversion', 'neuroticism', 'agreeableness', 'conscientiousness', 'openness']

# Encode every trait as 1 ('y') or 0 (anything else).
# A column that is already numeric was encoded by an earlier run of this
# cell; comparing it with 'y' again would silently turn every label into 0,
# so such columns are left untouched and the cell can be re-run safely.
for trait in labels:
    if not pd.api.types.is_numeric_dtype(data_essays[trait]):
        data_essays[trait] = np.where(data_essays[trait] == 'y', 1, 0)

# Model inputs (raw essay texts) and targets (one 0/1 column per trait)
X_essays = data_essays['text'].tolist()
y_essays = data_essays[labels]

# Length of each essay in characters
data_essays['text_length'] = data_essays['text'].apply(len)

In [12]:
# Merge all essays into one space-separated string and split it into
# NLTK word tokens
complete_corpus = " ".join(X_essays)
words = tokenize.word_tokenize(complete_corpus)

# Frequency of every token across the whole corpus
fdist = FreqDist(words)

In [24]:
# Whitespace-separated word count of every essay
number_of_words = [len(essay.split()) for essay in X_essays]

# Corpus-level totals reported below
essay_count = len(X_essays)
word_total = sum(number_of_words)

print('The total number of essays: {}'.format(essay_count))
print('The total number of words in all essays: {}'.format(word_total))
print('The average number of words in each essay: {}'.format(word_total / len(number_of_words)))


The total number of essays: 2467
The total number of words in all essays: 1608813
The average number of words in each essay: 652.1333603567085


In [13]:
# The 100 highest-count tokens of the frequency distribution. These are raw
# tokens (punctuation and stopwords included), not preprocessed words.
top_tokens = fdist.most_common(100)
print("List of 100 most frequent words/counts: {}".format(top_tokens))

List of 100 most frequent words/counts: [('I', 115487), ('.', 111178), ('to', 56263), (',', 47355), ('the', 38232), ('and', 36810), ('that', 29456), ('a', 28412), ('my', 26580), ('is', 25576), ('of', 22939), ('it', 22782), ("n't", 19996), ('in', 17828), ('do', 17448), ('have', 16166), ('me', 14588), ('so', 13099), ('but', 13060), ('this', 12054), ('be', 11724), ('for', 11520), ("'s", 11198), ('was', 10392), ('am', 10378), ('like', 10308), ('just', 10250), ('really', 10207), ('not', 10015), ("'m", 9973), ('on', 9015), ('about', 8941), ('with', 8708), ('think', 8061), ('are', 7602), ('what', 7517), ('all', 7475), ('at', 7469), ('because', 7144), ('i', 7048), ('know', 6959), ('get', 6875), ('he', 6605), ('now', 6154), ('would', 6077), ('you', 6013), ('if', 6001), ('time', 5966), ('out', 5923), ('they', 5905), ('up', 5743), ('or', 5733), ('going', 5621), ('go', 5576), ('she', 5556), ('?', 5539), ('want', 5483), ('will', 5420), ('can', 5276), ('!', 4959), ('as', 4939), ('people', 4898), ('h

In [30]:
# NOTE(review): this cell fails with the NameError recorded in its output —
# `NLTKPreprocessor` is not defined or imported in any visible cell.
# `self` is also used at notebook top level, outside any method, so `self.X`
# presumably should be the essay texts (e.g. `X_essays`) — TODO confirm.
# `transform` is called on the class itself, not on an instance; presumably
# an instance is needed — verify against the preprocessor's definition.
X_preprocess = NLTKPreprocessor.transform(self.X).tolist()

NameError: name 'NLTKPreprocessor' is not defined

In [None]:
# Hold out 20% of the essays for testing. The fixed seed makes the split
# reproducible across kernel restarts instead of changing on every run.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = tts(
    X_essays, y_essays, test_size=0.2, random_state=SPLIT_SEED
)

# NOTE(review): the essays CSV is read from "../Data" while the splits are
# written to "./Data" — confirm that both locations are intended.
split_dir = './Data'
os.makedirs(split_dir, exist_ok=True)  # open(..., 'wb') fails if the folder is missing

split_names = ('X_train', 'X_test', 'y_train', 'y_test')
splits = dict(zip(split_names, (X_train, X_test, y_train, y_test)))

# Train-test split to save the dataset (one pickle per split)
for split_name, split_data in splits.items():
    with open('{}/{}.pkl'.format(split_dir, split_name), 'wb') as f:
        pickle.dump(split_data, f)

# Load train and test sets back from the pickles written above.
# X_* are lists of essay texts; y_* are the label frames produced by the split.
# The files are created by this very cell; do not point pickle.load at
# files from an untrusted source.
loaded = {}
for split_name in split_names:
    with open('{}/{}.pkl'.format(split_dir, split_name), 'rb') as pickle_file:
        loaded[split_name] = pickle.load(pickle_file)

X_train, X_test, y_train, y_test = (loaded[split_name] for split_name in split_names)