In [1]:
import pandas as pd
import numpy as np 
import re
from nltk.corpus import stopwords
import nltk
from sklearn.model_selection import train_test_split
import string
from emoji import UNICODE_EMOJI
from spellchecker import SpellChecker
from textblob import TextBlob
from textblob import Word
spell = SpellChecker()
nltk.download('brown')
from string import punctuation
from collections import Counter
#from keras.preprocessing.text import text_to_word_sequence, one_hot, hashing_trick, Tokenizer
from sklearn.feature_extraction.text import CountVectorizer


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\iamch\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


# Data Prep

We start by loading the data, splitting the comments to be individual data points. 


In [2]:
mbti_data_original = pd.read_csv('../Data/mbti_data.csv')
yelp_data = pd.read_csv('../Data.yelp_comments_reduced_2.csv')
print(mbti_data_original.shape)
print(yelp_data.shape)

(8675, 2)
(65109, 3)


In [3]:
mbti_split = mbti_data_original



In [4]:
p_types = mbti_split['type'].unique()


In [5]:
stop = stopwords.words('english')

def avg_word(sentence):
  words = sentence.split()
  if len(words) > 0:
      return round(sum(len(word) for word in words)/len(words),2)
  return 0

def get_misspelled_details(split_word, word_count):
    misspelled = spell.unknown(split_word)
    if len(misspelled) > 0:
        average_length_misspelled = round(sum(len(word) for word in misspelled)/len(misspelled),2)
    else:
        average_length_misspelled = -1
    return(len(misspelled) / word_count), average_length_misspelled
    
def lemmatise_n_spell_check(word_list):
   # return [Word(spell.correction(word)).lemmatize() for word in word_list]
    return [Word(word).lemmatize() for word in word_list]

def remove_punctuation(entry):
    tab = str.maketrans(dict.fromkeys(string.punctuation))
    return entry.translate(tab)  

def count_emoji(word_list):
    emoji_count = 0
    for emoji in UNICODE_EMOJI:
        emoji_count += word_list.count(emoji)
    return emoji_count

def get_word_probabilities(split_word):
    word_probs = np.array([spell.word_probability(x) for x in split_word])
    word_probabilities = word_probs[word_probs > 0.00001]
    if len(word_probabilities) == 0:
        word_probabilities = [0]
    max_word_prob = max(word_probabilities)
    average_word_prob = np.mean(word_probabilities)
    lowest_word_prob = min(word_probabilities)
    std_word_prob = np.std(word_probabilities)
    return max_word_prob, average_word_prob, lowest_word_prob, std_word_prob
    
def get_sentiments(doc):
    
    text_blob = TextBlob(doc)
    polarity = []
    subjectivity = [] 
    for sentence in text_blob.sentences:
        polarity.append(sentence.sentiment.polarity)
        subjectivity.append(sentence.sentiment.subjectivity)

    
    return max(polarity), np.mean(polarity), min(polarity), max(subjectivity), np.mean(subjectivity), min(subjectivity)

# turn a doc into clean tokens
def clean_comment(doc):
    for types in p_types:
        doc = re.sub(types,  'type',    doc)  
        doc = re.sub(types.lower(), 'type', doc)
    
    count = lambda l1, l2: len(list(filter(lambda c: c in l2, l1))) ##counter

    doc = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', '', doc, flags=re.MULTILINE)
    doc = re.sub('[|]', '', doc)
    max_polarity, average_polarity, min_polarity,  max_subjectivity, avg_subjectivity, min_subjectivity = get_sentiments(doc)
   
    punctuation_count = count(doc, string.punctuation)
    split_word = doc.split()
    word_count = len(split_word)
    char_count = len(doc)
    av_word = avg_word(doc)
    
    max_word_prob, average_word_prob, lowest_word_prob, std_word_prob = get_word_probabilities(split_word)

    
    emoji_count = count_emoji(doc)
    
    numerics = len([x for x in split_word if x.isdigit()])  
    stop_words = len([x for x in split_word if x in stop])
    upper = len([x for x in split_word if x.isupper() & len(x) > 1])
    

    doc = remove_punctuation(doc)
    tokens = re.sub("[^\w]", " ",  doc).split()
    tokens = [word.lower() for word in tokens if word.isalpha()]
    tokens = [w for w in tokens if not w in stop] 

    percentage_misspelled, avg_misspelled = get_misspelled_details(split_word, word_count)
    tokens = lemmatise_n_spell_check(tokens)

    return {'tokens' : tokens, 
            'max_polarity' : round(max_polarity,3),
            'average_polarity' : round(average_polarity,3),
            'min_polarity' : round(min_polarity,2),
            'max_subjectivity' : round(max_subjectivity,3),
            'average_subjectivity' : round(avg_subjectivity,3),
            'min_subjectivity' : round(min_subjectivity,3), 
            'misspelled%' : round(percentage_misspelled,3),
            'average_misspelled_word_length' : round(avg_misspelled,3),
            'emoji_%' : round(emoji_count / char_count, 3),
            'puncutation_%' : round(punctuation_count / char_count,3),
            'average word length' : av_word,
            'highest_word_probability' : round(max_word_prob,3),
            'average_word_probability' : round(average_word_prob,3),
            'lowest_word_probability' : round(lowest_word_prob,5),
            'std_word_probability' : round(std_word_prob,3),
            'number %' : round(numerics / word_count,3),
            'stop word %' : round(stop_words / word_count, 3),
            'upper word %' : round(upper / word_count ,3)}

In [6]:
#clean_comment(' I am so sad. This is truely terible and yet. I still have 1 or maybe even 2 emojis (bt I cant spell so well) 😁') 
clean_comment(mbti_split.iloc[4015]['posts'])
clean_comment(mbti_split.iloc[4016]['posts'])
clean_comment(mbti_split.iloc[4014]['posts'])

{'average word length': 4.46,
 'average_misspelled_word_length': 7.04,
 'average_polarity': 0.071,
 'average_subjectivity': 0.389,
 'average_word_probability': 0.009,
 'emoji_%': 0.0,
 'highest_word_probability': 0.064,
 'lowest_word_probability': 1e-05,
 'max_polarity': 1.0,
 'max_subjectivity': 1.0,
 'min_polarity': -0.6,
 'min_subjectivity': 0.0,
 'misspelled%': 0.219,
 'number %': 0.001,
 'puncutation_%': 0.055,
 'std_word_probability': 0.015,
 'stop word %': 0.446,
 'tokens': ['day',
  'look',
  'back',
  'laugh',
  'pretty',
  'insane',
  'strong',
  'depression',
  'fueled',
  'hatred',
  'anger',
  'bei',
  'discovered',
  'im',
  'really',
  'much',
  'type',
  'thought',
  'think',
  'go',
  'back',
  'lurking',
  'every',
  'best',
  'song',
  'eversometimes',
  'think',
  'im',
  'type',
  'im',
  'like',
  'others',
  'dont',
  'think',
  'fit',
  'anywhereusually',
  'laying',
  'back',
  'nice',
  'get',
  'yes',
  'handful',
  'time',
  'people',
  'didnt',
  'like',
  

We next split the comments into word vectors, find the total number of unique words then use the md5 hash function to create an integer hash of each of the words 

In [7]:
cleaned_mbti = mbti_split['posts'].apply(clean_comment)
cleaned_yelp = yelp_data['text'].apply(clean_comment)



In [8]:
def data_frame_the_cleaned_data(data, cleaned):
    data['posts'] = [item['tokens'] for item in cleaned]
    data['posts'].replace('[]', np.nan, inplace=True)
    data['joined_comment'] = data['posts'].apply(lambda x: " ".join(x))
    data.dropna(subset=['posts'], inplace=True)

    data['max_polarity'] = [item['max_polarity'] for item in cleaned]
    data['average_polarity'] = [item['average_polarity'] for item in cleaned]
    data['min_polarity'] = [item['min_polarity'] for item in cleaned]

    data['max_subjectivity'] = [item['max_subjectivity'] for item in cleaned]
    data['average_subjectivity'] = [item['average_subjectivity'] for item in cleaned]
    data['min_subjectivity'] = [item['min_subjectivity'] for item in cleaned]

    data['misspelled%'] = [item['misspelled%'] for item in cleaned]
    data['average_misspelled_word_length'] = [item['average_misspelled_word_length'] for item in cleaned]
    data['emoji_%'] = [item['emoji_%'] for item in cleaned]
    data['puncutation_%'] = [item['puncutation_%'] for item in cleaned]

    data['average word length'] = [item['average word length'] for item in cleaned]
    data['highest_word_probability'] = [item['highest_word_probability'] for item in cleaned]
    data['average_word_probability'] = [item['average_word_probability'] for item in cleaned]
    data['std_word_probability'] = [item['std_word_probability'] for item in cleaned]

    data['number %'] = [item['number %'] for item in cleaned]
    data['stop word %'] = [item['stop word %'] for item in cleaned]
    data[ 'upper word %'] = [item[ 'upper word %'] for item in cleaned]
    return data

In [9]:
mbti_split = data_frame_the_cleaned_data(mbti_split, cleaned_mbti) 
yelp_data = data_frame_the_cleaned_data(yelp_data, cleaned_yelp) 


In [10]:

print(mbti_split.head())

   type                                              posts  \
0  INFJ  [type, moment, sportscenter, top, ten, play, p...   
1  ENTP  [im, finding, lack, post, alarmingsex, boring,...   
2  INTP  [good, one, course, say, know, thats, blessing...   
3  INTJ  [dear, type, enjoyed, conversation, day, esote...   
4  ENTJ  [youre, firedthats, another, silly, misconcept...   

                                      joined_comment  max_polarity  \
0  type moment sportscenter top ten play prankswh...          0.55   
1  im finding lack post alarmingsex boring positi...          1.00   
2  good one course say know thats blessing cursed...          1.00   
3  dear type enjoyed conversation day esoteric ga...          0.80   
4  youre firedthats another silly misconception a...          1.00   

   average_polarity  min_polarity  max_subjectivity  average_subjectivity  \
0             0.089          -0.5               1.0                 0.400   
1             0.123          -1.0               1.0 

In [11]:
b_o_w_vec = CountVectorizer(max_features=5000,  lowercase=True, ngram_range=(1,1), analyzer = "word")
b_o_w_mbti = b_o_w_vec.fit_transform(mbti_split['joined_comment'])
b_o_w_yelp = b_o_w_vec.transform(yelp_data['joined_comment'])

In [12]:
print(b_o_w_vec.vocabulary_)



In [13]:
b_o_w_processed_mbti = pd.DataFrame(b_o_w_mbti.toarray(), columns = b_o_w_vec.get_feature_names())
b_o_w_processed_yelp = pd.DataFrame(b_o_w_yelp.toarray(), columns = b_o_w_vec.get_feature_names())


In [14]:
def split_type(x):
    return [x[0], x[1], x[2], x[3]]


In [15]:
t = mbti_split['type'].apply(lambda x : split_type(x))
mbti_split['t1'] = [item[0] for item in t]
mbti_split['t2'] = [item[1] for item in t]
mbti_split['t3'] = [item[2] for item in t]
mbti_split['t4'] = [item[3] for item in t]

mbti_split.head(1000)

Unnamed: 0,type,posts,joined_comment,max_polarity,average_polarity,min_polarity,max_subjectivity,average_subjectivity,min_subjectivity,misspelled%,...,highest_word_probability,average_word_probability,std_word_probability,number %,stop word %,upper word %,t1,t2,t3,t4
0,INFJ,"[type, moment, sportscenter, top, ten, play, p...",type moment sportscenter top ten play prankswh...,0.550,0.089,-0.50,1.000,0.400,0.0,0.217,...,0.064,0.009,0.016,0.002,0.403,0.0,I,N,F,J
1,ENTP,"[im, finding, lack, post, alarmingsex, boring,...",im finding lack post alarmingsex boring positi...,1.000,0.123,-1.00,1.000,0.394,0.0,0.193,...,0.064,0.009,0.015,0.001,0.398,0.0,E,N,T,P
2,INTP,"[good, one, course, say, know, thats, blessing...",good one course say know thats blessing cursed...,1.000,0.122,-1.00,1.000,0.444,0.0,0.222,...,0.064,0.008,0.014,0.006,0.371,0.0,I,N,T,P
3,INTJ,"[dear, type, enjoyed, conversation, day, esote...",dear type enjoyed conversation day esoteric ga...,0.800,0.077,-0.50,1.000,0.359,0.0,0.202,...,0.064,0.009,0.015,0.005,0.405,0.0,I,N,T,J
4,ENTJ,"[youre, firedthats, another, silly, misconcept...",youre firedthats another silly misconception a...,1.000,0.045,-0.80,1.000,0.379,0.0,0.201,...,0.064,0.008,0.014,0.001,0.426,0.0,E,N,T,J
5,INTJ,"[science, perfect, scientist, claim, scientifi...",science perfect scientist claim scientific inf...,1.000,0.041,-1.00,1.000,0.352,0.0,0.206,...,0.064,0.008,0.013,0.001,0.424,0.0,I,N,T,J
6,INFJ,"[cant, draw, nail, haha, done, professional, n...",cant draw nail haha done professional nail yes...,0.800,0.076,-0.50,1.000,0.376,0.0,0.188,...,0.064,0.007,0.013,0.007,0.406,0.0,I,N,F,J
7,INTJ,"[tend, build, collection, thing, desktop, use,...",tend build collection thing desktop use freque...,0.700,0.060,-0.31,0.744,0.272,0.0,0.147,...,0.064,0.008,0.014,0.002,0.463,0.0,I,N,T,J
8,INFJ,"[im, sure, thats, good, question, distinction,...",im sure thats good question distinction two de...,0.875,0.179,-0.39,1.000,0.495,0.0,0.224,...,0.064,0.009,0.015,0.000,0.413,0.0,I,N,F,J
9,INTP,"[position, actually, let, go, person, due, var...",position actually let go person due various re...,1.000,0.149,-0.50,1.000,0.403,0.0,0.185,...,0.064,0.009,0.015,0.002,0.442,0.0,I,N,T,P


In [16]:
mbti_processed = pd.concat([mbti_split, b_o_w_processed_mbti ], axis=1, sort=False)
yelp_processed = pd.concat([yelp_data, b_o_w_processed_yelp ], axis=1, sort=False)

In [18]:
yelp_processed.shape

(65109, 5022)

In [17]:
mbti_processed.to_csv('../Data/mbti_pre-processed.csv')
yelp_processed.to_csv('../Data/yelp_pre-processed_2.csv')

In [None]:
mbti_E = mbti_processed[mbti_processed['t1'] == 'E']
mbti_I = mbti_processed[mbti_processed['t1'] == 'I']
mbti_N = mbti_processed[mbti_processed['t2'] == 'N']
mbti_S = mbti_processed[mbti_processed['t2'] == 'S']
mbti_F = mbti_processed[mbti_processed['t3'] == 'F']
mbti_T = mbti_processed[mbti_processed['t3'] == 'T']
mbti_P = mbti_processed[mbti_processed['t4'] == 'P']
mbti_J = mbti_processed[mbti_processed['t4'] == 'J']

e_i_count = min(len(mbti_E), len(mbti_I))
n_s_count = min(len(mbti_N), len(mbti_S))
f_t_count = min(len(mbti_F), len(mbti_T))
p_j_count = min(len(mbti_P), len(mbti_J))

print(e_i_count, n_s_count, f_t_count, p_j_count)

In [None]:
mbti_EI = pd.concat([mbti_E.sample(e_i_count), mbti_I.sample(e_i_count)], ignore_index=True)
mbti_EI_target = mbti_EI['t1']
mbti_EI = mbti_EI.drop(['type', 'posts', 'joined_comment', 't1', 't2', 't3', 't4'], axis = 1)

mbti_NS = pd.concat([mbti_N.sample(n_s_count), mbti_S.sample(n_s_count)],ignore_index=True)
mbti_NS_target = mbti_NS['t2']
mbti_NS = mbti_NS.drop(['type', 'posts', 'joined_comment', 't1', 't2', 't3', 't4'], axis = 1)


mbti_FT = pd.concat([mbti_F.sample(f_t_count), mbti_T.sample(f_t_count)],ignore_index=True)
mbti_FT_target = mbti_FT['t3']
mbti_FT = mbti_FT.drop(['type', 'posts', 'joined_comment', 't2', 't1', 't3', 't4'], axis = 1)

mbti_PJ = pd.concat([mbti_P.sample(p_j_count), mbti_J.sample(p_j_count)],ignore_index=True)
mbti_PJ_target = mbti_PJ['t4']
mbti_PJ = mbti_PJ.drop(['type', 'posts', 'joined_comment', 't2', 't3', 't1', 't4'], axis = 1)

In [None]:
mbti_EI.head()

In [None]:
mbti_EI_train, mbti_EI_test, mbti_EI_train_target, mbti_EI_test_target = train_test_split(mbti_EI, mbti_EI_target,test_size = 0.2)
mbti_NS_train, mbti_NS_test, mbti_NS_train_target, mbti_NS_test_target = train_test_split(mbti_NS, mbti_NS_target, test_size = 0.2)
mbti_FT_train, mbti_FT_test, mbti_FT_train_target, mbti_FT_test_target = train_test_split(mbti_FT,mbti_FT_target, test_size = 0.2)
mbti_PJ_train, mbti_PJ_test, mbti_PJ_train_target, mbti_PJ_test_target = train_test_split(mbti_PJ, mbti_PJ_target, test_size = 0.2)

In [22]:
mbti_EI_train.to_csv('../Data/traintest/mbti_EI_train')
mbti_EI_test.to_csv('../Data/traintest/mbti_EI_test')
mbti_EI_train_target.to_csv('../Data/traintest/mbti_EI_train_target')
mbti_EI_test_target.to_csv('../Data/traintest/mbti_EI_test_target')

mbti_NS_train.to_csv('../Data/traintest/mbti_NS_train')
mbti_NS_test.to_csv('../Data/traintest/mbti_NS_test')
mbti_NS_train_target.to_csv('../Data/traintest/mbti_NS_train_target')
mbti_NS_test_target.to_csv('../Data/traintest/mbti_NS_test_target')

mbti_FT_train.to_csv('../Data/traintest/mbti_FT_train')
mbti_FT_test.to_csv('../Data/traintest/mbti_FT_test')
mbti_FT_train_target.to_csv('../Data/traintest/mbti_FT_train_target')
mbti_FT_test_target.to_csv('../Data/traintest/mbti_FT_test_target')

mbti_PJ_train.to_csv('../Data/traintest/mbti_PJ_train')
mbti_PJ_test.to_csv('../Data/traintest/mbti_PJ_test')
mbti_PJ_train_target.to_csv('../Data/traintest/mbti_PJ_train_target')
mbti_PJ_test_target.to_csv('../Data/traintest/mbti_PJ_test_target')

In [23]:
mbti_PJ_train.shape

(5494, 5016)