In [1]:
import pandas as pd
import numpy as np

from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.util import ngrams

from spacy_langdetect import LanguageDetector
from spacy import Language
import spacy



from collections import defaultdict, Counter

from typing import List, Set


In [2]:
twitter_df = pd.read_csv('data/twitter_MBTI.csv')

# set proper column names
twitter_df.columns = ['user', 'tweets', 'type']

# set user as index
twitter_df.set_index('user', inplace=True)

# break up tweets single text into an array of tweets
twitter_df['tweets'] = twitter_df['tweets'].apply(lambda x: x.split('|||'))

# Extract user type dimensions (E/I, N/S, F/T, J/P) into separate columns
twitter_df['E>I'] = twitter_df['type'].apply(lambda x: x[0]=='e')
twitter_df['N>S'] = twitter_df['type'].apply(lambda x: x[1]=='n')
twitter_df['F>T'] = twitter_df['type'].apply(lambda x: x[2]=='f')
twitter_df['J>P'] = twitter_df['type'].apply(lambda x: x[3]=='j')

twitter_df.head()


Unnamed: 0_level_0,tweets,type,E>I,N>S,F>T,J>P
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,[@Pericles216 @HierBeforeTheAC @Sachinettiyil ...,intj,False,True,False,True
1,"[@Hispanthicckk Being you makes you look cute,...",intj,False,True,False,True
2,[@Alshymi Les balles sont réelles et sont tiré...,intj,False,True,False,True
3,"[I'm like entp but idiotic, Hey boy, do you wa...",intj,False,True,False,True
4,[@kaeshurr1 Give it to @ZargarShanif ... He ha...,intj,False,True,False,True


In [3]:
# NOTE: run first python -m spacy download en_core_web_sm

# @Language.factory("language_detector")
# def get_lang_detector(nlp, name):
#    return LanguageDetector()

# nlp = spacy.load("en_core_web_sm")
# nlp.add_pipe('language_detector', last=True)

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x25986753c50>

In [8]:
# create a dataset with just user corpus

user_corpus_df = twitter_df.copy()

# # drop out non-english tweets
# print(f'Number of tweets before non-english drop: {user_corpus_df["tweets"].apply(len).sum()}')
# user_corpus_df['tweets'] = user_corpus_df['tweets'].apply(
#     lambda tweets: [
#         t for t in tweets 
#         if nlp(t)._.language.get('language') == 'en'
#     ])

# print(f'Number of tweets after non-english drop: {user_corpus_df["tweets"].apply(len).sum()}')

# merge all tweets into a single corpus
user_corpus_df['n_tweets'] = user_corpus_df['tweets'].apply(lambda x: len(x))
user_corpus_df['avg_tweets_len'] = user_corpus_df['tweets'].apply(lambda x: sum([len(tweet) for tweet in x])/len(x))
user_corpus_df['corpus'] = user_corpus_df['tweets'].apply(lambda x: ' '.join(x))
user_corpus_df.drop(columns=['tweets',], inplace=True)

user_corpus_df.head()

Number of tweets before non-english drop: 1093199


KeyboardInterrupt: 

In [17]:
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
lemmatizer = WordNetLemmatizer()

def preprocess(
        text, tokenizer, lemmatizer, n=1, 
        remove_stopwords=False, remove_punctuation=False, 
        keep_sequence_and_dups=False): 

    # to lower case and tokenization
    tokens = tokenizer.tokenize(text.lower())

    # we don't always want to remove stopword and punctuation...
    if remove_stopwords:
        tokens = [token for token in tokens if token not in stopwords.words('english')]
    
    if remove_punctuation:
        tokens = [token for token in tokens if token.isalpha()]


    # lemmatize
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # ngram generation 
    if keep_sequence_and_dups:
        ngrams_out = []
    else:
        ngrams_out = set()

    for i in range(1, n+1):
        new_grams = [' '.join(grams) for grams in ngrams(tokens, i)]
        if keep_sequence_and_dups:
            ngrams_out.extend(new_grams)
        else:
            ngrams_out.update(new_grams)

    return ngrams_out

def preprocess_array(x, tokenizer, lemmatizer, n=1, 
        remove_stopwords=False, remove_punctuation=False, 
        keep_sequence_and_dups=False):
    return [preprocess(
        text, tokenizer, lemmatizer, 
        n, remove_stopwords, remove_punctuation, 
        keep_sequence_and_dups) for text in x
        ]
        

In [29]:
# preprocess corpus
user_corpus_df['proc_corpus'] = user_corpus_df['corpus'].apply(
    lambda x: preprocess(
        x, tokenizer, lemmatizer, n=1, 
        keep_sequence_and_dups=True))

user_corpus_df.head()

Unnamed: 0_level_0,type,E,I,N,S,F,T,J,P,n_tweets,avg_tweets_len,corpus,proc_corpus
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,intj,False,True,True,False,False,True,True,False,179,104.865922,@Pericles216 @HierBeforeTheAC @Sachinettiyil T...,"[the, pope, is, infallible, ,, this, is, a, ca..."
1,intj,False,True,True,False,False,True,True,False,113,64.362832,@Hispanthicckk Being you makes you look cute @...,"[being, you, make, you, look, cute, on, ,, bec..."
2,intj,False,True,True,False,False,True,True,False,115,89.156522,@Alshymi Les balles sont réelles et sont tirée...,"[le, balles, sont, réelles, et, sont, tirées, ..."
3,intj,False,True,True,False,False,True,True,False,201,27.253731,"I'm like entp but idiotic Hey boy, do you want...","[i'm, like, entp, but, idiotic, hey, boy, ,, d..."
4,intj,False,True,True,False,False,True,True,False,199,42.874372,@kaeshurr1 Give it to @ZargarShanif ... He has...,"[give, it, to, ..., he, ha, pica, since, child..."


In [41]:
# take also a simpler version ignoring sequence and duplicates
user_corpus_df['proc_corpus2'] = user_corpus_df['corpus'].apply(
    lambda x: set(x)) 


In [43]:
def build_dictionary(corpus_list: list): 

    dictionary = defaultdict(set)

    for i,c in enumerate(corpus_list):
        for token in c:
            dictionary[token].add(i)

    return dictionary

dictionary = build_dictionary(user_corpus_df['proc_corpus2'])
print("Number of tokens:\t", len(dictionary.keys()))
print("Number of documents:\t", len(user_corpus_df))

Number of tokens:	 7344
Number of documents:	 7811


In [44]:
def df_entropy(df: pd.DataFrame, class_label: str) -> float: 
    """Calculate entropy of a column in a dataframe"""

    # extimate probability of each class label
    p = df[class_label].value_counts(normalize=True)

    return -sum([p_i * np.log2(p_i) for p_i in p])

H = df_entropy(user_corpus_df, 'type')
H

3.6408431572849023

In [47]:
def terms_information_gain(df: pd.DataFrame, dictionary: dict): 

    ig = dict()

    for term in dictionary.keys():
        
        # extract users who used that term 
        users = list(dictionary[term])

        # create a dataframe with just those users and one with the rest
        df_term = df.iloc[users]
        df_rest = df.drop(index=df_term.index)

        # calculate entropy of each group
        ig[term] = H - len(df_term)/len(df) * df_entropy(df_term, 'type') \
            - len(df_rest)/len(df) * df_entropy(df_rest, 'type')
        
    return ig

terms_ig = terms_information_gain(user_corpus_df, dictionary)    

In [52]:
# select best 500 terms
best_terms_ig = sorted(terms_ig.items(), key=lambda x: x[1], reverse=True)[:500]
print('\n'.join([str(t) for t in best_terms_ig[:10]]), '\n...\n')




('😂', 0.0116509812957859)
('🫶', 0.010801485264146482)
('😭', 0.009700562049179684)
('👏', 0.00784042725313272)
('🥺', 0.007725078810263408)
('🤍', 0.007499834739230327)
('✨', 0.006986911508883864)
('🥰', 0.006986395263362866)
('🤣', 0.006372115636904763)
('️', 0.006345499479745831) 
...



In [55]:
# keep only the best terms in the corpus

user_corpus_df2 = user_corpus_df.copy()

best_terms = [t[0] for t in best_terms_ig]

user_corpus_df2['proc_corpus'] = user_corpus_df2['proc_corpus'].apply(
    lambda x : [t for t in x if t in best_terms])
user_corpus_df2['proc_corpus2'] = user_corpus_df2['proc_corpus2'].apply(
    lambda x : [t for t in x if t in best_terms])

user_corpus_df2.head()

Unnamed: 0_level_0,type,E,I,N,S,F,T,J,P,n_tweets,avg_tweets_len,corpus,proc_corpus,proc_corpus2
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,intj,False,True,True,False,False,True,True,False,179,104.865922,@Pericles216 @HierBeforeTheAC @Sachinettiyil T...,"[,, ’, …, …, …, ’, “, …, ’, ’, …, ’, ’, …, ’, ...","[!, _, ”, 6, …, 👆, (, “, \n, 😎, -, "", @, 😳, ,,..."
1,intj,False,True,True,False,False,True,True,False,113,64.362832,@Hispanthicckk Being you makes you look cute @...,"[,, "", "", ,, ’, 😉, (, ,, ,, ,, ,, ,, …, …, …, ...","[!, _, 😘, 😂, ”, 6, 😉, …, (, “, \n, 😎, -, "", @,..."
2,intj,False,True,True,False,False,True,True,False,115,89.156522,@Alshymi Les balles sont réelles et sont tirée...,"[…, …, "", …, ,, ,, …, ️, (, …, "", "", !, %, ,, ...","[!, _, ✌, 6, …, ‍, 🤏, 😭, (, “, \n, ️, -, "", @,..."
3,intj,False,True,True,False,False,True,True,False,201,27.253731,"I'm like entp but idiotic Hey boy, do you want...","[,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,]","[,, ']"
4,intj,False,True,True,False,False,True,True,False,199,42.874372,@kaeshurr1 Give it to @ZargarShanif ... He has...,"[❤, ️, 🥺, ,, 🥺, ,, %, …, 🥺, ❤, ️, 🥺, 🥺, 🥺, ,, ...","[🥹, _, 😂, 6, 🤧, …, ❤, 🥺, (, 🤣, \n, ️, -, @, 🥲,..."


In [57]:

def bow_to_numeric(bow: List[Set[str]], all_words: List[str]) -> np.ndarray:
    """Turn a list of BOW to a numeric matrix"""

    features_indexes = {w:i for i,w in enumerate(all_words)}

    data = np.zeros((len(bow), len(all_words)))

    for i, doc in enumerate(bow):
        for word in doc: 
            data[i, features_indexes[word]] = 1

    return data



In [68]:
# convert bow to numeric
data = bow_to_numeric(user_corpus_df2['proc_corpus2'], best_terms)

# add n tweets and avg tweets len
data = np.hstack((data, user_corpus_df2[['n_tweets', 'avg_tweets_len']].values))

data.shape

(7811, 502)

In [76]:
# set target (the personality type)

mapping = { t:i for i,t in enumerate(user_corpus_df2['type'].unique()) }

target = np.array([mapping[t] for t in user_corpus_df2['type']])
target.shape

(7811,)

In [71]:
# model selection criteria
from sklearn.model_selection import cross_val_score

# models to apply 
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [82]:
models = [
    DecisionTreeClassifier(),
    MultinomialNB(),
    RandomForestClassifier(),
    # LogisticRegression()
    ]

# cross validation

for model in models:
    scores = cross_val_score(model, data, target, cv=10)
    print(model.__class__.__name__)
    print('Mean accuracy:\t', scores.mean()) 
    print('Std dev :\t', scores.std()) 
    print('\n')


DecisionTreeClassifier
Mean accuracy:	 0.16898428468977078
Std dev :	 0.0739970209418566


MultinomialNB
Mean accuracy:	 0.15491140285095834
Std dev :	 0.006929099162506796


RandomForestClassifier
Mean accuracy:	 0.23428043265405032
Std dev :	 0.07169946437111722




In [83]:
# try predict the single features

scores_f = []

for f in ['E>I', 'N>S', 'F>T', 'J>P', ]:

    t = user_corpus_df2[f].values.astype(float)

    for model in models:
        scores_f.append(
            (f, model.__class__.__name__,
            cross_val_score(model, data, t, cv=10))
        )



In [87]:
for f, model, scores in scores_f:
    print(f, model, round(scores.mean(), 2), sep="\t")

E	DecisionTreeClassifier	0.59
E	MultinomialNB	0.62
E	RandomForestClassifier	0.68
I	DecisionTreeClassifier	0.58
I	MultinomialNB	0.62
I	RandomForestClassifier	0.68
N	DecisionTreeClassifier	0.68
N	MultinomialNB	0.69
N	RandomForestClassifier	0.79
S	DecisionTreeClassifier	0.67
S	MultinomialNB	0.69
S	RandomForestClassifier	0.78
T	DecisionTreeClassifier	0.55
T	MultinomialNB	0.56
T	RandomForestClassifier	0.62
F	DecisionTreeClassifier	0.55
F	MultinomialNB	0.56
F	RandomForestClassifier	0.62
J	DecisionTreeClassifier	0.56
J	MultinomialNB	0.57
J	RandomForestClassifier	0.61
P	DecisionTreeClassifier	0.56
P	MultinomialNB	0.57
P	RandomForestClassifier	0.61
