# (MBTI) Myers-Briggs Personality Type Prediction

* Extroversion vs. Introversion
    * I - 0
    * E - 1 
    
* Sensing vs. Intuition 
    * N - 0 
    * S - 1
    
* Thinking vs. Feeling
    * F - 0
    * T - 1
    
* Judging vs. Perceiving
    * P - 0
    * J - 1 

In [1]:
# importing dependencies here
import numpy as np
import pandas as pd

# feature engineering
import re
import nltk

# vectorization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# performance check
import time

# code formatter
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# reading the clean_data_2 file
personality_data = pd.read_csv("data_ekta/clean_data_2.csv")

<IPython.core.display.Javascript object>

In [3]:
personality_data.columns

Index(['type', 'is_Extrovert', 'is_Sensing', 'is_Thinking', 'is_Judging',
       'posts', 'clean_posts', 'compound_sentiment', 'pos_sentiment',
       'neg_sentiment',
       ...
       'S_PRT_med', 'S_PRT_std', 'S_PRON_med', 'S_PRON_std', 'S_VERB_med',
       'S_VERB_std', 'S_._med', 'S_._std', 'S_X_med', 'S_X_std'],
      dtype='object', length=127)

<IPython.core.display.Javascript object>

In [4]:
# lookign at the top 5 rows of the dataset
personality_data.head()

Unnamed: 0,type,is_Extrovert,is_Sensing,is_Thinking,is_Judging,posts,clean_posts,compound_sentiment,pos_sentiment,neg_sentiment,...,S_PRT_med,S_PRT_std,S_PRON_med,S_PRON_std,S_VERB_med,S_VERB_std,S_._med,S_._std,S_X_med,S_X_std
0,INFJ,0,0,0,1,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,youtube moment youtube sportscenter top t...,0.997599,0.352861,0.292958,...,0.0,0.235294,1.0,1.719016,2.0,2.90403,2.5,1.911538,0.0,0.0
1,ENTP,1,0,1,0,'I'm finding the lack of me in these posts ver...,im finding lack post alarming sex boring posit...,0.99925,0.361035,0.349296,...,0.0,0.362433,4.0,3.036079,6.0,3.768878,4.0,1.995551,0.0,0.147406
2,INTP,0,0,1,0,'Good one _____ https://www.youtube.com/wat...,good one youtube course say know thats ble...,0.9993,0.399183,0.315493,...,0.0,0.20608,2.0,2.091252,4.0,3.347673,3.0,2.468968,0.0,0.284583
3,INTJ,0,0,1,1,"'Dear INTP, I enjoyed our conversation the o...",dear enjoyed conversation day esoteric gab...,0.9991,0.311989,0.259155,...,0.0,0.199826,3.0,2.547381,5.0,3.856587,3.0,2.005093,0.0,0.199826
4,ENTJ,1,0,1,1,'You're fired.|||That's another silly misconce...,youre fired thats another silly misconception ...,0.974042,0.318801,0.546479,...,0.0,0.284583,3.0,2.122891,5.0,3.964285,3.0,2.339568,0.0,0.20608


<IPython.core.display.Javascript object>

In [5]:
# checking the number of rows and columns
personality_data.shape

(8675, 127)

<IPython.core.display.Javascript object>

### Feature Engineering - III

#### COUNTING

#### Question/Exclamation/Colon/Emoji Count

In [6]:
def unique_words(s):
    unique = set(s.split(" "))
    return len(unique)


def emojis(post):
    # does not include emojis made purely from symbols, only :word:
    emoji_count = 0
    words = post.split()
    for e in words:
        if "http" not in e:
            if e.count(":") == 2:
                emoji_count += 1
    return emoji_count


def colons(post):
    # Includes colons used in emojis
    colon_count = 0
    words = post.split()
    for e in words:
        if "http" not in e:
            colon_count += e.count(":")
    return colon_count

<IPython.core.display.Javascript object>

In [7]:
personality_data["qm"] = personality_data["posts"].apply(lambda s: s.count("?"))
personality_data["em"] = personality_data["posts"].apply(lambda s: s.count("!"))
personality_data["colons"] = personality_data["posts"].apply(colons)
personality_data["emojis"] = personality_data["posts"].apply(emojis)

<IPython.core.display.Javascript object>

#### Word Count

In [8]:
personality_data["word_count"] = personality_data["posts"].apply(
    lambda s: s.count(" ") + 1
)
personality_data["unique_words"] = personality_data["posts"].apply(unique_words)

<IPython.core.display.Javascript object>

#### Word Stats

* CAUTION - This will take Long !!

In [9]:
# stats

t = time.time()

personality_data["avg_word_ct"] = personality_data["word_count"].apply(lambda s: s / 50)

personality_data["post_length_var"] = personality_data["posts"].apply(
    lambda x: np.var([len(post.split()) for post in x.split("|||")])
)

personality_data["med_char"] = personality_data["tagged_words"].apply(
    lambda x: np.median([len(i) for i in x])
)

personality_data["med_word"] = personality_data["tagged_words"].apply(
    lambda x: np.median([len(i.split()) for i in x])
)

print(f"Time Taken: {time.time() - t}")

Time Taken: 112.10022401809692


<IPython.core.display.Javascript object>

#### Upper Case Count

In [10]:
personality_data["upper"] = personality_data["posts"].apply(
    lambda x: len([x for x in x.split() if x.isupper()])
)

<IPython.core.display.Javascript object>

#### Link Count

In [11]:
personality_data["link_count"] = personality_data["posts"].apply(
    lambda s: s.count("http")
)

<IPython.core.display.Javascript object>

#### Ellipses Count

In [12]:
ellipses_count = [
    len(re.findall(r"\.\.\.\ ", posts)) for posts in personality_data["posts"]
]
personality_data["ellipses"] = ellipses_count

<IPython.core.display.Javascript object>

#### Image Count

In [13]:
personality_data["img_count"] = [
    len(re.findall(r"(\.jpg)|(\.jpeg)|(\.gif)|(\.png)", post))
    for post in personality_data["posts"]
]

<IPython.core.display.Javascript object>

In [14]:
personality_data.head(2)

Unnamed: 0,type,is_Extrovert,is_Sensing,is_Thinking,is_Judging,posts,clean_posts,compound_sentiment,pos_sentiment,neg_sentiment,...,word_count,unique_words,avg_word_ct,post_length_var,med_char,med_word,upper,link_count,ellipses,img_count
0,INFJ,0,0,0,1,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,youtube moment youtube sportscenter top t...,0.997599,0.352861,0.292958,...,578,376,11.56,135.29,1.0,1.0,13,24,7,7
1,ENTP,1,0,1,0,'I'm finding the lack of me in these posts ver...,im finding lack post alarming sex boring posit...,0.99925,0.361035,0.349296,...,1194,596,23.88,187.4756,1.0,1.0,82,10,0,8


<IPython.core.display.Javascript object>

In [15]:
# checking the data types to make sure they still look good
personality_data.dtypes

type             object
is_Extrovert      int64
is_Sensing        int64
is_Thinking       int64
is_Judging        int64
                 ...   
med_word        float64
upper             int64
link_count        int64
ellipses          int64
img_count         int64
Length: 141, dtype: object

<IPython.core.display.Javascript object>

In [16]:
# checking for null values again
personality_data.isnull().sum()

type            0
is_Extrovert    0
is_Sensing      0
is_Thinking     0
is_Judging      0
               ..
med_word        0
upper           0
link_count      0
ellipses        0
img_count       0
Length: 141, dtype: int64

<IPython.core.display.Javascript object>

#### Dropping posts with less than 2 words

In [17]:
# personality_data.drop(
#     personality_data[personality_data["word_count"] < 2].index, inplace=True
# )

<IPython.core.display.Javascript object>

In [18]:
# personality_data.reset_index(drop=True, inplace=True)

<IPython.core.display.Javascript object>

In [19]:
# personality_data.head(2)

<IPython.core.display.Javascript object>

In [20]:
# Saving the data with counts
personality_data.to_csv("data_ekta/clean_data_3.csv", index=False)

<IPython.core.display.Javascript object>

### Vectorize

In [27]:
# Using TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(
    min_df=0.05, max_df=0.85, analyzer="word", ngram_range=(1, 2), max_features=1500
)
tfidf_words = tfidf_vectorizer.fit_transform(personality_data["clean_posts"])

tfidf_vectorized_data = pd.DataFrame(
    data=tfidf_words.toarray(), columns=tfidf_vectorizer.get_feature_names()
)

<IPython.core.display.Javascript object>

In [28]:
tfidf_vectorized_data.head()

Unnamed: 0,ability,able,absolutely,abstract,accept,according,account,accurate,across,act,...,youd,youll,young,younger,youre,youre right,youtube,youtube youtube,youve,yup
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.028597,0.0,0.551125,0.185773,0.048938,0.0
1,0.0,0.038241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.103125,0.0,0.031054,0.0,0.0,0.0
2,0.122044,0.044139,0.106094,0.0,0.0,0.0,0.0,0.063832,0.0,0.0,...,0.0,0.0,0.0,0.0,0.089273,0.075764,0.107532,0.0,0.0,0.081498
3,0.0,0.070525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.045747,0.0,0.0,0.142639,0.0,0.057271,0.0,0.081367,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.027405,0.0,0.03301,0.0,0.0,0.0


<IPython.core.display.Javascript object>

In [29]:
# Saving the TF-IDF vectorized data
tfidf_vectorized_data.to_csv("data_ekta/tfidf_vectorized_data.csv", index=False)

<IPython.core.display.Javascript object>

In [24]:
# Using CountVectorizer

count_vectorizer = CountVectorizer(
    analyzer="word",
    stop_words="english",
    input="content",
    decode_error="ignore",
    min_df=0.05,
    max_df=0.90,
    token_pattern=r"\w{1,}",
    max_features=2000,
    ngram_range=(1, 2),
)

count_words = count_vectorizer.fit_transform(personality_data["clean_posts"])
count_vectorized_data = pd.DataFrame(
    data=count_words.toarray(), columns=count_vectorizer.get_feature_names()
)

<IPython.core.display.Javascript object>

In [25]:
count_vectorized_data.head()

Unnamed: 0,ability,able,absolute,absolutely,abstract,accept,according,account,accurate,act,...,youll,young,younger,youre,youre going,youre right,youtube,youtube youtube,youve,yup
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,16,3,1,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,4,0,0,1,0,0,0
2,2,1,0,2,0,0,0,0,1,0,...,0,0,0,3,0,1,3,0,0,1
3,0,2,1,0,0,0,0,0,0,0,...,1,0,0,6,0,0,2,0,2,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0


<IPython.core.display.Javascript object>

In [26]:
# Saving the Count vectorized data
count_vectorized_data.to_csv("data_ekta/count_vectorized_data.csv", index=False)

<IPython.core.display.Javascript object>