# (MBTI) Myers-Briggs Personality Type Prediction

* Extroversion vs. Introversion
    * I - 0
    * E - 1 
    
* Sensing vs. Intuition 
    * N - 0 
    * S - 1
    
* Thinking vs. Feeling
    * F - 0
    * T - 1
    
* Judging vs. Perceiving
    * P - 0
    * J - 1 
    
## COUNTS & VECTORIZATION

In [1]:
# importing dependencies here
import numpy as np
import pandas as pd
import os

# feature engineering
import re
import nltk

# vectorization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# performance check
import time

In [2]:
# load clean_data_2.csv from the sibling "data" directory into a DataFrame
personality_data = pd.read_csv(
    filepath_or_buffer=os.path.join("..", "data", "clean_data_2.csv")
)

In [3]:
# looking at the top 5 rows of the dataset
personality_data.head(n=5)

Unnamed: 0,type,is_Extrovert,is_Sensing,is_Thinking,is_Judging,posts,clean_posts,compound_sentiment,pos_sentiment,neg_sentiment,...,ADV_avg,CONJ_avg,DET_avg,NOUN_avg,NUM_avg,PRT_avg,PRON_avg,VERB_avg,._avg,X_avg
0,INTJ,0,0,1,1,"'Don’t peg your rate low, peg it high. An appe...",peg rate low peg high appeal expertise want...,0.9999,0.332011,0.098166,...,2.0,3.0,2.0,6.0,0.0,0.0,3.0,6.0,4.0,0.0
1,INTP,0,0,1,0,'...you get Kid Rock.|||Now it is. Plural: 2sh...,get kid rock plural shy happened skim ...,0.99995,0.312169,0.088457,...,4.0,7.0,5.0,13.0,0.0,0.0,4.0,13.0,8.0,0.0
2,INFJ,0,0,0,1,'To me it seems that Stoicism has been reduced...,seems stoicism reduced colloquial usage beco...,0.9992,0.304233,0.15534,...,1.0,2.0,1.0,4.0,0.0,0.0,2.0,3.0,2.0,0.0
3,INTP,0,0,1,0,'unBELIEVABLY based...same with physics|||yeah...,unbelievably based physic yeah got material...,0.9997,0.392857,0.081985,...,1.0,2.0,1.0,5.0,0.0,0.0,2.0,4.0,1.0,0.0
4,INTJ,0,0,1,1,"'Tbh, Ne doms are very fun chaotic people that...",tbh doms fun chaotic people talk hour mig...,0.9999,0.416667,0.100324,...,3.0,4.0,2.0,5.0,0.0,0.0,3.5,6.0,2.0,0.0


In [4]:
# checking the number of rows and columns; `shape` is a (rows, columns) tuple
personality_data.shape

(3933, 113)

### Feature Engineering - III

#### COUNTING

#### Question/Exclamation/Colon/Emoji Count

In [5]:
def unique_words(s, divisor=50):
    """Return the number of distinct space-separated tokens in ``s``, scaled.

    Parameters
    ----------
    s : str
        Text to analyse.
    divisor : int or float, default 50
        Value the distinct-token count is divided by. The default reproduces
        the constant this notebook hard-codes for every count feature
        (presumably the number of posts per user -- confirm against the data).

    Returns
    -------
    float
        ``len(distinct tokens) / divisor``.

    Notes
    -----
    Tokens are obtained with ``s.split(" ")``: only the single space character
    separates tokens, so consecutive spaces yield an empty-string token (which
    counts as one distinct token) and tabs/newlines do not split. Matching is
    case-sensitive.
    """
    distinct_tokens = set(s.split(" "))
    return len(distinct_tokens) / divisor


def emojis(post, divisor=50):
    """Return a scaled count of ``:word:``-style emoji tokens in ``post``.

    Parameters
    ----------
    post : str
        Text to scan; it is split on any whitespace.
    divisor : int or float, default 50
        Value the raw count is divided by. The default reproduces the
        constant this notebook hard-codes for every count feature
        (presumably the number of posts per user -- confirm against the data).

    Returns
    -------
    float
        ``emoji_count / divisor``.

    Notes
    -----
    This is a heuristic: a token counts as one emoji when it does not contain
    ``"http"`` and holds exactly two ``":"`` characters. Emojis made purely
    from symbols (e.g. ``:)``) are therefore not counted, and two emojis glued
    together into one token (four colons) are not counted either.
    """
    # does not include emojis made purely from symbols, only :word:
    emoji_count = sum(
        1
        for token in post.split()
        # tokens containing "http" are skipped so URL colons are ignored
        if "http" not in token and token.count(":") == 2
    )
    return emoji_count / divisor


def colons(post, divisor=50):
    """Return a scaled count of ``":"`` characters in ``post``, ignoring links.

    Parameters
    ----------
    post : str
        Text to scan; it is split on any whitespace.
    divisor : int or float, default 50
        Value the raw count is divided by. The default reproduces the
        constant this notebook hard-codes for every count feature
        (presumably the number of posts per user -- confirm against the data).

    Returns
    -------
    float
        ``colon_count / divisor``.

    Notes
    -----
    Colons inside ``:word:`` emojis are included. Any whitespace-delimited
    token containing ``"http"`` is skipped entirely, so the colon in a URL
    scheme (and any other colon in that token) is not counted.
    """
    # Includes colons used in emojis
    colon_count = sum(
        token.count(":") for token in post.split() if "http" not in token
    )
    return colon_count / divisor

In [6]:
# question marks / exclamation marks: raw character counts divided by 50
personality_data["qm"] = personality_data["posts"].map(
    lambda text: text.count("?") / 50
)
personality_data["em"] = personality_data["posts"].map(
    lambda text: text.count("!") / 50
)
# colon and emoji features come from the helper functions defined above
personality_data["colons"] = personality_data["posts"].map(colons)
personality_data["emojis"] = personality_data["posts"].map(emojis)

#### Word Count

In [7]:
# word count is approximated as (number of spaces + 1), then divided by 50
personality_data["word_count"] = personality_data["posts"].map(
    lambda text: (1 + text.count(" ")) / 50
)
# distinct space-separated tokens, scaled inside unique_words
personality_data["unique_words"] = personality_data["posts"].map(unique_words)

#### Word Stats

* CAUTION - This cell may take a long time to run!

In [8]:
# stats: variance of the per-post word counts (posts are separated by "|||")
# note: the avg_word_ct feature (word_count / 50) is not computed here

t = time.time()

personality_data["post_length_var"] = personality_data["posts"].map(
    lambda text: np.var([len(chunk.split()) for chunk in text.split("|||")])
)

print(f"Time Taken: {time.time() - t}")

Time Taken: 1.3342020511627197


#### Upper Case Count

In [9]:
# number of whitespace-delimited tokens for which str.isupper() is true, divided by 50
personality_data["upper"] = personality_data["posts"].map(
    lambda text: sum(token.isupper() for token in text.split()) / 50
)

#### Link Count

In [10]:
# links are approximated by occurrences of the substring "http", divided by 50
personality_data["link_count"] = personality_data["posts"].map(
    lambda text: text.count("http") / 50
)

#### Ellipses Count

In [11]:
# count occurrences of "..." followed by a space in each row, divided by 50
ellipses_count = [
    sum(1 for _ in re.finditer(r"\.\.\.\ ", text)) / 50
    for text in personality_data["posts"]
]
personality_data["ellipses"] = ellipses_count

#### Image Count

In [12]:
# count image-file extensions (.jpg, .jpeg, .gif, .png) in each row, divided by 50
personality_data["img_count"] = [
    sum(1 for _ in re.finditer(r"(\.jpg)|(\.jpeg)|(\.gif)|(\.png)", text)) / 50
    for text in personality_data["posts"]
]

In [13]:
# preview the first two rows, including the newly added count columns
personality_data.head(n=2)

Unnamed: 0,type,is_Extrovert,is_Sensing,is_Thinking,is_Judging,posts,clean_posts,compound_sentiment,pos_sentiment,neg_sentiment,...,em,colons,emojis,word_count,unique_words,post_length_var,upper,link_count,ellipses,img_count
0,INTJ,0,0,1,1,"'Don’t peg your rate low, peg it high. An appe...",peg rate low peg high appeal expertise want...,0.9999,0.332011,0.098166,...,0.08,0.16,0.0,56.22,24.66,6718.126848,2.06,0.0,0.04,0.0
1,INTP,0,0,1,0,'...you get Kid Rock.|||Now it is. Plural: 2sh...,get kid rock plural shy happened skim ...,0.99995,0.312169,0.088457,...,0.2,2.04,0.0,136.48,48.64,15882.597104,6.98,0.0,0.34,0.0


In [14]:
# checking the data types to make sure they still look good
# (the count features added above are expected to show up as float64)
personality_data.dtypes

type                object
is_Extrovert         int64
is_Sensing           int64
is_Thinking          int64
is_Judging           int64
                    ...   
post_length_var    float64
upper              float64
link_count         float64
ellipses           float64
img_count          float64
Length: 124, dtype: object

In [15]:
# re-check the per-column count of missing values after adding the new features
personality_data.isna().sum()

type               0
is_Extrovert       0
is_Sensing         0
is_Thinking        0
is_Judging         0
                  ..
post_length_var    0
upper              0
link_count         0
ellipses           0
img_count          0
Length: 124, dtype: int64

In [16]:
# write the frame with the count features to clean_data_3.csv (index not written)
personality_data.to_csv(
    path_or_buf=os.path.join("..", "data", "clean_data_3.csv"),
    index=False,
)

### Vectorize - For analysis purposes only. For the model, the vectorization will be added to the pipeline.

In [17]:
# TF-IDF representation of the cleaned posts.
# Terms must appear in at least 25 documents and in at most 80% of them.

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, min_df=25)
tfidf_words = tfidf_vectorizer.fit_transform(personality_data["clean_posts"])

# densify the sparse matrix and label each column with its vocabulary term
tfidf_vectorized_data = pd.DataFrame(
    tfidf_words.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

In [18]:
# preview the first 5 rows of the TF-IDF matrix
tfidf_vectorized_data.head(n=5)

Unnamed: 0,aaa,aaaa,aaron,aback,abandon,abandoned,abandoning,abandonment,abbreviation,abc,...,zodiac,zombie,zone,zoned,zoning,zoo,zoom,zoomed,zuko,zur
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0181,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.027067,0.050386,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# write the TF-IDF matrix to tfidf_vectorized_data.csv (index not written)
tfidf_vectorized_data.to_csv(
    path_or_buf=os.path.join("..", "data", "tfidf_vectorized_data.csv"),
    index=False,
)

In [20]:
# Raw term counts of the cleaned posts.
# Terms must appear in at least 25 documents and in at most 80% of them.

count_vectorizer = CountVectorizer(max_df=0.8, min_df=25, decode_error="ignore")
count_words = count_vectorizer.fit_transform(personality_data["clean_posts"])

# densify the sparse matrix and label each column with its vocabulary term
count_vectorized_data = pd.DataFrame(
    count_words.toarray(),
    columns=count_vectorizer.get_feature_names_out(),
)

In [21]:
# preview the first 5 rows of the term-count matrix
count_vectorized_data.head(n=5)

Unnamed: 0,aaa,aaaa,aaron,aback,abandon,abandoned,abandoning,abandonment,abbreviation,abc,...,zodiac,zombie,zone,zoned,zoning,zoo,zoom,zoomed,zuko,zur
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0


In [22]:
# write the term-count matrix to count_vectorized_data.csv (index not written)
count_vectorized_data.to_csv(
    path_or_buf=os.path.join("..", "data", "count_vectorized_data.csv"),
    index=False,
)