# (MBTI) Myers-Briggs Personality Type Prediction

* Extroversion vs. Introversion
    * I - 0
    * E - 1 
    
* Sensing vs. Intuition 
    * N - 0 
    * S - 1
    
* Thinking vs. Feeling
    * F - 0
    * T - 1
    
* Judging vs. Perceiving
    * P - 0
    * J - 1 

In [1]:
# importing dependencies here
import numpy as np
import pandas as pd

# feature engineering
import re
import nltk
# from nltk.corpus import stopwords

# nltk.download("stopwords")
# nltk.download("punkt")
# nltk.download("averaged_perceptron_tagger")
from nltk.tokenize import word_tokenize, sent_tokenize

# sentiment scoring
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# scaling to handle negative values in sentiment scores (for Naive Bayes)
from sklearn.preprocessing import MinMaxScaler

# performance check
import time

# code formatter
%load_ext nb_black

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eshom\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eshom\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\eshom\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\eshom\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\eshom\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


<IPython.core.display.Javascript object>

### Feature Engineering - II

In [2]:
# reading clean_data_1.csv (the first cleaned dataset); the path is relative to
# the notebook's working directory
personality_data = pd.read_csv("data_ekta/clean_data_1.csv")

<IPython.core.display.Javascript object>

In [3]:
# looking at the first 5 rows of the dataset (head() defaults to 5 rows)
personality_data.head()

Unnamed: 0,type,is_Extrovert,is_Sensing,is_Thinking,is_Judging,posts,clean_posts
0,INFJ,0,0,0,1,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,youtube moment youtube sportscenter top t...
1,ENTP,1,0,1,0,'I'm finding the lack of me in these posts ver...,im finding lack post alarming sex boring posit...
2,INTP,0,0,1,0,'Good one _____ https://www.youtube.com/wat...,good one youtube course say know thats ble...
3,INTJ,0,0,1,1,"'Dear INTP, I enjoyed our conversation the o...",dear enjoyed conversation day esoteric gab...
4,ENTJ,1,0,1,1,'You're fired.|||That's another silly misconce...,youre fired thats another silly misconception ...


<IPython.core.display.Javascript object>

In [4]:
# checking the number of rows and columns, returned as a (rows, columns) tuple
personality_data.shape

(8675, 7)

<IPython.core.display.Javascript object>

#### Checking for Null values

In [5]:
# counting missing values per column (0 everywhere means no nulls)
personality_data.isnull().sum()

type            0
is_Extrovert    0
is_Sensing      0
is_Thinking     0
is_Judging      0
posts           0
clean_posts     0
dtype: int64

<IPython.core.display.Javascript object>

Luckily there are no missing values present in this dataset.

### Sentiment Analysis Scores

* CAUTION - Sentiment scoring takes a LONG time (about 14 minutes in the run recorded below) !!

In [6]:
# Score every user's cleaned text with VADER, keeping one score dict per row.
t = time.time()

analyzer = SentimentIntensityAnalyzer()

# Each entry is the dict returned by polarity_scores; its "compound", "pos",
# "neg" and "neu" values are what the rest of the notebook reads.
nlp_sentiment_score = [
    analyzer.polarity_scores(user_text)
    for user_text in personality_data["clean_posts"]
]

print(f"Sentiment Scoring Time: {time.time() - t:.2f} seconds")

Sentiment Scoring Time: 821.68 seconds


<IPython.core.display.Javascript object>

In [7]:
# Splitting the per-user score dicts into one column per score type:
# compound, positive, negative and neutral.
sentiment_columns = {
    "compound_sentiment": "compound",
    "pos_sentiment": "pos",
    "neg_sentiment": "neg",
    "neu_sentiment": "neu",
}

# Dict order is insertion order, so the columns are added in the order above.
for column_name, score_key in sentiment_columns.items():
    personality_data[column_name] = [
        entry[score_key] for entry in nlp_sentiment_score
    ]

<IPython.core.display.Javascript object>

In [8]:
# Sentiment scores have negative values that Naive Bayes can't handle, so each
# sentiment column is rescaled on its own with MinMaxScaler.

min_max_scaler = MinMaxScaler()

# The same scaler object is re-fitted for every column, in this order, so after
# the loop it holds the fit of the last column only.
for sentiment_col in (
    "compound_sentiment",
    "pos_sentiment",
    "neg_sentiment",
    "neu_sentiment",
):
    # fit_transform expects a 2-D array: one row per user, a single feature.
    column_values = personality_data[sentiment_col].to_numpy().reshape(-1, 1)
    personality_data[sentiment_col] = min_max_scaler.fit_transform(column_values)

<IPython.core.display.Javascript object>

In [9]:
# checking whether adding the sentiment score columns introduced any null values
personality_data.isnull().sum()

type                  0
is_Extrovert          0
is_Sensing            0
is_Thinking           0
is_Judging            0
posts                 0
clean_posts           0
compound_sentiment    0
pos_sentiment         0
neg_sentiment         0
neu_sentiment         0
dtype: int64

<IPython.core.display.Javascript object>

### POS Tagging

In [10]:
# Creating the tag_posts column: for every user, a list with one string per post
# (the raw "posts" field separates individual posts with "|||"), with every URL
# collapsed to a single word.

# Replacing each URL with the first host label after an optional "www."
# (e.g. "https://www.youtube.com/watch?v=x" -> "youtube").
# - "(www\.)?" consumes the prefix as a unit. The previous "(www)?.?" let ".?"
#   swallow the first letter of hosts without "www." ("youtube.com" -> "outube").
# - regex=True is passed explicitly: pandas >= 2.0 defaults to literal matching
#   and raises when given a compiled pattern or a callable replacement.
personality_data["tag_posts"] = personality_data["posts"].str.replace(
    re.compile(r"https?://(www\.)?([A-Za-z_0-9-]+)(\S)*"),
    lambda match: match.group(2),
    regex=True,
)

# Splitting each user's text on the "|||" separator. The pattern is a raw string
# so the escaped pipes do not depend on Python tolerating the invalid "\|" escape.
personality_data["tag_posts"] = personality_data["tag_posts"].str.split(r"\|\|\|")

<IPython.core.display.Javascript object>

* CAUTION - The next step, i.e. part-of-speech tagging of every word, takes a VERY LONG time (about 18 minutes in the run recorded below) !!!

In [11]:
# parts of speech tagging for each word of each post
t = time.time()


def _tag_each_post(posts):
    """Tokenize and POS-tag every post; returns one list of (token, tag) pairs per post."""
    return [nltk.pos_tag(word_tokenize(post_text)) for post_text in posts]


# Each row of tag_posts is a list of post strings, so each row of tagged_words
# becomes a list (one entry per post) of tagged-token lists.
personality_data["tagged_words"] = personality_data["tag_posts"].apply(
    _tag_each_post
)

print(f"POS Tagging Time: {time.time() - t} seconds")

POS Tagging Time: 1075.6390972137451 seconds


<IPython.core.display.Javascript object>

In [12]:
# creating list of unique POS tags found anywhere in the tagged posts
tag_set = set()

# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
for _, tagged_posts in personality_data["tagged_words"].items():
    # Scan every post of the user, not only the first one, so a tag that never
    # occurs in a first post is still collected.
    for tagged_post in tagged_posts:
        tag_set.update(pair[1] for pair in tagged_post)

# Sorted so the list (and anything derived from it) has the same order on every
# run; iterating a set of strings can differ between interpreter sessions.
tag_list = sorted(tag_set)

<IPython.core.display.Javascript object>

In [13]:
# calculating, for each user, the mean and standard deviation of the per-post
# counts of every POS tag; t marks the start time for the timing printout
t = time.time()


def pos_cat(x, tag):
    """Count, for each tagged post, how many tokens carry the given POS tag.

    Parameters
    ----------
    x : iterable
        Tagged posts; each post is an iterable of items whose element at
        index 1 is the POS tag (e.g. ``(token, tag)`` pairs).
    tag : str
        The POS tag to count.

    Returns
    -------
    list of int
        One count per post, in the same order as ``x``.
    """
    counts = []
    for line in x:
        matches = 0
        for y in line:
            if y[1] == tag:
                matches += 1
        counts.append(matches)
    return counts


tagged_words = personality_data["tagged_words"]

for pos_name in tag_list:
    # For this tag, add the mean column first and the std column second.
    # np.std is used with its defaults.
    for suffix, summarise in (("_mean", np.mean), ("_std", np.std)):
        personality_data["POS_" + pos_name + suffix] = tagged_words.apply(
            lambda posts: summarise(pos_cat(posts, pos_name))
        )

print(f"POS Stats Time: {time.time() - t} seconds")

POS Stats Time: 144.66406536102295 seconds


<IPython.core.display.Javascript object>

In [14]:
# grouping pos tags based on stanford list
# Each pair is (group label, fine-grained tags counted under that label).
# A tag that is not listed in any group is not counted by any group.
tags_dict = dict(
    (
        ("ADJ", ["JJ", "JJR", "JJS"]),
        ("ADP", ["EX", "TO"]),
        ("ADV", ["RB", "RBR", "RBS", "WRB"]),
        ("CONJ", ["CC", "IN"]),
        ("DET", ["DT", "PDT", "WDT"]),
        ("NOUN", ["NN", "NNS", "NNP", "NNPS"]),
        ("NUM", ["CD"]),
        ("PRT", ["RP"]),
        ("PRON", ["PRP", "PRP$", "WP", "WP$"]),
        ("VERB", ["MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]),
        (".", ["#", "$", "''", "(", ")", ",", ".", ":"]),
        ("X", ["FW", "LS", "UH"]),
    )
)

<IPython.core.display.Javascript object>

In [15]:
# Stanford POS tag stats: per-user median and standard deviation of the per-post
# counts for each tag group in tags_dict; t marks the start time for the timing printout
t = time.time()

def stanford_tag(x, tag, groups=None):
    """Count, for each tagged post, the tokens whose POS tag belongs to a tag group.

    Parameters
    ----------
    x : iterable
        Tagged posts; each post is an iterable of items whose element at
        index 1 is the POS tag (e.g. ``(token, tag)`` pairs).
    tag : str
        Key of the group to count, e.g. ``"NOUN"``.
    groups : mapping, optional
        Maps a group key to the collection of fine-grained tags it contains.
        Defaults to the notebook-level ``tags_dict``.

    Returns
    -------
    list of int
        One count per post, in the same order as ``x``.

    Raises
    ------
    KeyError
        If ``tag`` is not a key of the group mapping.
    """
    if groups is None:
        groups = tags_dict
    # Look the group up through the `tag` argument. The earlier version read the
    # global loop variable `col`, so it only worked when called from a loop whose
    # variable happened to have that name.
    members = groups[tag]
    tags_list = [len([y for y in line if y[1] in members]) for line in x]
    return tags_list


tagged_words = personality_data["tagged_words"]

for col in tags_dict:
    # For this tag group, add the median column first and the std column second.
    # np.std is used with its defaults.
    for suffix, summarise in (("_med", np.median), ("_std", np.std)):
        personality_data["S_" + col + suffix] = tagged_words.apply(
            lambda posts: summarise(stanford_tag(posts, col))
        )

print(f"Stanford POS Stats Time: {time.time() - t} seconds")

Stanford POS Stats Time: 80.72707939147949 seconds


<IPython.core.display.Javascript object>

In [18]:
# a quick look at the first 5 rows, now including the sentiment and POS feature columns
personality_data.head()

Unnamed: 0,type,is_Extrovert,is_Sensing,is_Thinking,is_Judging,posts,clean_posts,compound_sentiment,pos_sentiment,neg_sentiment,...,S_PRT_med,S_PRT_std,S_PRON_med,S_PRON_std,S_VERB_med,S_VERB_std,S_._med,S_._std,S_X_med,S_X_std
0,INFJ,0,0,0,1,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,youtube moment youtube sportscenter top t...,0.997599,0.352861,0.292958,...,0.0,0.235294,1.0,1.719016,2.0,2.90403,2.5,1.911538,0.0,0.0
1,ENTP,1,0,1,0,'I'm finding the lack of me in these posts ver...,im finding lack post alarming sex boring posit...,0.99925,0.361035,0.349296,...,0.0,0.362433,4.0,3.036079,6.0,3.768878,4.0,1.995551,0.0,0.147406
2,INTP,0,0,1,0,'Good one _____ https://www.youtube.com/wat...,good one youtube course say know thats ble...,0.9993,0.399183,0.315493,...,0.0,0.20608,2.0,2.091252,4.0,3.347673,3.0,2.468968,0.0,0.284583
3,INTJ,0,0,1,1,"'Dear INTP, I enjoyed our conversation the o...",dear enjoyed conversation day esoteric gab...,0.9991,0.311989,0.259155,...,0.0,0.199826,3.0,2.547381,5.0,3.856587,3.0,2.005093,0.0,0.199826
4,ENTJ,1,0,1,1,'You're fired.|||That's another silly misconce...,youre fired thats another silly misconception ...,0.974042,0.318801,0.546479,...,0.0,0.284583,3.0,2.122891,5.0,3.964285,3.0,2.339568,0.0,0.20608


<IPython.core.display.Javascript object>

In [19]:
# Sentiment scoring & POS Tagging took long. So saving the scored & tagged file to save time in the next step.
# index=False leaves the row index out of the file.
# NOTE(review): tag_posts and tagged_words hold Python lists, which a CSV stores
# as text - confirm the next step expects (or drops) those columns.
personality_data.to_csv("data_ekta/clean_data_2.csv", index=False)

<IPython.core.display.Javascript object>