# (MBTI) Myers-Briggs Personality Type Prediction

* Extroversion vs. Introversion
    * I - 0
    * E - 1 
    
* Sensing vs. Intuition 
    * N - 0 
    * S - 1
    
* Thinking vs. Feeling
    * F - 0
    * T - 1
    
* Judging vs. Perceiving
    * P - 0
    * J - 1 
    

## SENTIMENT ANALYSIS & PART OF SPEECH TAGGING

In [1]:
# importing dependencies here
import numpy as np
import pandas as pd
import os

# feature engineering
import re

# pos tagging
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download("punkt")
nltk.download('averaged_perceptron_tagger')

# sentiment scoring
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# scaling to handle negative values in sentiment scores (for Naive Bayes)
from sklearn.preprocessing import MinMaxScaler

# performance check
import time

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gulev\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\gulev\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Feature Engineering - II

In [2]:
# reading the clean_dataset_1
personality_data = pd.read_csv(os.path.join("..", "data", "clean_data_1.csv"))

In [3]:
# looking at the top 5 rows of the dataset
personality_data.head()

Unnamed: 0,type,is_Extrovert,is_Sensing,is_Thinking,is_Judging,posts,clean_posts
0,INTJ,0,0,1,1,"'Don’t peg your rate low, peg it high. An appe...",peg rate low peg high appeal expertise want...
1,INTP,0,0,1,0,'...you get Kid Rock.|||Now it is. Plural: 2sh...,get kid rock plural shy happened skim ...
2,INFJ,0,0,0,1,'To me it seems that Stoicism has been reduced...,seems stoicism reduced colloquial usage beco...
3,INTP,0,0,1,0,'unBELIEVABLY based...same with physics|||yeah...,unbelievably based physic yeah got material...
4,INTJ,0,0,1,1,"'Tbh, Ne doms are very fun chaotic people that...",tbh doms fun chaotic people talk hour mig...


In [4]:
# checking the number of rows and columns
personality_data.shape

(3933, 7)

#### Checking for Null values

In [5]:
# checking for missing values
personality_data.isnull().sum()

type            0
is_Extrovert    0
is_Sensing      0
is_Thinking     0
is_Judging      0
posts           0
clean_posts     0
dtype: int64

There are no missing values present in this dataset.

### Sentiments Analysis Score

* CAUTION - Sentiment scoring will take LONG !!

In [6]:
# sentiment scoring for each user
t = time.time()

analyzer = SentimentIntensityAnalyzer()

nlp_sentiment_score = []

for post in personality_data["clean_posts"]:
    score = analyzer.polarity_scores(post)
    nlp_sentiment_score.append(score)

print(f"Sentiment Scoring Time: {time.time() - t:.2f} seconds")

Sentiment Scoring Time: 2853.37 seconds


In [7]:
# segregating the indiviual sentiment scores - compound, positive, negative and neutral
personality_data["compound_sentiment"] = [
    score["compound"] for score in nlp_sentiment_score
]
personality_data["pos_sentiment"] = [score["pos"] for score in nlp_sentiment_score]
personality_data["neg_sentiment"] = [score["neg"] for score in nlp_sentiment_score]
personality_data["neu_sentiment"] = [score["neu"] for score in nlp_sentiment_score]

In [8]:
# Sentiment scores have negative values that Naive Bayes can't handle. So scaling it.

min_max_scaler = MinMaxScaler()
personality_data["compound_sentiment"] = min_max_scaler.fit_transform(
    np.array(personality_data["compound_sentiment"]).reshape(-1, 1)
)
personality_data["pos_sentiment"] = min_max_scaler.fit_transform(
    np.array(personality_data["pos_sentiment"]).reshape(-1, 1)
)
personality_data["neg_sentiment"] = min_max_scaler.fit_transform(
    np.array(personality_data["neg_sentiment"]).reshape(-1, 1)
)
personality_data["neu_sentiment"] = min_max_scaler.fit_transform(
    np.array(personality_data["neu_sentiment"]).reshape(-1, 1)
)

In [9]:
# checking to see if sentiment scores introduced any null value
personality_data.isnull().sum()

type                  0
is_Extrovert          0
is_Sensing            0
is_Thinking           0
is_Judging            0
posts                 0
clean_posts           0
compound_sentiment    0
pos_sentiment         0
neg_sentiment         0
neu_sentiment         0
dtype: int64

### POS Tagging

In [10]:
# creating tag_posts column that will have each post as a separate list in a row. tag_posts will be a list of 50 lists.

# replacing urls with domain name
personality_data["tag_posts"] = personality_data["posts"].str.replace(
    re.compile(r"https?:\/\/(www)?.?([A-Za-z_0-9-]+)([\S])*"),
    lambda match: match.group(2),
    regex=True
)

# replacing ||| with space
personality_data["tag_posts"] = [
    post for post in personality_data["tag_posts"].str.split("\|\|\|")
]

* CAUTION - The next step i.e. Parts of speech tagging for each word will take SUPER LONG !!!

In [11]:
# parts of speech tagging for each word
t = time.time()

personality_data["tagged_words"] = personality_data["tag_posts"].apply(
    lambda x: [nltk.pos_tag(word_tokenize(line)) for line in x]
)

print(f"POS Tagging Time: {time.time() - t} seconds")

POS Tagging Time: 999.1841609477997 seconds


In [12]:
# creating list of unique POS tags
tag_set = set()

for i, data in personality_data["tagged_words"].items():
    for tup in data[0]:
        tag_set.add(tup[1])

tag_list = list(tag_set)

In [13]:
# calculating mean and standard deviation of pos tags for each user
t = time.time()


def pos_cat(x, tag):
    return [len([y for y in line if y[1] == tag]) for line in x]


for col in tag_list:
    personality_data["POS_" + col + "_mean"] = personality_data["tagged_words"].apply(
        lambda x: np.mean(pos_cat(x, col))
    )
    personality_data["POS_" + col + "_std"] = personality_data["tagged_words"].apply(
        lambda x: np.std(pos_cat(x, col))
    )

print(f"POS Stats Time: {time.time() - t} seconds")

POS Stats Time: 127.52453589439392 seconds


In [14]:
# grouping pos tags based on stanford list
tags_dict = {
    "ADJ": ["JJ", "JJR", "JJS"],
    "ADP": ["EX", "TO"],
    "ADV": ["RB", "RBR", "RBS", "WRB"],
    "CONJ": ["CC", "IN"],
    "DET": ["DT", "PDT", "WDT"],
    "NOUN": ["NN", "NNS", "NNP", "NNPS"],
    "NUM": ["CD"],
    "PRT": ["RP"],
    "PRON": ["PRP", "PRP$", "WP", "WP$"],
    "VERB": ["MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"],
    ".": ["#", "$", "''", "(", ")", ",", ".", ":"],
    "X": ["FW", "LS", "UH"],
}

In [15]:
# Stanford POS tag stats
t = time.time()


def stanford_tag(x, tag):
    tags_list = [len([y for y in line if y[1] in tags_dict[col]]) for line in x]
    return tags_list


for col in tags_dict.keys():
    personality_data[col + "_avg"] = personality_data["tagged_words"].apply(
        lambda x: np.median(stanford_tag(x, col))
    )

print(f"Stanford POS Stats Time: {time.time() - t} seconds")

  personality_data[col + "_avg"] = personality_data["tagged_words"].apply(
  personality_data[col + "_avg"] = personality_data["tagged_words"].apply(
  personality_data[col + "_avg"] = personality_data["tagged_words"].apply(
  personality_data[col + "_avg"] = personality_data["tagged_words"].apply(
  personality_data[col + "_avg"] = personality_data["tagged_words"].apply(
  personality_data[col + "_avg"] = personality_data["tagged_words"].apply(
  personality_data[col + "_avg"] = personality_data["tagged_words"].apply(


Stanford POS Stats Time: 33.8388512134552 seconds


  personality_data[col + "_avg"] = personality_data["tagged_words"].apply(


In [16]:
# a quick look at the data
personality_data.head(2)

Unnamed: 0,type,is_Extrovert,is_Sensing,is_Thinking,is_Judging,posts,clean_posts,compound_sentiment,pos_sentiment,neg_sentiment,...,ADV_avg,CONJ_avg,DET_avg,NOUN_avg,NUM_avg,PRT_avg,PRON_avg,VERB_avg,._avg,X_avg
0,INTJ,0,0,1,1,"'Don’t peg your rate low, peg it high. An appe...",peg rate low peg high appeal expertise want...,0.9999,0.332011,0.098166,...,2.0,3.0,2.0,6.0,0.0,0.0,3.0,6.0,4.0,0.0
1,INTP,0,0,1,0,'...you get Kid Rock.|||Now it is. Plural: 2sh...,get kid rock plural shy happened skim ...,0.99995,0.312169,0.088457,...,4.0,7.0,5.0,13.0,0.0,0.0,4.0,13.0,8.0,0.0


In [17]:
# Sentiment scoring & POS Tagging took long. So saving the scored & tagged file to save time in the next step.
personality_data.to_csv(os.path.join("..", "data", "clean_data_2.csv"), index=False)