In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import itertools
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

import warnings
warnings.filterwarnings("ignore")


In [None]:
# Read both raw CSV files from the working directory in one pass.
# dataset1 supplies the 'type' labels used below; dataset2 supplies the
# 'subreddit' / 'body' columns.
dataset1, dataset2 = (
    pd.read_csv(csv_name) for csv_name in ("mbti_1.csv", "mbti_full_pull.csv")
)

In [None]:
# Print the structural summary (columns, non-null counts, dtypes) of dataset1.
dataset1.info()

In [None]:
# Print the structural summary (columns, non-null counts, dtypes) of dataset2.
dataset2.info()

In [None]:
# Distinct labels of dataset1's 'type' column, in order of first appearance.
valid_personalities = dataset1['type'].unique().tolist()
valid_personalities

In [None]:
# Accepted spellings of every label: as found in dataset1, all lower-case,
# and capitalised (first letter upper, rest lower) - in that order.
to_search = [
    spelling(label)
    for spelling in (str, str.lower, str.capitalize)
    for label in valid_personalities
]

In [None]:
# Show every spelling that will be matched against dataset2's 'subreddit' column.
print(to_search)

In [None]:
# Keep only the dataset2 rows whose subreddit is one of the accepted spellings.
in_mbti_subreddit = dataset2['subreddit'].isin(to_search)
df = dataset2[in_mbti_subreddit]

In [None]:
# Summarise the filtered frame, then keep just the text and its label.
df.info()
# Take an explicit copy: the following cells assign into this frame, and
# assigning into a slice of `dataset2` is chained assignment (the resulting
# SettingWithCopyWarning is hidden by the global warnings filter).
df = df[['body', 'subreddit']].copy()

In [None]:
# Normalise every subreddit name to upper case so all spellings of a label agree.
df['subreddit'] = df['subreddit'].map(str.upper)

In [None]:
# Distinct subreddit labels remaining after normalisation.
df['subreddit'].unique().tolist()

In [None]:
# Preview the first rows of the filtered body/subreddit frame.
df.head()

In [None]:
# Per-subreddit count of non-null values; the 'body' column gives the bar heights.
total = df.groupby('subreddit').count()

# One bar per subreddit label.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(total.index.to_numpy(), total['body'])
ax.set_xlabel('Personality types', size=14)
ax.set_ylabel('Number of posts available', size=14)
ax.set_title('Total posts for each personality type')

In [None]:
# Expand dataset1 so that every individual post becomes its own row.
# The second column of dataset1 holds several posts joined by '|||'; each piece
# is paired with that row's 'type'.
#
# The records are collected in a list and the frame is built once:
# DataFrame.append was removed in pandas 2.0, and appending row by row copies
# the whole frame on every iteration (quadratic time).
# Iterating the columns directly also avoids mixing index labels with
# positional `iloc` lookups.
records = [
    {'type': mbti_type, 'post': post}
    for mbti_type, joined_posts in zip(dataset1['type'], dataset1.iloc[:, 1])
    for post in joined_posts.split('|||')
]
df2 = pd.DataFrame(records, columns=['type', 'post'])

In [None]:
# Give the subreddit frame the same column names as df2, then stack df2 below it.
reddit_posts = df.rename(columns={"body": "post", "subreddit": "type"})
df = pd.concat([reddit_posts, df2])

In [None]:
import re

def preprocess_text(df, remove_special=True):
    """Return a copy of ``df`` whose 'post' column has been cleaned for modelling.

    Every post is converted with ``str()`` and then processed in this order:

    1. '|' characters become spaces and http(s) URLs are removed.
    2. '.', '?' and '!' are replaced by the placeholder words ``EOSTokenDot``,
       ``EOSTokenQuest`` and ``EOSTokenExs`` so that sentence ends survive the
       punctuation stripping (after step 4 they read e.g. ``eostokendot``).
    3. Every character that is not an ASCII letter or whitespace is removed.
    4. The text is lower-cased.
    5. Words containing the same letter three or more times in a row are removed.
    6. If ``remove_special`` is true, the 16 MBTI type codes are removed, so the
       label cannot leak into the features.
    7. Words of 1-3 characters and words of 30 or more characters are removed.

    Whitespace is not normalised; runs of spaces may remain where text was removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'post' column. It is left unmodified.
    remove_special : bool, default True
        Whether to strip the MBTI type codes (step 6).

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df`` and the cleaned
        text in 'post'.
    """
    # Match the whole URL up to the next whitespace. The previous pattern ended
    # in the character class [\s+], which also stopped at a literal '+'.
    url_re = re.compile(r'https?://\S+')
    eos_tokens = (
        ('.', ' EOSTokenDot '),
        ('?', ' EOSTokenQuest '),
        ('!', ' EOSTokenExs '),
    )
    non_letter_re = re.compile(r'[^a-zA-Z\s]')
    # Only the offending word is removed. The previous pattern continued with
    # [\s|\w]*, which consumed everything up to the end of the post.
    stretched_word_re = re.compile(r'\b\w*([a-z])\1{2,}\w*\b')
    short_word_re = re.compile(r'\b\w{1,3}\b')
    long_word_re = re.compile(r'\b\w{30,}\b')

    mbti_re = None
    if remove_special:
        pers_types = ['INFP', 'INFJ', 'INTP', 'INTJ', 'ENTP', 'ENFP', 'ISTP', 'ISFP',
                      'ENTJ', 'ISTJ', 'ENFJ', 'ISFJ', 'ESTP', 'ESFP', 'ESFJ', 'ESTJ']
        # Lower-case codes because they are matched after the text is lower-cased.
        mbti_re = re.compile("|".join(p.lower() for p in pers_types))

    def clean(raw_post):
        """Apply the cleaning steps described above to a single post."""
        text = url_re.sub('', str(raw_post).replace("|", " "))
        for mark, token in eos_tokens:
            text = text.replace(mark, token)
        text = non_letter_re.sub('', text).lower()
        text = stretched_word_re.sub('', text)
        if mbti_re is not None:
            # Runs before the short-word filter so that leftovers such as the
            # 's' of 'intjs' are swept away by it.
            text = mbti_re.sub('', text)
        text = short_word_re.sub('', text)
        return long_word_re.sub('', text)

    # assign() builds a new frame, so the caller's data stays intact and
    # re-running the cell is idempotent.
    return df.assign(post=df['post'].map(clean))

# Run the text-cleaning pipeline over the combined posts
# (remove_special is left at its default, True).
new_df = preprocess_text(df)

In [None]:
# info() writes its report itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
new_df.info()
new_df.head()

In [None]:
# Persist the cleaned posts to 'mbti_cleaned.csv' in the working directory.
new_df.to_csv('mbti_cleaned.csv')

In [None]:
# Build a class-balanced subset: the same number of posts for every type.
# NOTE(review): 1907 is presumably the size of the smallest class - confirm.
# sample() raises ValueError if any type has fewer rows than this.
POSTS_PER_TYPE = 1907
# Fixed seed so that Restart & Run All writes the same subset every time.
SAMPLE_SEED = 42

# GroupBy.sample draws per group and keeps the 'type' column, without relying
# on groupby.apply's handling of the grouping column.
short = (
    new_df.groupby('type')
    .sample(n=POSTS_PER_TYPE, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)
short.to_csv('mbti_short.csv')