# Load MBTI Dataset

You should extracted the zip file and have a csv file

In [1]:
filename = 'mbti_1.csv'
outfilename = 'mbti_preprocessed.csv'

In [2]:
import numpy as np
import pandas as pd
import re
import string

In [3]:
df_ = pd.read_csv(filename)

# Preview the rows
df_.head(10)

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...
5,INTJ,'18/37 @.@|||Science is not perfect. No scien...
6,INFJ,"'No, I can't draw on my own nails (haha). Thos..."
7,INTJ,'I tend to build up a collection of things on ...
8,INFJ,"I'm not sure, that's a good question. The dist..."
9,INTP,'https://www.youtube.com/watch?v=w8-egj0y8Qs||...


# Preprocess

## We need to split the posts on the ||| string and create a new row with the same type

In [4]:
newrows = []

def filter_text(post):
    """Decide whether or not we want to use the post."""
    # should remove link only posts here
    return len(post) > 0
    
reg_punc = re.compile('[%s]' % re.escape(string.punctuation))
def preprocess_text(post):
    """Remove any junk we don't want to use in the post."""
    
    # Remove links
    post = re.sub(r'http\S+', '', post, flags=re.MULTILINE)
    
    # All lowercase
    post  = post.lower()
    
    # Remove puncutation
    post = reg_punc.sub('', post)
    
    return post

def create_new_rows(row):
    posts = row['posts'].split('|||')
    rows = []
    
    for p in posts:
        p = preprocess_text(p)
        if not filter_text(p):
            continue
        rows.append({'type': row['type'], 'post': p})
    return rows

for index, row in df_.iterrows():
    newrows += create_new_rows(row)
    
df = pd.DataFrame(newrows)
unique = df.groupby('type').nunique()

In [5]:
print('{} rows'.format(df.shape[0]))

# Preview the data
df.head(10)

411495 rows


Unnamed: 0,post,type
0,enfp and intj moments sportscenter not top ...,INFJ
1,what has been the most lifechanging experience...,INFJ
2,on repeat for most of today,INFJ
3,may the perc experience immerse you,INFJ
4,the last thing my infj friend posted on his fa...,INFJ
5,hello enfj7 sorry to hear of your distress its...,INFJ
6,84389 84390,INFJ
7,welcome and stuff,INFJ
8,game set match,INFJ
9,prozac wellbrutin at least thirty minutes of m...,INFJ


In [6]:
unique.sort_values(by=['post'], ascending=False)

Unnamed: 0_level_0,post,type
type,Unnamed: 1_level_1,Unnamed: 2_level_1
INFP,85936,1
INFJ,69299,1
INTP,60845,1
INTJ,50518,1
ENTP,32731,1
ENFP,31794,1
ISTP,15809,1
ISFP,12289,1
ENTJ,10907,1
ISTJ,9559,1


# Save preprocessed data to csv

In [78]:
df.to_csv(outfilename)