In [1]:
#Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
#import libraries
import pandas as pd
import numpy
import matplotlib
import collections
import re
import string
from stop_words import get_stop_words
import nltk
from nltk.corpus import stopwords

In [3]:
#import data and check if data were loaded correctly
df = pd.read_csv('/home/daria/Documents/personality/mbti_1.csv')
df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [4]:
#Unique values of response varible
collections.Counter(df.type)

Counter({'INFJ': 1470,
         'ENTP': 685,
         'INTP': 1304,
         'INTJ': 1091,
         'ENTJ': 231,
         'ENFJ': 190,
         'INFP': 1832,
         'ENFP': 675,
         'ISFP': 271,
         'ISTP': 337,
         'ISFJ': 166,
         'ISTJ': 205,
         'ESTP': 89,
         'ESFP': 48,
         'ESTJ': 39,
         'ESFJ': 42})

In [5]:
#Preprocess dataset and split each post into separately record
data = pd.DataFrame(columns = ['type', 'posts'])

for i in range(df.shape[0]):
    sent = pd.DataFrame(df.posts[i].split('|||'), columns = ['posts'])
    sent['type'] = df.type[i]
    sent.index.name = None
    
    data = data.append(sent)

In [6]:
#Check it
data.head()

Unnamed: 0,posts,type
0,'http://www.youtube.com/watch?v=qsXHcwe3krw,INFJ
1,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...,INFJ
2,enfp and intj moments https://www.youtube.com...,INFJ
3,What has been the most life-changing experienc...,INFJ
4,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...,INFJ


In [7]:
#Posts with https links
links = []
for i in range(0, data.shape[0]):
    if re.findall(r'http+', data.posts.iloc[i]) == ['http']:
        links.append(data.posts.iloc[i])

In [8]:
#Percent of posts which contain 'http'
numpy.round(len(links)/len(data)*100,2)

5.36

In [9]:
#Youtube label
data['youtube'] = 'no'
for i in range(0, data.shape[0]):
    if re.findall(r'http+', data.posts.iloc[i]) == ['http']:
        data['youtube'].iloc[i] = 'yes'

In [10]:
#Remove youtube links
data['posts'] = [re.sub('http\S+', '', data.posts.iloc[i]) for i in range(0, len(data))]

In [11]:
data.head()

Unnamed: 0,posts,type,youtube
0,',INFJ,yes
1,,INFJ,yes
2,enfp and intj moments sportscenter not top ...,INFJ,no
3,What has been the most life-changing experienc...,INFJ,no
4,On repeat for most of today.,INFJ,no


In [12]:
#Recognize emojis, firstly replace ')' to '#' 
data.posts = [re.sub(r'\)', '#', s) for s in data.posts]
data.posts = [re.sub(r'\)', '#', s) for s in data.posts]

In [13]:
list_emojis = [':d',':D',':#','xd','XD',';d',';D',';#','=#','=d','=D',':p',';p','=p',
              ':P',';P','=P']

In [14]:
for e in list_emojis:
    data.posts =[re.sub(e, ' emoji ', s) for s in data.posts]

In [15]:
#Replace interpunctions !,?, ..., # to names of these intrepunctions

In [16]:
list_marks = [r'\?', r'\!', r'\...', r'\#']
marks_words = ['question ', ' exclamation ', ' lasting ', ' hashtag ']

In [17]:
dict_marks = dict(zip(list_marks, marks_words))

In [18]:
for m in dict_marks.keys():
    data.posts =[re.sub(m, dict_marks[m], s) for s in data.posts]

In [19]:
#Remove other intepunctions
remove_punct_map = dict.fromkeys(map(ord, string.punctuation))
data.posts = [s.translate(remove_punct_map) for s in data.posts]

In [20]:
#Remove stop words
nltk.download('stopwords')
stop = stopwords.words('english')
data['posts'] = data['posts'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

[nltk_data] Downloading package stopwords to /home/daria/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
#Remove digits
remove_digit_map = dict.fromkeys(map(ord, string.digits))
data.posts = [s.translate(remove_digit_map) for s in data.posts]

In [22]:
# Transform characters to lower case
data.posts = [s.lower() for s in data.posts] 