In [1]:
import pandas as pd
import re
import os

In [2]:
#Variables
data_path = os.path.join('data')
img_path = os.path.join('static','images')
re_link = re.compile(r"(https?:?\/\/)?(www[1-3]?.?)?((([a-zA-Z0-9\-]*)\.?)*)")

In [3]:
def unique_words(s):
    unique = set(s.split(' ')) 
    return len(unique)

def emojis(post):
    # does not include emojis made purely from symbols, only :word:
    emoji_count = 0
    words = post.split()
    for e in words:
        if 'http' not in e:
            if e.count(':')==2:
                emoji_count+=1
    return emoji_count

def colons(post):
    # Includes colons used in emojis
    colon_count = 0
    words = post.split()
    for e in words:
        if 'http' not in e:
            colon_count+=e.count(':')
    return colon_count

In [4]:
#Pull in csv
mbti_user = pd.read_csv(os.path.join(data_path,'mbti_1.csv'))

## Split posts into individual rows

In [5]:
posts = mbti_user.apply(lambda x: pd.Series(x['posts'].split('|||')), axis=1).stack().reset_index(level=1, drop=True)
posts.name = 'post'
mbti_post = mbti_user[['type']].join(posts)
mbti_post['post'] = pd.Series(mbti_post['post'], dtype=object)
mbti_post = mbti_post.reset_index().rename(columns={'index':'id'})

## Cleaning Posts

In [6]:
#Replace links with domains
mbti_post['clean_post'] = mbti_post['post'].str.replace(
    re.compile(r"https?:\/\/(www)?.?([A-Za-z_0-9-]+).*"),
    lambda match: match.group(2)
    )

In [7]:
# replacing emails with space
mbti_post["clean_post"] = mbti_post["clean_post"].str.replace(
    re.compile(r"\S+@\S+"), " "
    )

In [8]:
# converting posts into lower case
mbti_post["clean_post"] = mbti_post["clean_post"].str.lower()

In [9]:
# replacing MBTIs with space
mbti = ["INFP", "INFJ", "INTP", "INTJ", "ENTP", "enfp", "ISTP", "ISFP", "ENTJ", "ISTJ", "ENFJ", "ISFJ", "ESTP", "ESFP", "ESFJ", "ESTJ"]

for type_word in mbti:
    mbti_post["clean_post"] = mbti_post["clean_post"].str.replace(
        type_word.lower(), " ")
#     personality_data["clean_posts"] = personality_data["clean_posts"].str.replace(
#         type_word, " ")

## Some counts

In [10]:
#Splitting out the different personality elements
mbti_post['is_extrovert'] = mbti_post['type'].apply(lambda s : 1 if s[0] == 'E' else 0)
mbti_post['is_sensor'] = mbti_post['type'].apply(lambda s : 1 if s[1] == 'S' else 0)
mbti_post['is_thinker'] = mbti_post['type'].apply(lambda s : 1 if s[2] == 'T' else 0)
mbti_post['is_judger'] = mbti_post['type'].apply(lambda s : 1 if s[3] == 'J' else 0)

In [11]:
#Counting grammer
mbti_post["upper"] = mbti_post["post"].apply(lambda x: len(
                        [x for x in x.split() if x.isupper()]))
mbti_post['char_count'] = mbti_post['clean_post'].apply(len)
mbti_post['qm'] = mbti_post['clean_post'].apply(lambda s : s.count('?'))
mbti_post['em'] = mbti_post['clean_post'].apply(lambda s : s.count('!'))
mbti_post['colons'] = mbti_post['clean_post'].apply(colons)
mbti_post['emojis'] = mbti_post['clean_post'].apply(emojis)
# counting ellipses per post 
mbti_post['ellipses'] = [len(re.findall(r'\.\.\.\ ', post)) for post in mbti_post['clean_post']]

In [12]:
#Counting words
mbti_post['unique_words'] = mbti_post['clean_post'].apply(unique_words)
mbti_post['word_count'] = mbti_post['clean_post'].apply(lambda s : s.count(' ')+1)

In [13]:
#Counting links
mbti_post['link_count'] = mbti_post['post'].apply(lambda s : s.count('http'))
mbti_post['youtube_link'] = mbti_post['post'].apply(lambda s : s.count('youtube') + s.count('youtu.be'))

# counting images per post 
mbti_post["img_count"] = [len(re.findall(r"(\.jpg)|(\.jpeg)|(\.gif)|(\.png)", post))
                  for post in mbti_post["post"]]

## Remove some rows

In [14]:
mbti_post.drop(mbti_post[mbti_post["word_count"] < 3].index, inplace=True)
mbti_post.reset_index(drop=True, inplace=True)
mbti_post.head(2)

Unnamed: 0,id,type,post,clean_post,is_extrovert,is_sensor,is_thinker,is_judger,upper,char_count,qm,em,colons,emojis,ellipses,unique_words,word_count,link_count,youtube_link,img_count
0,0,INFJ,enfp and intj moments https://www.youtube.com...,and moments youtube,0,0,0,1,0,24,0,0,0,0,0,4,8,2,2,0
1,0,INFJ,What has been the most life-changing experienc...,what has been the most life-changing experienc...,0,0,0,1,0,61,1,0,0,0,0,10,10,0,0,0


## Export data to csv

In [15]:
mbti_post.to_csv(os.path.join(data_path,'mbti_phase1.csv'))