# Data Cleaning / Feature Engineering

By Ben Khoung

In [1]:
from nltk.stem.porter import *
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
import pandas as pd
import re
import string 

## The Data

In [3]:
mbti = pd.read_csv('data/mbti_1.csv') # Original Dataframe
mbti_clean = mbti.copy() # Dataframe to be cleaned 
mbti_features = mbti.copy() # Dataframe to be populated with features
mbti_features.drop(columns=['posts'],inplace=True)

## General Cleaning

**Removing Links:**

Remove all triple pipes because some links end at the end of a post which making it hard to find with regex. 

In [4]:
# For each user, remove ||| to make it easier to remove links. 
user_posts = [re.sub(r'\|\|\|',' ',posts) for posts in mbti['posts']]
# Remove all links. 
linkless_posts = [re.sub(r'http[\S]* ','',posts) for posts in user_posts]
# Update mbti_clean 
mbti_clean['posts'] = linkless_posts

**Lower Case**:

In [5]:
# Set all user posts to lowercase 
mbti_clean['posts'] = mbti_clean['posts'].str.lower()

## Word Count 

Strip all punctuations and numbers:

In [6]:
# Strip all punctuation. Returns a list of strings 
posts_without_punct = [re.sub(r'[^a-z\s]','',posts) for posts in mbti_clean['posts']]
# Split each element in the list into a list of words. Returns a list of lists 
words = [posts.split() for posts in posts_without_punct]

Stem words, remove stop words, and remove words less than 3 characters in length.

**Caution:** Takes a few minutes to run

In [7]:
## CAUTION: STEMMING WORDS TAKES A FEW MINUTES. UNCOMMENT TO RE-RUN
# Stem words, remove stop words and words less than 3 characters
stemmer = PorterStemmer()
for row in range(len(words)):
    bar.next()
    words[row] = " ".join([stemmer.stem(word) for word in words[row] if word not in list(stop_words.ENGLISH_STOP_WORDS) and len(word) >= 3])


Create a bag of words representation of each user 

In [8]:
# Chose CountVectorizer with min_df = 25 because there were some odd cases like 'aaaaa' 
vectorizer = CountVectorizer(min_df=25)
word_count = vectorizer.fit_transform(words)
# Create word_count dataframe 
word_count_df = pd.DataFrame(data = word_count.toarray(), columns = vectorizer.get_feature_names())
word_count_df.head()

Unnamed: 0,aback,abandon,abbey,abbrevi,abhor,abid,abil,abit,abl,abnorm,...,yup,zealand,zelda,zen,zero,zodiac,zombi,zone,zoo,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,2,0,1,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Sentiment Score 

**Caution:** Takes a few minutes to run

In [9]:
scores = []
for i in range(len(posts_without_punct)):
    score = analyzer.polarity_scores(posts_without_punct[i])['compound']
    scores.append(score)
    # Print Progress 
    if i%200 == 0:
        print(i)

NameError: name 'posts_no_punctuation' is not defined

In [None]:
pd.Series(scores).describe()

## Ellipses Count

Counting number of ellipsies used per user.

In [None]:
# Create a list of ellpsies count per user. This is an indicator for long posts 
ellipses_count = [len(re.findall(r'\.\.\.',posts)) for posts in mbti_clean['posts']]
# Append to dataset 
mbti_features['ellipses'] = ellipses_count

## Exclamation Count
Counting number of exclamation marks used per user. 

In [None]:
# Create a list of exclamation count per user. 
exclamation_count = [len(re.findall(r'!',posts)) for posts in mbti_clean['posts']]
# Append to features dataframe
mbti_features['exclamation'] = exclamation_count

## Question Count
Counting number of question marks used per user. 

In [None]:
# Create a list of question count per user. 
question_count = [len(re.findall(r'\?',posts)) for posts in mbti_clean['posts']]
# Append to features dataframe
mbti_features['question'] = question_count

## Link Count 
Counting number of links used per user. 

In [None]:
# For each user, remove ||| to make it easier to find links. 
user_posts = [re.sub(r'\|\|\|',' ',posts) for posts in mbti['posts']]
# Create a list of link count per user. 
link_count = [len(re.findall(r'http[\S]* ', posts)) for posts in user_posts]
# Append to features dataframe
mbti_features['links'] = link_count

## Additional Features to Consider:

* Sentence Length
* N grams (or is this more of a step tuning with count vectorizer? 