In [76]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Read data from Excel Sheet

In [77]:
df_obama = pd.read_excel('training-Obama-Romney-tweets.xlsx', sheet_name='Obama')
df_romney = pd.read_excel('training-Obama-Romney-tweets.xlsx', sheet_name='Romney')

In [78]:
df_obama.head()

Unnamed: 0.1,Unnamed: 0,date,time,Anootated tweet,Unnamed: 4,Unnamed: 5
0,,,,"1: positive, -1: negative, 0: neutral, 2: mixed",Class,Your class
1,,2012-10-16 00:00:00,10:28:53-05:00,"Kirkpatrick, who wore a baseball cap embroider...",0,
2,,2016-12-10 00:00:00,10:09:00-05:00,Question: If <e>Romney</e> and <e>Obama</e> ha...,2,
3,,2012-10-16 00:00:00,10:04:30-05:00,#<e>obama</e> debates that Cracker Ass Cracker...,1,
4,,2012-10-16 00:00:00,10:00:36-05:00,RT @davewiner Slate: Blame <e>Obama</e> for fo...,2,


In [79]:
df_romney.head()

Unnamed: 0.1,Unnamed: 0,date,time,Anootated tweet,Unnamed: 4,Unnamed: 5
0,,,,"1: positive, -1: negative, 0: neutral, 2: mixed",Class,Your class label
1,,2012-10-16 00:00:00,09:38:08-05:00,Insidious!<e>Mitt Romney</e>'s Bain Helped Phi...,-1,
2,,2012-10-16 00:00:00,10:22:34-05:00,Senior <e>Romney</e> Advisor Claims <e>Obama</...,2,
3,,2012-10-16 00:00:00,10:14:18-05:00,.@WardBrenda @shortwave8669 @allanbourdius you...,-1,
4,,2012-10-16 00:00:00,09:27:16-05:00,<e>Mitt Romney</e> still doesn't <a>believe</a...,-1,


# Data Cleaning

In [80]:
def clean_data(df):
    df = df.drop(['Unnamed: 0', 'date', 'time', 'Unnamed: 5'], axis=1)
    df = df.rename(columns = {'Unnamed: 4': 'class', 'Anootated tweet': 'tweets'})
    df = df.dropna()
    df = df[~df['class'].isin(['irrevelant', 'irrelevant', 'IR', 'Class', '!!!!', 2])]
    df['class'] = df['class'].astype(int)
    df = df.drop(df[df['class'] == 2].index)
    df = df.reset_index(drop=True)

    return df

df_obama = clean_data(df_obama)
df_romney = clean_data(df_romney)

def clean_tweets(text):
    text = text.lower()
    text = re.sub('<[^>]+>', '', text) # remove HTML tags
    text = re.sub('@[^\s]+','',text) # remove usernames
    text = re.sub('[<>!()_*/~"|@$#:.,%\?-]+', '', text) # remove punctuation and special characters
    text = re.sub('http\S+', '', text) # remove URLs
    text = text.encode('ascii', 'ignore').decode('ascii') # remove non-ascii characters
    text = re.sub(' +', ' ', text).strip() # remove extra spaces
    if text == '':
        return None

    return text

df_obama['tweets'] = df_obama['tweets'].apply(clean_tweets)
df_romney['tweets'] = df_romney['tweets'].apply(clean_tweets)

df_obama = df_obama.dropna()
df_romney = df_romney.dropna()

def tokenize(tweet):
    ps = PorterStemmer()

    tokens = word_tokenize(tweet) # tokenize words
    cleaned_tokens = [ word for word in tokens if word not in stopwords.words('english')] # remove stopwords
    stemmed_tokens = [ ps.stem(word) for word in cleaned_tokens] # stem words
    new_tweet = ' '.join(stemmed_tokens) # join words back into a string
    if new_tweet == '':
        return None

    return new_tweet

df_obama['tweets'] = df_obama['tweets'].apply(tokenize)
df_romney['tweets'] = df_romney['tweets'].apply(tokenize)

df_obama = df_obama.dropna()
df_romney = df_romney.dropna()

In [81]:
df_obama.to_csv('data/obama_cleaned.csv', index=False)
df_romney.to_csv('data/romney_cleaned.csv', index=False)

In [82]:
df_obama.head()

Unnamed: 0,tweets,class
0,kirkpatrick wore basebal cap embroid barack ob...,0
1,obama debat cracker ass cracker tonight tune t...,1
2,your miss point im afraid understand bigger pi...,0
3,rais democrat left parti year ago 1980 lifetim...,-1
4,obama camp ca n't afford lower expect tonight ...,0


In [83]:
df_romney.head()

Unnamed: 0,tweets,class
0,insidiousmitt romney 's bain help philip morri...,-1
1,mean like romney cheat primari,-1
2,mitt romney still n't believ black presid,-1
3,romney 's tax plan deserv 2nd look secret one ...,-1
4,hope romney debat prep w peopl last time,1
