In [1]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Read data from Excel Sheet

In [2]:
# Load the unlabeled test tweets for each candidate. Both sheets are read the
# same way: no header row, the first spreadsheet column becomes the index, the
# remaining column is named 'tweets' and typed as pandas strings, and pandas'
# default NA-marker parsing is switched off.
df_obama = pd.read_excel(
    'data/final-testData-no-label-Obama-tweets.xlsx',
    sheet_name='Obama',
    header=None,
    index_col=0,
    keep_default_na=False,
    names=['tweets'],
    dtype='string',
)
df_romney = pd.read_excel(
    'data/final-testData-no-label-Romney-tweets.xlsx',
    sheet_name='Romney',
    header=None,
    index_col=0,
    keep_default_na=False,
    names=['tweets'],
    dtype='string',
)

In [3]:
# Preview the first rows of the Obama tweets as loaded from the spreadsheet.
df_obama.head()

Unnamed: 0,tweets
1,<e>Obama</e> has to maintain his professionali...
2,<e>Obama</e> went into the debate swinging and...
3,Ditto. I started @247LS 4 years ago. RT @bmorr...
4,I absolutely love <e>Obama</e>'s view in <a>im...
5,I'm agreeing completely with <e>Obama</e>'s st...


In [22]:
# Preview the first rows of the Romney tweets as loaded from the spreadsheet.
df_romney.head()

Unnamed: 0,tweets
1,<e>Romney</e> got 3 less minutes and had to de...
2,<e>Mitt </e>is beating him UP! on his record...
3,I actually like <e>Romney </e>'s response to ...
4,Just for that <a>immigration statement </a>tha...
5,This man <e>Romney </e>is tearing this dude ...


# Data Cleaning

In [4]:
def clean_data(df):
    """Reduce a labeled tweet sheet to clean ``tweets`` / ``class`` columns.

    The input is expected to carry the raw spreadsheet columns
    ``'Unnamed: 0'``, ``'date'``, ``'time'``, ``'Anootated tweet'``,
    ``'Unnamed: 4'`` and ``'Unnamed: 5'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw labeled sheet. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns ``tweets`` and ``class`` (integer), rows with
        missing values removed, non-label markers and class ``2`` removed, and
        a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If any of the columns to drop is missing from ``df``.
    ValueError
        If a remaining ``class`` value cannot be converted to ``int``.
    """
    # Values in the label column that are not usable class labels.
    excluded_labels = ['irrevelant', 'irrelevant', 'IR', 'Class', '!!!!', 2]

    cleaned = (
        df.drop(['Unnamed: 0', 'date', 'time', 'Unnamed: 5'], axis=1)
          .rename(columns={'Unnamed: 4': 'class', 'Anootated tweet': 'tweets'})
          .dropna()
    )

    # Take an explicit copy of the filtered rows so the column assignment below
    # writes to an independent frame instead of a slice of another one.
    cleaned = cleaned[~cleaned['class'].isin(excluded_labels)].copy()
    cleaned['class'] = cleaned['class'].astype(int)

    # Labels stored as the string '2' only become comparable after the int cast,
    # so class 2 is filtered a second time here. A boolean mask is used rather
    # than dropping by index label, which would also remove unrelated rows that
    # happen to share a label when the index contains duplicates.
    cleaned = cleaned[cleaned['class'] != 2]

    return cleaned.reset_index(drop=True)

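# clean_data is left disabled for these files: they are loaded above with a single
# 'tweets' column, so the columns clean_data drops and renames are not present.
# df_obama = clean_data(df_obama)
# df_romney = clean_data(df_romney)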

def clean_tweets(text):
    """Normalize one raw tweet into lowercase plain text.

    Steps, in order: lowercase; remove angle-bracket tags such as
    ``<e>``/``</e>`` (the text between the tags is kept); remove ``@username``
    mentions; remove punctuation and special characters; remove URLs; collapse
    runs of spaces and trim the ends.

    Parameters
    ----------
    text : str or None
        The raw tweet. Non-string values (``None``, ``pd.NA``, ``NaN``) are
        treated as missing.

    Returns
    -------
    str or None
        The cleaned tweet, or ``None`` when the input is missing or nothing is
        left after cleaning.
    """
    if not isinstance(text, str):
        # Missing cells have nothing to clean; report them like an empty tweet.
        return None

    text = text.lower()
    text = re.sub(r'<[^>]+>', '', text)  # remove HTML-style tags
    text = re.sub(r'@[^\s]+', '', text)  # remove usernames
    text = re.sub(r'[<>!()_*/~"|@$#:.,%\?-]+', '', text)  # remove punctuation and special characters
    # URLs are removed after punctuation on purpose: by now ':', '/' and '.'
    # are gone, so 'http://t.co/abc' has collapsed into the single token
    # 'httptcoabc', which this pattern still matches.
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r' +', ' ', text).strip()  # remove extra spaces

    # An empty result is reported as None rather than ''.
    return text or None

# Run the text cleaner over the 'tweets' column of both candidate frames,
# overwriting the raw text in place.
for candidate_df in (df_obama, df_romney):
    candidate_df['tweets'] = candidate_df['tweets'].apply(clean_tweets)

def tokenize(tweet):
    """Tokenize a cleaned tweet, drop English stopwords and stem what is left.

    Parameters
    ----------
    tweet : str or None
        A cleaned tweet. Non-string values (such as the ``None`` that
        ``clean_tweets`` returns for an empty tweet) are treated as missing.

    Returns
    -------
    str or None
        The stemmed tokens joined by single spaces, or ``None`` when the input
        is missing or no tokens survive stopword removal.
    """
    if not isinstance(tweet, str):
        return None

    stemmer = PorterStemmer()
    # Build the stopword set once per call instead of re-evaluating
    # stopwords.words('english') for every token; a set also makes each
    # membership check constant-time.
    stop_words = set(stopwords.words('english'))

    stemmed_tokens = [
        stemmer.stem(word)
        for word in word_tokenize(tweet)  # tokenize words
        if word not in stop_words  # remove stopwords
    ]
    new_tweet = ' '.join(stemmed_tokens)  # join words back into a string

    # An empty result is reported as None rather than ''.
    return new_tweet or None

# df_obama['tweets'] = df_obama['tweets'].apply(tokenize)
# df_romney['tweets'] = df_romney['tweets'].apply(tokenize)

In [5]:
# Write each cleaned frame to its own CSV file under data/.
for cleaned_df, csv_path in (
    (df_obama, 'data/test_obama_cleaned.csv'),
    (df_romney, 'data/test_romney_cleaned.csv'),
):
    cleaned_df.to_csv(csv_path)

In [25]:
# Preview the first rows of the Obama tweets after clean_tweets has been applied.
df_obama.head()

Unnamed: 0,tweets
1,obama has to maintain his professionalism thro...
2,obama went into the debate swinging and came o...
3,ditto i started 4 years ago rt i work for a sm...
4,i absolutely love obama's view in immigration ...
5,i'm agreeing completely with obama's stance on...


In [26]:
# Preview the first rows of the Romney tweets after clean_tweets has been applied.
df_romney.head()

Unnamed: 0,tweets
1,romney got 3 less minutes and had to debate ca...
2,mitt is beating him up on his record on credib...
3,i actually like romney 's response to immigration
4,just for that immigration statement that romne...
5,this man romney is tearing this dude up on eco...
