In [295]:
# Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import matplotlib.pyplot as plt
import os

In [256]:
# Remove unwanted patterns
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chiehhsi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chiehhsi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [276]:
# Tweet Cleaner
stopwordlist = set(stopwords.words("english"))
wordnet_lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

def remove_stopwords(tweet):
    new_tweet = [w for w in tweet.split() if not w in stopwordlist] 
    return (" ").join(new_tweet) 

def remove_specialchar(tweet):
    #Converts HTML tags to the characters they represent
    soup = BeautifulSoup(tweet, "html.parser")
    tweet = soup.get_text()
    
    #Convert www.* or https?://* to empty strings
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','',tweet) 
    #Convert @username to empty strings
    tweet = re.sub('@[^\s]+','',tweet)

    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    tweet = re.sub('[\n]+', ' ', tweet)
    
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    
    #Trims the tweet
    tweet = tweet.strip('\'"')
    
    #Remove all char not alphabets, num or whitespaces
    tweet = re.sub('[^A-Za-z0-9 ]+','', tweet)
    
    return tweet


#Attempts to replace every individual word with it's root word.
def stemmer_lemmatizer(tweet):
    word_list = []
    for word in tweet.split():
        word = ps.stem(word)
        word = wordnet_lemmatizer.lemmatize(word)
        word_list.append(word)
    return (" ".join(word_list))

#for testing
#Tweets = df_oba['Tweet'].values
#for i in range(len(Tweets)):
#    print(i)
#    b = remove_specialchar(Tweets[i])
#    a = stemmer_lemmatizer(b)
#    print('last', a)
#    print('last', remove_stopwords(a))


In [318]:
def load_csv():
    xls = pd.ExcelFile('training-Obama-Romney-tweets.xlsx')
    df1 = pd.read_excel(xls, 'Obama')
    df2 = pd.read_excel(xls, 'Romney')
    return df1, df2

def data_parse(df):
    
    #drop missing values and rename columns
    df = df.iloc[1:, 3:5]
    df = df.rename(columns = {'Anootated tweet':'Tweet', 'Unnamed: 4': 'Class'})
    # Drop rows if Tweet is empty
    df.dropna(subset = ['Tweet'], inplace=True)
    print(df.shape)
    
    #dtype class label
    df['Class'] = df['Class'].astype(str)
    df['Tweet'] = df['Tweet'].astype(str)
    print(df.info())
    print(df['Class'].value_counts())

    # Extract rows where class labels -1, 0, 1
    df = df[ (df['Class'] == '0') |(df['Class'] == '-1') | (df['Class'] == '1') ]
    print('After extracting:', df.shape)
    
    tweets = df['Tweet'].values
    targets = df['Class'].values
    
    return tweets, targets

def tweet_cleaning(tweets):
    for i in range(len(tweets)):
        #print(i)
        #print('original:', tweets[i])
        tmp = remove_specialchar(tweets[i])
        tmp = stemmer_lemmatizer(tmp)
        tweets[i] = remove_stopwords(tmp)
        #print('final:', tweets[i])
    return tweets

In [319]:
def save_dataset(tweets, targets, tw_name, tar_name):
    print('Saving training dataset...')
    
    #Create Saving Files
    if not os.path.exists('TrainData'):
        os.makedirs('TrainData')
        
    np.save('TrainData/' + tw_name + '.npy', tweets)
    np.save('TrainData/' + tar_name + '.npy', targets)

    print('Saved parsed dataset')
    

In [320]:
if __name__ == "__main__": 
    df1, df2 = load_csv()

In [321]:
tweets_oba, targets_oba = data_parse(df1)
tweets_oba.shape, targets_oba.shape

(7196, 2)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7196 entries, 1 to 7198
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Tweet   7196 non-null   object
 1   Class   7196 non-null   object
dtypes: object(2)
memory usage: 168.7+ KB
None
0             1977
-1            1968
1             1679
2             1543
irrevelant      23
nan              5
irrelevant       1
Name: Class, dtype: int64
After extracting: (5624, 2)


((5624,), (5624,))

In [322]:
#print(tweets_oba)
tweets_oba = tweet_cleaning(tweets_oba)

In [323]:
tweets_oba

array(['kirkpatrick wore basebal cap embroid barack obama signatur hangdog look jason segel courier journal',
       'obama debat cracker cracker tonight I tune teamobama',
       'miss point Im afraid understand bigger pictur dont care obama elect',
       ...,
       'reason ann romney michel obama match last night michel obama ann romney show last nig',
       'obama kenakan cincin syahadat sejak sma',
       'bitch like obama3 bitch want food stamp lmao'], dtype=object)

In [324]:
save_dataset(tweets_oba, targets_oba, 'tweets_oba', 'targets_oba')

Saving training dataset...
Saved parsed dataset


In [325]:
tweets_rom, targets_rom = data_parse(df2)
tweets_rom.shape, targets_rom.shape

(7200, 2)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7200 entries, 1 to 7200
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Tweet   7200 non-null   object
 1   Class   7200 non-null   object
dtypes: object(2)
memory usage: 168.8+ KB
None
-1      2893
0       1680
2       1351
1       1075
!!!!     169
nan       29
IR         3
Name: Class, dtype: int64
After extracting: (5648, 2)


((5648,), (5648,))

In [326]:
print(tweets_rom)
tweets_rom = tweet_cleaning(tweets_rom)

["Insidious!<e>Mitt Romney</e>'s Bain Helped Philip Morris Get U.S. High Schoolers <a>Hooked On Cigarettes</a> http://t.co/nMKuFcUq via @HuffPostPol"
 '.@WardBrenda @shortwave8669 @allanbourdius you mean like <e>romney </e><a>cheated in primary</a>?'
 "<e>Mitt Romney</e> still doesn't <a>believe</a> that we <a>have a black president</a>."
 ...
 'el 59 por ciento de las mujeres blancas casadas respaldan a <e>Romney</e>"""'
 '"And they brought us a whole binder of women"""" oh <e>Romney</e>"""'
 '@FoxNews <e>Romney</e>won"""']


In [327]:
tweets_rom

array(['insidiousmitt romney bain help philip morri get US high schooler hook On cigarett via',
       'mean like romney cheat primari',
       'mitt romney still doesnt believ black presid', ...,
       'el 59 por ciento de la mujer blanca casada respaldan romney',
       'brought u whole binder woman oh romney', 'romneywon'],
      dtype=object)

In [328]:
save_dataset(tweets_rom, targets_rom, 'tweets_rom', 'targets_rom')

Saving training dataset...
Saved parsed dataset
