In [165]:
# Import Libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re

In [166]:
# Import data 
pd.set_option('display.max_colwidth', None)
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']

df = pd.read_csv('../Data/Tweets.csv', encoding = 'latin1', header=None, names=column_names)

In [167]:
# Drop useless columns
drop_columns = ['ids', 'date', 'flag', 'user']

df.drop(columns=drop_columns, axis=1, inplace=True)

In [168]:
# Convert target to a binary classifier (-1 or 1)
df['target'].replace({0:-1,4:1},inplace=True)

In [169]:
# Remove urls
url_pattern = re.compile(r'https?://\S+|www\.\S+|\b\w+\.com\b|\b\w+\.org\b')
df['text'] = df['text'].apply(lambda x: url_pattern.sub('', x))

In [170]:
# Remove usernames
username_pattern = re.compile(r'@\w+')
df['text'] = df['text'].apply(lambda x: username_pattern.sub('', x))

In [171]:
# Create a copy to perform further processing
df_raw = df.copy(deep=True)

In [172]:
# Remove stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [173]:
# Remove empty tweets
df['text'] = df['text'].apply(lambda x: x.strip())

print("Empty tweets before: ", len(df[df["text"]==""]))
df.drop(df[df['text']==""].index, inplace=True)
print("Empty tweets after: ", len(df[df["text"]==""]))

Empty tweets before:  4544
Empty tweets after:  0


In [174]:
# Partition data
df_train = df.sample(n=10000, random_state=17)
df_test = df.sample(n=5000, random_state=37)
df_validate = df.sample(n=2000, random_state=71)

df_raw_train = df_raw.iloc[df_train.index]
df_raw_test = df_raw.iloc[df_test.index]
df_raw_validate = df_raw.iloc[df_validate.index]

In [176]:
# Functions for lemmatization and stemming         
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize(sentence):
    ps = PorterStemmer()
    wnl = WordNetLemmatizer()

    words = sentence.split()
    result = ' '.join([(wnl.lemmatize(w, get_wordnet_pos(w))) for w in words])

    return result

In [179]:
# Lemmatize the tweets
df_train['text'] = df_train['text'].apply(lemmatize)
df_test['text'] = df_test['text'].apply(lemmatize)
df_validate['text'] = df_validate['text'].apply(lemmatize)

In [180]:
# Analyze target distribution
print("Count:\t\t  -1   1")
print("df_train \t", len(df_train[df_train["target"]==-1]), len(df_train[df_train["target"]==1]))
print("df_raw_train \t", len(df_raw_train[df_raw_train["target"]==-1]), len(df_raw_train[df_raw_train["target"]==1]))
print("df_test \t", len(df_test[df_test["target"]==-1]), len(df_test[df_test["target"]==1]))
print("df_raw_test \t", len(df_raw_test[df_raw_test["target"]==-1]), len(df_raw_test[df_raw_test["target"]==1]))
print("df_validate \t", len(df_validate[df_validate["target"]==-1]), len(df_validate[df_validate["target"]==1]))
print("df_raw_validate ", len(df_raw_validate[df_raw_validate["target"]==-1]), len(df_raw_validate[df_raw_validate["target"]==1]))

Count:		  -1   1
df_train 	 4939 5061
df_raw_train 	 4939 5061
df_test 	 2472 2528
df_raw_test 	 2472 2528
df_validate 	 947 1053
df_raw_validate  947 1053


In [184]:
# Output final CSv files
df_train.to_csv("../Data/data_processed_train.csv", index=False)
df_test.to_csv("../Data/data_processed_test.csv", index=False)
df_validate.to_csv("../Data/data_processed_validate.csv", index=False)

df_raw_train.to_csv("../Data/data_unprocessed_train.csv", index=False)
df_raw_test.to_csv("../Data/data_unprocessed_test.csv", index=False)
df_raw_validate.to_csv("../Data/data_unprocessed_validate.csv", index=False)