In [2]:
import pandas as pd
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import os

# Set built from NLTK's `words` corpus word list.
# NOTE(review): not referenced anywhere else in the visible part of this file.
words = set(nltk.corpus.words.words())
# Placeholder for the stop-word collection read by remove_stopwords(); it is
# rebound later in the file to the result of create_custom_stopwords().
stop_list = []

# create a list of stop_words
def create_custom_stopwords():
    """Return the NLTK English stop-word list as a set.

    The value is returned to the caller; this function does not modify the
    module-level ``stop_list`` itself.
    """
    return set(stopwords.words('english'))

# remove punctuations from tweet
# Runs of "noise" that are replaced by ONE space: brackets and a few symbols
# ([ ] & $ % ( ) : /), the pair ".,", two or more dots, hyphens, repeated "!!"
# and any non-ASCII character.
_PUNCT_NOISE_RE = re.compile(
    r"([\[\]\&\$\%\(\):\/]|(\.,)|(\.\.)\.*|(\-)|(\!\!)+|[^\x00-\x7F])+"
)
# Translation table that deletes every character listed in string.punctuation.
_PUNCT_DELETE_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuation(data):
    """Strip punctuation and non-ASCII characters from a tweet.

    Two passes are applied:

    1. Every run matched by ``_PUNCT_NOISE_RE`` is replaced by a single
       space, so that words separated only by such noise stay separated
       (``"hello...world"`` -> ``"hello world"``).
    2. Any remaining ``string.punctuation`` character is deleted outright
       (``"a,b"`` -> ``"ab"``).

    The pattern is applied to the whole string. Iterating the string one
    character at a time (as was done previously) can never match the
    multi-character alternatives such as ``..`` or ``!!``.

    Args:
        data: The tweet text as ``str``, or an iterable of ``str`` chunks
            that are cleaned individually and concatenated.

    Returns:
        The cleaned text as a single ``str``.
    """
    # A bare str is treated as one chunk instead of being split into chars.
    chunks = [data] if isinstance(data, str) else data
    spaced = "".join(_PUNCT_NOISE_RE.sub(" ", chunk) for chunk in chunks)
    return spaced.translate(_PUNCT_DELETE_TABLE)

# remove numbers from tweet
# One or more consecutive digits.
_DIGIT_RUN_RE = re.compile(r"\d+")


def remove_numbers(data):
    """Replace every run of digits in a tweet with a single space.

    The pattern is applied to the whole string, so ``"abc123def"`` becomes
    ``"abc def"``. Text without digits is returned unchanged.

    Args:
        data: The tweet text as ``str``, or an iterable of ``str`` chunks
            that are processed individually and concatenated.

    Returns:
        The text with digit runs replaced, as a single ``str``.
    """
    # A bare str is treated as one chunk instead of being split into chars,
    # which would defeat the "+" quantifier and cost one regex call per char.
    chunks = [data] if isinstance(data, str) else data
    return "".join(_DIGIT_RUN_RE.sub(" ", chunk) for chunk in chunks)

# remove stopwords from tweet
def remove_stopwords(data):
    """Drop stop words from a whitespace-separated tweet.

    A token is dropped when its lowercase form is in the module-level
    ``stop_list``; surviving tokens keep their original casing and are
    re-joined with single spaces.
    """
    kept = []
    for token in data.split():
        if token.lower() in stop_list:
            continue
        kept.append(token)
    return " ".join(kept)

# apply stemming on tweet to get root words
def stemming(data):
    """Porter-stem each whitespace-separated token of ``data``.

    Returns the stemmed tokens joined by single spaces.
    """
    porter = PorterStemmer()
    stems = []
    for token in data.split():
        stems.append(porter.stem(token))
    return " ".join(stems)

# apply lemmatization on tweet to get root words
def lemmatize(tweet):
    """Lemmatize every token of ``tweet`` using its part-of-speech tag.

    The tweet is tokenized and POS-tagged with NLTK. Each token is then
    lemmatized with the WordNet POS returned by ``get_wordnet_pos``, or with
    the lemmatizer's default POS when that helper returns ``None``.

    Returns:
        The lemmas concatenated with a space in front of each one, so a
        non-empty result starts with a leading space; an input without
        tokens yields ``""``.
    """
    wnl = WordNetLemmatizer()
    pieces = []
    for token, treebank_tag in nltk.pos_tag(word_tokenize(tweet)):
        wn_pos = get_wordnet_pos(treebank_tag)
        if wn_pos is not None:
            lemma = wnl.lemmatize(token, pos=wn_pos)
        else:
            lemma = wnl.lemmatize(token)
        pieces.append(" " + lemma)
    return "".join(pieces)

# implement part-of-speech tagging to prepare for lemmatization
def get_wordnet_pos(treebank_tag):
    """Map a POS tag to the matching WordNet POS constant.

    Tags starting with ``J``, ``V``, ``N`` or ``R`` map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively;
    any other tag yields ``None``.
    """
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if treebank_tag.startswith(prefix):
            # Resolved lazily so `wordnet` is only touched on a match.
            return getattr(wordnet, attr_name)
    return None

# split tweet into word tokens
def tokenize(tweets):
    """Tokenize each tweet in ``tweets`` with NLTK's ``word_tokenize``.

    Returns a list holding one token list per input tweet, in input order.
    """
    return [word_tokenize(tweet) for tweet in tweets]

# collection of methods to clean tweet content
def normalize(tweet):
    """Lowercase each entry of ``tweet``, then strip numbers and punctuation.

    ``tweet`` must provide ``.apply`` (the script passes a DataFrame
    column). The three steps run in this order: lowercase,
    ``remove_numbers``, ``remove_punctuation``. The transformed result of
    the last ``.apply`` is returned.
    """
    lowered = tweet.apply(lambda text: text.lower())
    without_digits = lowered.apply(lambda text: remove_numbers(text))
    return without_digits.apply(lambda text: remove_punctuation(text))

# convert dataset to dataframe
# The crawled CSV is decoded as ISO-8859-1; its first row is the header.
tweet_df = pd.read_csv('finalOutput_crawled.csv', encoding="ISO-8859-1", header=0)

# initializing list of custom stopwords
# (rebinds the module-level stop_list that remove_stopwords() reads)
stop_list = create_custom_stopwords()

# keep only the first three columns
tweet_df = tweet_df.iloc[:, 0:3]

# adding an extra column to store the cleaned tweets, seeded from the second
# column of the frame
tweet_df["clean_tweet"] = tweet_df.iloc[:, 1]

# remove http/https links, RT
# Each step keeps only the text that PRECEDES the first match, i.e. the tweet
# is truncated at the marker. str(x) coerces non-string cells to text first.
tweet_df["clean_tweet"] = tweet_df["clean_tweet"].apply(lambda x: re.split(r'https://.*', str(x))[0])

# "RT" is matched as a standalone token only, so uppercase words that merely
# contain the letters (e.g. "SMART") no longer cut the tweet short.
# NOTE(review): a tweet that begins with "RT" still becomes empty here because
# everything after the marker is discarded - confirm that this is intended.
tweet_df["clean_tweet"] = tweet_df["clean_tweet"].apply(lambda x: re.split(r'\bRT\b', str(x))[0])

tweet_df["clean_tweet"] = tweet_df["clean_tweet"].apply(lambda x: re.split('http|https', str(x))[0])

# clean the tweet contents (lowercase, strip numbers and punctuation)
tweet_df["clean_tweet"] = normalize(tweet_df["clean_tweet"])

# convert to root form
tweet_df["clean_tweet"] = tweet_df["clean_tweet"].apply(lambda x: lemmatize(x))

# remove stopwords
tweet_df["clean_tweet"] = tweet_df["clean_tweet"].apply(lambda x: remove_stopwords(x))

# storing new dataset into file (row index is not written)
tweet_df.to_csv("finalOutput_cleaned.csv", index=False)

print(tweet_df["clean_tweet"].head(5))


0    poll south carolina minute closing voter cast ...
1    united state top doctor one simple request sto...
2    around south carolina voter democratic primary...
3    alexa negr n luciano transgender woman kill ci...
4    demographic south carolina democratic primary ...
Name: clean_tweet, dtype: object
