# Preprocessing Twitter data

This notebook pre-processes the Twitter data for topic modeling and sentiment analysis.

Data cleaning:
- Limit to tweets in English
- Transform to all lowercase
- Remove URLs and HTML reference characters
- Remove placeholders
- Remove non-letter characters
- Remove unnecessary columns
- Convert the date column to pandas datetime (`pd.to_datetime`)
- Remove stop words? [This article](https://www.aclweb.org/anthology/L14-1265/) says that removing stop words might affect sentiment analysis performance
- Stem/lemmatize the words?

In [1]:
import pandas as pd
import re
import os

from nltk.tokenize import TweetTokenizer
# Shared tokenizer instance, created once at module level; preprocess_data
# applies its `tokenize` method to every cleaned tweet.
tknzr = TweetTokenizer()

In [2]:
# Pre-process the data
def preprocess_data(df):
    '''
    Pre-processes one raw tweet DataFrame as described above.

    Steps, in order:
      1. Keep only rows whose `lang` column equals "en".
      2. Keep only the columns listed in `columns_to_keep`.
      3. Parse the first 10 characters of each `date` string (YYYY-MM-DD)
         into a pandas datetime.
      4. Lowercase `content` and strip URLs, bare "www..."/".com" domains,
         "{link}" and "[video]" placeholders, HTML entities such as "&amp;",
         and every character that is not a lowercase letter, whitespace, or
         one of the punctuation marks whitelisted in the last pattern.
      5. Tokenize the cleaned text with the module-level `tknzr` into a new
         `tokens` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw tweets. Must contain a `lang` column plus every column named in
        `columns_to_keep`; `date` is expected to hold strings that start
        with YYYY-MM-DD.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the kept columns and
        an extra `tokens` column.

    Raises
    ------
    KeyError
        If one of the required columns is missing from `df`.
    '''
    columns_to_keep = [
        'date', 'content', 'url', 'coordinates', 'place', 'id', 'username',
        'replyCount', 'retweetCount', 'likeCount', 'quoteCount',
        'conversationId', 'retweetedTweet', 'quotedTweet', 'outlinks',
        'tcooutlinks', 'media', 'mentionedUsers'
    ]

    # Removed from the lowercased text in this exact order: the final
    # character whitelist must run last, otherwise it would destroy the
    # punctuation the earlier patterns rely on (e.g. "://", "&...;").
    patterns_to_remove = [
        r'https?:\/\/\S+',                        # http(s) URLs
        # Bare domains. `[a-z]*` (rather than `[a-z]?`) lets the first
        # alternative consume the whole "www.<name>.com" so that no stray
        # "www" is left behind.
        r"www\.[a-z]*\.?(com)+|[a-z]+\.(com)",
        r'{link}',                                # link placeholder
        r"\[video\]",                             # video placeholder
        r'&[a-z]+;',                              # HTML entities
        r"[^a-z\s\(\-:\)\\\/\];='#]",             # anything not whitelisted
    ]

    processed_df = df.loc[df.lang == "en", columns_to_keep].copy()

    # `.str[:10]` slices each date *string* to its YYYY-MM-DD prefix.
    # Slicing the Series itself (`date[:10]`) would only take the first ten
    # rows and leave every other row without a parsed date.
    processed_df['date'] = pd.to_datetime(
        processed_df['date'].str[:10], format="%Y-%m-%d"
    )

    content = processed_df['content'].str.lower()
    for pattern in patterns_to_remove:
        regex = re.compile(pattern)
        content = content.apply(lambda text: regex.sub('', text))
    processed_df['content'] = content

    processed_df['tokens'] = processed_df['content'].apply(tknzr.tokenize)

    return processed_df

In [3]:
# Load Twitter data
# Only the top level of data/ is listed; sub-directories are not searched.
path, dirs, files = next(os.walk("data/"))
df_list = []

# Pre-process
# Keep only CSV files so a stray non-CSV file in data/ is not fed to
# read_csv, and sort the names so the processing order (and therefore the
# row order of the combined frame) is the same on every run and machine.
csv_files = sorted(name for name in files if name.lower().endswith(".csv"))
for file in csv_files:
    print("Working on:", file)
    raw_df = pd.read_csv(os.path.join(path, file))
    clean_df = preprocess_data(raw_df)
    df_list.append(clean_df)

Working on: tweets_9.csv
Working on: tweets_8.csv
Working on: tweets_29.csv
Working on: tweets_15.csv
Working on: tweets_14.csv
Working on: tweets_28.csv
Working on: tweets_16.csv
Working on: tweets_17.csv
Working on: tweets_13.csv
Working on: tweets_12.csv
Working on: tweets_10.csv
Working on: tweets_38.csv
Working on: tweets_39.csv
Working on: tweets_11.csv
Working on: tweets_20.csv
Working on: tweets_34.csv
Working on: tweets_35.csv
Working on: tweets_21.csv
Working on: tweets_37.csv
Working on: tweets_23.csv
Working on: tweets_22.csv
Working on: tweets_36.csv
Working on: tweets_32.csv
Working on: tweets_26.csv
Working on: tweets_27.csv
Working on: tweets_33.csv
Working on: tweets_25.csv
Working on: tweets_31.csv
Working on: tweets_19.csv
Working on: tweets_18.csv
Working on: tweets_30.csv
Working on: tweets_24.csv
Working on: tweets_3.csv
Working on: tweets_2.csv
Working on: tweets_40.csv
Working on: tweets_1.csv
Working on: tweets_5.csv
Working on: tweets_4.csv
Working on: tweets_

In [4]:
# Stack the per-file frames into a single table, renumbering rows 0..n-1
main_df = pd.concat(df_list, ignore_index=True)

In [5]:
# Sanity check: (rows, columns) of the combined frame
main_df.shape

(246014, 19)