## This notebook combines all the raw dirty datasets from Data-dirty folder, drops extra columns from it, remove duplicates and NaN rows, add useful columns,etc. and then saves the cleaned dataset as one csv file.

In [43]:
import pandas as pd
import numpy as np
import os
import string
import nltk
from nltk.corpus import stopwords

## 1. First, we combine all dirty datasets from the files for all months and years in the Dirty-data folder

In [44]:
path = './Data-dirty'
files = os.listdir(path)

In [45]:
# Append dirty data from all available months and years
df = pd.DataFrame()
for name in files:
    tmp = pd.read_csv(os.path.join(path, name))
    df = df.append(tmp)

In [46]:
len(df)

35835

## 2. Now we drop duplicate rows, drop rows with NaN for tweets_Date. We create two extra binary columns, and then drop all other redundant columns. 

In [47]:
# First drop duplicate rows:
df.drop_duplicates()


# Next, drop rows which have tweets_Date == NaN
df.dropna(subset=['tweets_Date'], inplace=True) 
df.dropna(subset=['tweets_Text'], inplace=True) 


# Then drop irrelevant columns as follow:

df = df.drop(labels=['listing_value', 'tweets_Name', 'remove'], axis=1).reset_index(drop=True)


# Now Add a binary column showing whether a tweet has image and/or external links
df['has_image'] = (pd.notnull(df['tweets_Images'])).astype(int)
df['has_Link'] = (pd.notnull(df['tweets_hrefURL'])).astype(int)

# Next, drop columns tweets_Images, tweets_imageURL, tweets_hrefURL, Unnamed
df = df.drop(labels=['tweets_Images', 'tweets_imageURL', 'tweets_hrefURL', 'Unnamed: 0'], axis=1).reset_index(drop=True)


In [49]:
df

Unnamed: 0,tweets_Date,tweets_Likes,tweets_Replies,tweets_Retweets,tweets_Text,has_image,has_Link
0,"Jan 28, 2016",276,49,147,.,1,0
1,"Jan 28, 2016",243,33,288,.,1,0
2,"Jan 28, 2016",180,17,202,Police: Hunt still on for woman who was with a...,1,1
3,"Jan 28, 2016",291,37,360,French police arrest woman who was with man ca...,1,1
4,"Jan 28, 2016",202,22,240,French police hunting woman who was with an ar...,1,1
...,...,...,...,...,...,...,...
20524,Oct 1,447,68,203,"Another 837,000 Americans filed for first-time...",1,1
20525,Oct 1,584,114,183,The EU has launched legal proceedings against ...,1,1
20526,Oct 1,362,70,253,Japan's Tokyo Stock Exchange halted all tradin...,1,1
20527,Oct 1,1.8K,367,534,Brad Parscale has stepped down from his role a...,1,1


## 3. We standardize the number of likes, etc. Then we also clean up tweet texts.

In [50]:
# Method to standardize numbers ==> example: 4k should be 4000
def standardized_replies_retweets_likes(df):
    for i in range(len(df)):
        if 'K' in str(df.at[i,'tweets_Replies']):
            temp = df.at[i,'tweets_Replies'].replace('K','')
            df.at[i,'tweets_Replies'] = float(temp)*1000
        if 'K' in str(df.at[i,'tweets_Retweets']):
            temp = df.at[i,'tweets_Retweets'].replace('K','')
            df.at[i,'tweets_Retweets'] = float(temp)*1000
        if 'K' in str(df.at[i,'tweets_Likes']):
            temp = df.at[i,'tweets_Likes'].replace('K','')
            df.at[i,'tweets_Likes'] = float(temp)*1000

        df.at[i,'tweets_Replies'] = float(df.at[i,'tweets_Replies'])
        df.at[i,'tweets_Retweets'] = float(df.at[i,'tweets_Retweets'])
        df.at[i,'tweets_Likes'] = float(df.at[i,'tweets_Likes'])
            
    return df

In [51]:
df = standardized_replies_retweets_likes(df)

In [52]:
df.head()

Unnamed: 0,tweets_Date,tweets_Likes,tweets_Replies,tweets_Retweets,tweets_Text,has_image,has_Link
0,"Jan 28, 2016",276,49,147,.,1,0
1,"Jan 28, 2016",243,33,288,.,1,0
2,"Jan 28, 2016",180,17,202,Police: Hunt still on for woman who was with a...,1,1
3,"Jan 28, 2016",291,37,360,French police arrest woman who was with man ca...,1,1
4,"Jan 28, 2016",202,22,240,French police hunting woman who was with an ar...,1,1


In [53]:
def text_cleaning(data):
    #lower case
    data['tweets_Text']=data['tweets_Text'].str.lower()
    
    #remove punctuation
    punc_to_remove = string.punctuation
    def remove_punctuation(text):
        return text.translate(str.maketrans('', '', punc_to_remove))
    data['tweets_Text'] = data['tweets_Text'].apply(lambda text: remove_punctuation(text))
    
    #remove stop words
    ", ".join(stopwords.words('english'))
    STOPWORDS = set(stopwords.words('english'))
    def remove_stopwords(text):
        return " ".join([word for word in str(text).split() if word not in STOPWORDS])

    data['tweets_Text'] = data['tweets_Text'].apply(lambda text: remove_stopwords(text))

    return data


    

In [54]:
df = text_cleaning(df)

In [55]:
df.head()

Unnamed: 0,tweets_Date,tweets_Likes,tweets_Replies,tweets_Retweets,tweets_Text,has_image,has_Link
0,"Jan 28, 2016",276,49,147,,1,0
1,"Jan 28, 2016",243,33,288,,1,0
2,"Jan 28, 2016",180,17,202,police hunt still woman armed man near disneyl...,1,1
3,"Jan 28, 2016",291,37,360,french police arrest woman man carrying weapon...,1,1
4,"Jan 28, 2016",202,22,240,french police hunting woman armed man arrested...,1,1


In [56]:
df.dropna(subset=['tweets_Text'], inplace=True) 

In [57]:
df

Unnamed: 0,tweets_Date,tweets_Likes,tweets_Replies,tweets_Retweets,tweets_Text,has_image,has_Link
0,"Jan 28, 2016",276,49,147,,1,0
1,"Jan 28, 2016",243,33,288,,1,0
2,"Jan 28, 2016",180,17,202,police hunt still woman armed man near disneyl...,1,1
3,"Jan 28, 2016",291,37,360,french police arrest woman man carrying weapon...,1,1
4,"Jan 28, 2016",202,22,240,french police hunting woman armed man arrested...,1,1
...,...,...,...,...,...,...,...
20524,Oct 1,447,68,203,another 837000 americans filed firsttime unemp...,1,1
20525,Oct 1,584,114,183,eu launched legal proceedings uk failed withdr...,1,1
20526,Oct 1,362,70,253,japans tokyo stock exchange halted trading tec...,1,1
20527,Oct 1,1800,367,534,brad parscale stepped role senior adviser pres...,1,1


## 4. Now categorize the values of tweets_Replies column and add a new column which presents the categorical variable: ADD CODE HERE FOR CATEGORIES

In [58]:
df['tweets_ResponseCategory'], bins = pd.qcut(df['tweets_Replies'], 3, labels=[0, 1, 2], retbins = True)


In [59]:
df.groupby('tweets_ResponseCategory').size()

tweets_ResponseCategory
0    6966
1    6748
2    6815
dtype: int64

## 5. Remove the NaNs and index columns

In [64]:
df.dropna(subset=['tweets_Text'], inplace=True)
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

df.head()

Unnamed: 0,tweets_Date,tweets_Likes,tweets_Replies,tweets_Retweets,tweets_Text,has_image,has_Link,tweets_ResponseCategory
0,"Jan 28, 2016",276,49,147,,1,0,0
1,"Jan 28, 2016",243,33,288,,1,0,0
2,"Jan 28, 2016",180,17,202,police hunt still woman armed man near disneyl...,1,1,0
3,"Jan 28, 2016",291,37,360,french police arrest woman man carrying weapon...,1,1,0
4,"Jan 28, 2016",202,22,240,french police hunting woman armed man arrested...,1,1,0


## 5. Drop tweets_Date, tweets_Replies, tweets_Retweets, and tweets_Likes and save the final dataset as a csv file. Please indicate in the name of csv weather it has been preprocessed using this notebook.
## We can save two versions, one with the above columns dropped and one including these columns just in case. Please mention in the name which is version is what along with time range.

In [15]:
df.to_csv('./Data-clean/Preprocessed_2015-2020_all_cols.csv', index = False)

In [16]:
df.drop(['tweets_Date', 'tweets_Likes', 'tweets_Replies', 'tweets_Retweets'], axis=1, inplace=True)

In [17]:
df.head()

Unnamed: 0,tweets_Text,has_image,has_Link,tweets_ResponseCategory
0,,1,0,0
1,,1,0,0
2,police hunt still woman armed man near disneyl...,1,1,0
3,french police arrest woman man carrying weapon...,1,1,0
4,french police hunting woman armed man arrested...,1,1,0


In [18]:
df.to_csv('./Data-clean/Preprocessed_2015-2020_short.csv', index = False)