## This notebook combines all the raw dirty datasets from Data-dirty folder, drops extra columns from it, remove duplicates and NaN rows, add useful columns,etc. and then saves the cleaned dataset as one csv file.

In [1]:
import pandas as pd
import numpy as np
import os
import string
import nltk
from nltk.corpus import stopwords

## 1. First, we combine all dirty datasets from the files for all months and years in the Dirty-data folder

In [35]:
path = './Data-new/Data-dirty'
files = os.listdir(path)

In [36]:
# Append dirty data from all available months and years
df = pd.DataFrame()
for name in files:
    tmp = pd.read_csv(os.path.join(path, name))
    df = df.append(tmp)

In [37]:
len(df)

77597

## 2. Now we drop duplicate rows, drop rows with NaN for tweets_Date. We create two extra binary columns, and then drop all other redundant columns. 

In [38]:
# First drop duplicate rows:
df.drop_duplicates()


# Next, drop rows which have tweets_Date == NaN
df.dropna(subset=['tweets_Date'], inplace=True) 
df.dropna(subset=['tweets_Text'], inplace=True)
df.dropna(subset=['tweets_Replies'], inplace=True)


# Then drop irrelevant columns as follow:

df = df.drop(labels=['listing_value', 'tweets_Name', 'remove'], axis=1).reset_index(drop=True)


# Now Add a binary column showing whether a tweet has image and/or external links
df['has_image'] = (pd.notnull(df['tweets_Images'])).astype(int)
df['has_Link'] = (pd.notnull(df['tweets_hrefURL'])).astype(int)

# Next, drop columns tweets_Images, tweets_imageURL, tweets_hrefURL, Unnamed
df = df.drop(labels=['tweets_Images', 'tweets_imageURL', 'tweets_hrefURL'], axis=1).reset_index(drop=True)


In [39]:
len(df)

46761

In [40]:
df

Unnamed: 0,tweets_Date,tweets_Likes,tweets_Replies,tweets_Retweets,tweets_Text,has_image,has_Link
0,"Dec 31, 2019",292,30,97,"Syd Mead, the visionary artist who helped shap...",1,0
1,"Dec 31, 2019",494,178,193,The US is sending additional forces to protect...,1,0
2,"Dec 31, 2019",205,28,74,"Book publishing legend Sonny Mehta, editor-in-...",1,0
3,"Dec 31, 2019",319,165,128,President Trump weighs in on protesters' attem...,1,0
4,"Dec 31, 2019",198,77,101,Pro-Iranian protesters tried to storm the US e...,1,0
...,...,...,...,...,...,...,...
46756,"Nov 17, 2015",208,23,158,"Before Paris attacks, France and allies tried ...",0,0
46757,"Nov 17, 2015",519,108,409,Porsche blames actor Paul Walker for his own d...,0,0
46758,"Nov 17, 2015",526,56,935,Russia says the passenger jet that crashed ove...,0,0
46759,"Nov 17, 2015",336,27,498,Yakuza boss of Japan's largest gang syndicate ...,1,1


## 3. We standardize the number of likes, etc. Then we also clean up tweet texts.

In [41]:
# Method to standardize numbers ==> example: 4k should be 4000
def standardized_replies_retweets_likes(df):
    for i in range(len(df)):
        if 'K' in str(df.at[i,'tweets_Replies']):
            temp = df.at[i,'tweets_Replies'].replace('K','')
            df.at[i,'tweets_Replies'] = float(temp)*1000
        if 'K' in str(df.at[i,'tweets_Retweets']):
            temp = df.at[i,'tweets_Retweets'].replace('K','')
            df.at[i,'tweets_Retweets'] = float(temp)*1000
        if 'K' in str(df.at[i,'tweets_Likes']):
            temp = df.at[i,'tweets_Likes'].replace('K','')
            df.at[i,'tweets_Likes'] = float(temp)*1000

        df.at[i,'tweets_Replies'] = float(df.at[i,'tweets_Replies'])
        df.at[i,'tweets_Retweets'] = float(df.at[i,'tweets_Retweets'])
        df.at[i,'tweets_Likes'] = float(df.at[i,'tweets_Likes'])
            
    return df

In [42]:
df = standardized_replies_retweets_likes(df)

In [43]:
df.head()

Unnamed: 0,tweets_Date,tweets_Likes,tweets_Replies,tweets_Retweets,tweets_Text,has_image,has_Link
0,"Dec 31, 2019",292,30,97,"Syd Mead, the visionary artist who helped shap...",1,0
1,"Dec 31, 2019",494,178,193,The US is sending additional forces to protect...,1,0
2,"Dec 31, 2019",205,28,74,"Book publishing legend Sonny Mehta, editor-in-...",1,0
3,"Dec 31, 2019",319,165,128,President Trump weighs in on protesters' attem...,1,0
4,"Dec 31, 2019",198,77,101,Pro-Iranian protesters tried to storm the US e...,1,0


In [44]:
def text_cleaning(data):
    #lower case
    data['tweets_Text']=data['tweets_Text'].str.lower()
    
    #remove punctuation
    punc_to_remove = string.punctuation
    def remove_punctuation(text):
        return text.translate(str.maketrans('', '', punc_to_remove))
    data['tweets_Text'] = data['tweets_Text'].apply(lambda text: remove_punctuation(text))
    
    #remove stop words
    ", ".join(stopwords.words('english'))
    STOPWORDS = set(stopwords.words('english'))
    def remove_stopwords(text):
        return " ".join([word for word in str(text).split() if word not in STOPWORDS])

    data['tweets_Text'] = data['tweets_Text'].apply(lambda text: remove_stopwords(text))

    return data


In [45]:
df = text_cleaning(df)

In [46]:
print('Min Length of Tweet Text: ', df.tweets_Text.map(lambda x: len(x)).min())
print('Mean Length of Tweet Text: ', df.tweets_Text.map(lambda x: len(x)).mean())
print('Max Length of Tweet Text: ', df.tweets_Text.map(lambda x: len(x)).max())

Min Length of Tweet Text:  3
Mean Length of Tweet Text:  92.55007377943157
Max Length of Tweet Text:  246


In [47]:
df.head()

Unnamed: 0,tweets_Date,tweets_Likes,tweets_Replies,tweets_Retweets,tweets_Text,has_image,has_Link
0,"Dec 31, 2019",292,30,97,syd mead visionary artist helped shape look bl...,1,0
1,"Dec 31, 2019",494,178,193,us sending additional forces protect embassy b...,1,0
2,"Dec 31, 2019",205,28,74,book publishing legend sonny mehta editorinchi...,1,0
3,"Dec 31, 2019",319,165,128,president trump weighs protesters attempts sto...,1,0
4,"Dec 31, 2019",198,77,101,proiranian protesters tried storm us embassy b...,1,0


In [48]:
df.dropna(subset=['tweets_Text'], inplace=True) 

In [49]:
df

Unnamed: 0,tweets_Date,tweets_Likes,tweets_Replies,tweets_Retweets,tweets_Text,has_image,has_Link
0,"Dec 31, 2019",292,30,97,syd mead visionary artist helped shape look bl...,1,0
1,"Dec 31, 2019",494,178,193,us sending additional forces protect embassy b...,1,0
2,"Dec 31, 2019",205,28,74,book publishing legend sonny mehta editorinchi...,1,0
3,"Dec 31, 2019",319,165,128,president trump weighs protesters attempts sto...,1,0
4,"Dec 31, 2019",198,77,101,proiranian protesters tried storm us embassy b...,1,0
...,...,...,...,...,...,...,...
46756,"Nov 17, 2015",208,23,158,paris attacks france allies tried target isis ...,0,0
46757,"Nov 17, 2015",519,108,409,porsche blames actor paul walker death one spo...,0,0
46758,"Nov 17, 2015",526,56,935,russia says passenger jet crashed egypt octobe...,0,0
46759,"Nov 17, 2015",336,27,498,yakuza boss japans largest gang syndicate foun...,1,1


## 4. Now categorize the values of tweets_Replies column and add a new column which presents the categorical variable: ADD CODE HERE FOR CATEGORIES

In [50]:
df['tweets_ResponseCategory'], bins = pd.qcut(df['tweets_Replies'], 3, labels=[0, 1, 2], retbins = True)


In [51]:
df.groupby('tweets_ResponseCategory').size()

tweets_ResponseCategory
0    15810
1    15423
2    15528
dtype: int64

## 5. Remove the NaNs and index columns

In [52]:
df.dropna(subset=['tweets_Text'], inplace=True)
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

df.head()

Unnamed: 0,tweets_Date,tweets_Likes,tweets_Replies,tweets_Retweets,tweets_Text,has_image,has_Link,tweets_ResponseCategory
0,"Dec 31, 2019",292,30,97,syd mead visionary artist helped shape look bl...,1,0,1
1,"Dec 31, 2019",494,178,193,us sending additional forces protect embassy b...,1,0,2
2,"Dec 31, 2019",205,28,74,book publishing legend sonny mehta editorinchi...,1,0,1
3,"Dec 31, 2019",319,165,128,president trump weighs protesters attempts sto...,1,0,2
4,"Dec 31, 2019",198,77,101,proiranian protesters tried storm us embassy b...,1,0,2


## 5. Drop tweets_Date, tweets_Replies, tweets_Retweets, and tweets_Likes and save the final dataset as a csv file. Please indicate in the name of csv weather it has been preprocessed using this notebook.
## We can save two versions, one with the above columns dropped and one including these columns just in case. Please mention in the name which is version is what along with time range.

In [53]:
df.to_csv('./Data-new/Data-clean/2011-2020_all_cols.csv', index = False)

In [54]:
df.drop(['tweets_Date', 'tweets_Likes', 'tweets_Replies', 'tweets_Retweets'], axis=1, inplace=True)

In [55]:
df.head()

Unnamed: 0,tweets_Text,has_image,has_Link,tweets_ResponseCategory
0,syd mead visionary artist helped shape look bl...,1,0,1
1,us sending additional forces protect embassy b...,1,0,2
2,book publishing legend sonny mehta editorinchi...,1,0,1
3,president trump weighs protesters attempts sto...,1,0,2
4,proiranian protesters tried storm us embassy b...,1,0,2


In [56]:
df.to_csv('./Data-new/Data-clean/2011-2020_short.csv', index = False)