## This notebook combines all the raw (dirty) datasets from the Data-dirty folder, drops the extra columns, removes duplicate rows and rows with NaN dates, adds useful columns, etc., and then saves the cleaned, combined dataset as csv.

In [1]:
import pandas as pd
import numpy as np
import os
import string
import nltk
from nltk.corpus import stopwords

## 1. First, we combine the dirty datasets for all months and years found in the Data-dirty folder

In [2]:
path = './Data-dirty'
# os.listdir() returns every directory entry in arbitrary order, including
# hidden files and sub-directories, which pd.read_csv cannot parse.
# Keep only regular, non-hidden files and sort them so that the combined
# frame has the same row order on every run.
files = sorted(
    name for name in os.listdir(path)
    if not name.startswith('.') and os.path.isfile(os.path.join(path, name))
)

In [3]:
# Combine dirty data from all available months and years.
# Each file is read once and all frames are concatenated in a single call:
# growing a DataFrame with append() inside a loop copies the accumulated data
# on every iteration, and DataFrame.append is no longer available in recent pandas.
frames = [pd.read_csv(os.path.join(path, name)) for name in files]
if not frames:
    # Fail here with a clear message instead of a KeyError in a later cell.
    raise FileNotFoundError('No input files found in {!r}'.format(path))

# sort=False keeps the column order of the input files (and silences the
# "sort" FutureWarning); ignore_index=True gives the combined frame a clean
# 0..n-1 index instead of repeating each file's own row numbers.
df = pd.concat(frames, ignore_index=True, sort=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


## 2. Now we drop duplicate rows and rows with NaN in tweets_Date. We then create two extra binary columns and drop all the remaining redundant columns.

In [4]:
# First drop duplicate rows.
# drop_duplicates() returns a new frame, so the result has to be assigned back;
# calling it without assignment leaves df unchanged.
# NOTE(review): 'Unnamed: 0' is still present at this point, so rows that differ
# only in that column are not treated as duplicates - confirm this is intended.
df = df.drop_duplicates()


# Next, drop rows which have tweets_Date == NaN
df = df.dropna(subset=['tweets_Date'])


# Then drop irrelevant columns. The index is reset so that it runs 0..n-1 again
# after the row removals above.
df = df.drop(columns=['listing_value', 'tweets_Name', 'remove']).reset_index(drop=True)


# Now add binary columns showing whether a tweet has an image and/or an external link
# (1 when the source column is non-null, 0 otherwise)
df['has_image'] = df['tweets_Images'].notnull().astype(int)
df['has_Link'] = df['tweets_hrefURL'].notnull().astype(int)

# Next, drop columns tweets_Images, tweets_imageURL, tweets_hrefURL, Unnamed: 0,
# which are no longer needed once the binary flags exist
df = df.drop(columns=['tweets_Images', 'tweets_imageURL', 'tweets_hrefURL', 'Unnamed: 0']).reset_index(drop=True)


In [5]:
# Display the cleaned frame (the rendered output is truncated to its first and last rows)
df

Unnamed: 0,tweets_Date,tweets_Likes,tweets_Replies,tweets_Retweets,tweets_Text,has_image,has_Link
0,Sep 30,424,96,306,Initial reports suggest three rockets struck a...,0,0
1,Sep 30,341,164,138,The Senate has approved a stopgap spending bil...,1,1
2,Sep 30,296,88,191,Seven people were shot at a funeral home in Mi...,1,1
3,Sep 30,651,64,237,"Clare Bronfman, heiress to the Seagram liquor ...",1,1
4,Sep 30,510,491,155,Trump plans to participate in the next two pre...,1,0
...,...,...,...,...,...,...,...
9939,Oct 1,447,68,203,"Another 837,000 Americans filed for first-time...",1,1
9940,Oct 1,584,114,183,The EU has launched legal proceedings against ...,1,1
9941,Oct 1,362,70,253,Japan's Tokyo Stock Exchange halted all tradin...,1,1
9942,Oct 1,1.8K,367,534,Brad Parscale has stepped down from his role a...,1,1


## 3. We standardize the number of likes, etc. Then we also clean up tweet texts.

In [6]:
# Method to standardize numbers ==> example: 4K should be 4000
_COUNT_SUFFIX_MULTIPLIERS = {'K': 1e3, 'M': 1e6, 'B': 1e9}
_COUNT_COLUMNS = ('tweets_Replies', 'tweets_Retweets', 'tweets_Likes')


def _parse_abbreviated_count(value):
    """Convert a single count such as '1.8K', '2M', '1,234' or 424 to a float.

    Missing values are returned as NaN. Text that is not a number (after the
    optional suffix and thousands separators are removed) raises ValueError.
    """
    if pd.isna(value):
        return float('nan')
    if not isinstance(value, str):
        # Already numeric (column was read as int/float): just normalise the type.
        return float(value)

    text = value.strip().replace(',', '')
    multiplier = _COUNT_SUFFIX_MULTIPLIERS.get(text[-1:].upper())
    if multiplier is not None:
        return float(text[:-1]) * multiplier
    return float(text)


def standardized_replies_retweets_likes(df):
    """Convert the reply, retweet and like counts of ``df`` to plain floats.

    Abbreviated values are expanded ('1.8K' -> 1800.0, '2M' -> 2000000.0),
    thousands separators are removed and the three columns end up with a
    float64 dtype. The conversion is applied per column, so it does not
    depend on ``df`` having a 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns 'tweets_Replies', 'tweets_Retweets'
        and 'tweets_Likes'.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the three columns converted in place.

    Raises
    ------
    ValueError
        If a value cannot be interpreted as a number.
    """
    for column in _COUNT_COLUMNS:
        df[column] = df[column].map(_parse_abbreviated_count).astype(float)

    return df

In [7]:
# Convert the reply / retweet / like counts to floats (e.g. '1.8K' becomes 1800.0)
df = standardized_replies_retweets_likes(df)

In [8]:
# Preview the first rows after the counts have been standardized
df.head()

Unnamed: 0,tweets_Date,tweets_Likes,tweets_Replies,tweets_Retweets,tweets_Text,has_image,has_Link
0,Sep 30,424,96,306,Initial reports suggest three rockets struck a...,0,0
1,Sep 30,341,164,138,The Senate has approved a stopgap spending bil...,1,1
2,Sep 30,296,88,191,Seven people were shot at a funeral home in Mi...,1,1
3,Sep 30,651,64,237,"Clare Bronfman, heiress to the Seagram liquor ...",1,1
4,Sep 30,510,491,155,Trump plans to participate in the next two pre...,1,0


In [9]:
def text_cleaning(data, stop_words=None):
    """Normalise the 'tweets_Text' column of ``data``.

    Each text is lower-cased, stripped of ASCII punctuation
    (``string.punctuation``) and has its stop words removed; the remaining
    words are joined with single spaces. Missing texts become ''.

    Because punctuation is removed before the stop-word filter runs, the stop
    words receive the same normalisation, so that e.g. "don't" still matches
    the cleaned token "dont".

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'tweets_Text' column.
    stop_words : iterable of str, optional
        Words to remove. Defaults to NLTK's English stop-word list, which
        requires the NLTK 'stopwords' corpus to be available locally.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with 'tweets_Text' replaced in place.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)

    if stop_words is None:
        stop_words = stopwords.words('english')
    # Normalise the stop words exactly like the text so contractions match.
    normalized_stop_words = {
        str(word).lower().translate(strip_punctuation) for word in stop_words
    }
    normalized_stop_words.discard('')

    def clean(text):
        # Missing text (NaN/None) has nothing to clean.
        if pd.isna(text):
            return ''
        tokens = str(text).lower().translate(strip_punctuation).split()
        return " ".join(token for token in tokens if token not in normalized_stop_words)

    data['tweets_Text'] = data['tweets_Text'].map(clean)

    return data


    

In [10]:
# Lower-case the tweet texts and strip punctuation and English stop words
df = text_cleaning(df)

In [11]:
# Preview the first rows after the tweet texts have been cleaned
df.head()

Unnamed: 0,tweets_Date,tweets_Likes,tweets_Replies,tweets_Retweets,tweets_Text,has_image,has_Link
0,Sep 30,424,96,306,initial reports suggest three rockets struck b...,0,0
1,Sep 30,341,164,138,senate approved stopgap spending bill order av...,1,1
2,Sep 30,296,88,191,seven people shot funeral home milwaukee treat...,1,1
3,Sep 30,651,64,237,clare bronfman heiress seagram liquor fortune ...,1,1
4,Sep 30,510,491,155,trump plans participate next two presidential ...,1,0


## 4. Now categorize the values of the tweets_Replies column and add a new column, tweets_ResponseCategory, holding the resulting categorical variable (three quantile-based bins labelled 0, 1 and 2).

In [13]:
# Split tweets_Replies into 3 quantile-based bins and label them 0, 1, 2
# (0 = fewest replies, 2 = most replies)
df['tweets_ResponseCategory'] = pd.qcut(df['tweets_Replies'], 3, labels=[0, 1, 2])


In [14]:
# Count the rows in each response category to check that the bins are roughly balanced
df.groupby('tweets_ResponseCategory').size()

tweets_ResponseCategory
0    3344
1    3304
2    3296
dtype: int64

## 5. Drop tweets_Date, tweets_Replies, tweets_Retweets, and tweets_Likes and save the final dataset as a csv file. Please indicate in the name of the csv whether it has been preprocessed using this notebook.
## We can save two versions, one with the above columns dropped and one including these columns just in case. Please mention in the name which version is which, along with the time range.

In [16]:
# Save the version that still contains the date and the raw engagement counts.
# to_csv does not create missing folders, so make sure the target exists first.
# NOTE(review): the index is written as an unnamed first column, which comes
# back as 'Unnamed: 0' on read_csv - confirm downstream code expects it
# before switching to index=False.
os.makedirs('./Data-clean', exist_ok=True)
df.to_csv('./Data-clean/Preprocessed_with_all_cols.csv')

In [18]:
# Remove the date and the raw engagement counts; the text, the two binary
# flags and the response category remain.
df = df.drop(columns=['tweets_Date', 'tweets_Likes', 'tweets_Replies', 'tweets_Retweets'])

In [19]:
# Save the reduced version (date and raw engagement counts already dropped)
df.to_csv('./Data-clean/Preprocessed_only_txt_has_image_has_link.csv')