## This notebook combines all the raw dirty datasets from the Data-dirty folder, drops extra columns, removes duplicate and NaN rows, adds useful columns, etc., and then saves the cleaned dataset as one CSV file.

In [29]:
import pandas as pd
import numpy as np
import os
import string
import nltk
from nltk.corpus import stopwords

## 1. First, we combine all dirty datasets from the files for all months and years in the Data-dirty folder

In [30]:
# Folder holding the raw (dirty) data files.
path = './Data-dirty'
# os.listdir returns names in arbitrary order; sort them so the row order of
# the combined frame is reproducible across runs and machines.
files = sorted(os.listdir(path))

In [31]:
# Read every dirty file and combine them with a single concat call.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# appending inside a loop re-copies the accumulated frame on every iteration.
frames = [pd.read_csv(os.path.join(path, name)) for name in files]
# pd.concat raises on an empty list, so fall back to an empty frame when the
# folder has no files (which is what the append loop produced).
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

## 2. Now we drop duplicate rows, drop rows with NaN for tweets_Date. We create two extra binary columns, and then drop all other redundant columns. 

In [32]:
# First drop duplicate rows. drop_duplicates returns a new frame, so the
# result has to be assigned back; a bare call leaves df unchanged.
# NOTE(review): 'Unnamed: 0' looks like a saved row index; if so, the same
# tweet read from two files will not compare equal here. Confirm whether
# duplicates should instead be detected on the tweet columns only.
df = df.drop_duplicates()

# Next, drop rows which have tweets_Date == NaN
df = df.dropna(subset=['tweets_Date'])

# Then drop irrelevant columns and renumber the rows from 0
df = df.drop(columns=['listing_value', 'tweets_Name', 'remove']).reset_index(drop=True)

# Now add binary columns showing whether a tweet has an image and/or an external link
df['has_image'] = df['tweets_Images'].notna().astype(int)
df['has_Link'] = df['tweets_hrefURL'].notna().astype(int)

# Next, drop the source columns that are no longer needed once the flags exist
df = df.drop(
    columns=['tweets_Images', 'tweets_imageURL', 'tweets_hrefURL', 'Unnamed: 0']
).reset_index(drop=True)


In [33]:
# Inspect the frame after the column clean-up.
df

Unnamed: 0,tweets_Date,tweets_Text,tweets_Replies,tweets_Retweets,tweets_Likes,has_image,has_Link
0,May 31,"Christo Vladimirov Javacheff, who was known fo...",78,550,1.3K,1,1
1,May 31,More than 20 cities have imposed curfews Sunda...,238,353,1.2K,0,0
2,May 31,US astronauts Robert Behnken and Douglas Hurle...,86,447,2.5K,1,1
3,May 31,Trump announces the US will designate Antifa a...,1.3K,1.2K,2.9K,1,0
4,May 31,NASA astronauts aboard the SpaceX Crew Dragon ...,60,464,2.6K,1,1
...,...,...,...,...,...,...,...
9939,Oct 1,"Another 837,000 Americans filed for first-time...",68,203,447,1,1
9940,Oct 1,The EU has launched legal proceedings against ...,114,183,584,1,1
9941,Oct 1,Japan's Tokyo Stock Exchange halted all tradin...,70,253,362,1,1
9942,Oct 1,Brad Parscale has stepped down from his role a...,367,534,1.8K,1,1


## 3. We standardize the number of likes, etc. Then we also clean up tweet texts.

In [34]:
# Method to standardize numbers ==> example: 4K should be 4000
def standardized_replies_retweets_likes(df):
    """Convert abbreviated engagement counts to plain numbers.

    The columns ``tweets_Replies``, ``tweets_Retweets`` and ``tweets_Likes``
    are rewritten so that every value is numeric: ``'1.3K'`` becomes
    ``1300.0``, ``'2M'`` becomes ``2000000.0``, ``'1,234'`` becomes
    ``1234.0`` and ``'78'`` becomes ``78.0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three count columns. Any index is accepted.
        The columns are overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so ``df = f(df)`` keeps working.

    Notes
    -----
    Missing or unparseable values become NaN. Values that are already
    numeric pass through unchanged, so the function is safe to re-run.
    """
    multipliers = {'K': 1000, 'M': 1000000, 'B': 1000000000}

    for column in ('tweets_Replies', 'tweets_Retweets', 'tweets_Likes'):
        # Work on a normalised string view: no surrounding blanks, no
        # thousands separators, upper-case suffix.
        text = (
            df[column]
            .astype(str)
            .str.strip()
            .str.replace(',', '', regex=False)
            .str.upper()
        )

        # Map the last character to its multiplier; NaN means "no suffix".
        scale = text.str[-1:].map(multipliers).astype('float64')
        has_suffix = scale.notna()

        # Strip the suffix only where one was recognised.
        digits = text.where(~has_suffix, text.str[:-1])
        numbers = pd.to_numeric(digits, errors='coerce')

        # Counts are whole numbers; rounding guards against float artefacts
        # from multiplying decimal fractions.
        df[column] = (numbers * scale.fillna(1)).round()

    return df

In [35]:
# Convert abbreviated counts (e.g. '1.3K') in the replies/retweets/likes columns.
df = standardized_replies_retweets_likes(df)

In [36]:
# Preview the first rows to check the converted counts.
df.head()

Unnamed: 0,tweets_Date,tweets_Text,tweets_Replies,tweets_Retweets,tweets_Likes,has_image,has_Link
0,May 31,"Christo Vladimirov Javacheff, who was known fo...",78,550,1300,1,1
1,May 31,More than 20 cities have imposed curfews Sunda...,238,353,1200,0,0
2,May 31,US astronauts Robert Behnken and Douglas Hurle...,86,447,2500,1,1
3,May 31,Trump announces the US will designate Antifa a...,1300,1200,2900,1,0
4,May 31,NASA astronauts aboard the SpaceX Crew Dragon ...,60,464,2600,1,1


In [40]:
def text_cleaning(data):
    """Normalise the ``tweets_Text`` column for text analysis.

    The text is lower-cased, every character in ``string.punctuation`` is
    deleted, and NLTK's English stop words are removed; the remaining words
    are re-joined with single spaces.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``tweets_Text`` column. The column is overwritten in
        place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned for convenient reassignment.

    Notes
    -----
    Missing texts (NaN) are left as missing instead of raising.
    Requires the NLTK ``stopwords`` corpus to be available.
    """
    # Build the lookup structures once instead of once per row.
    punctuation_table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))

    def remove_stopwords(text):
        # NaN / non-string cells pass through untouched.
        if not isinstance(text, str):
            return text
        return " ".join(word for word in text.split() if word not in stop_words)

    # The .str accessor propagates NaN, whereas calling text.translate through
    # apply fails on a float NaN.
    cleaned = data['tweets_Text'].str.lower().str.translate(punctuation_table)
    data['tweets_Text'] = cleaned.apply(remove_stopwords)

    # NOTE(review): punctuation is stripped before stop-word matching, so a
    # contraction such as "don't" becomes "dont" and may no longer match the
    # stop-word list. Confirm whether that is intended.
    return data


    

In [41]:
# Lower-case the tweet text and strip punctuation and English stop words.
df = text_cleaning(df)

In [42]:
# Preview the first rows to check the cleaned text.
df.head()

Unnamed: 0,tweets_Date,tweets_Text,tweets_Replies,tweets_Retweets,tweets_Likes,has_image,has_Link
0,May 31,christo vladimirov javacheff known monumental ...,78,550,1300,1,1
1,May 31,20 cities imposed curfews sunday night respons...,238,353,1200,0,0
2,May 31,us astronauts robert behnken douglas hurley su...,86,447,2500,1,1
3,May 31,trump announces us designate antifa terrorist ...,1300,1200,2900,1,0
4,May 31,nasa astronauts aboard spacex crew dragon spac...,60,464,2600,1,1


## 4. Now categorize the values of tweets_Replies column and add a new column which presents the categorical variable: ADD CODE HERE FOR CATEGORIES

## 5. Drop tweets_Date, tweets_Replies, tweets_Retweets, and tweets_Likes and save the final dataset as a CSV file. Please indicate in the name of the CSV whether it has been preprocessed using this notebook.
## We can save two versions, one with the above columns dropped and one including these columns just in case. Please mention in the file name which version is which, along with the time range.