In [1]:
import pandas as pd
import pickle
import re
import os
from collections import defaultdict
from time import time
import csv
from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO
import multiprocessing
from gensim.models import Word2Vec
import numpy as np
import logging
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [2]:
#importing list of slang words

In [2]:
slang_dict = {}

for _row in csv.reader(open("slang.txt", "r"), delimiter = "="):
    if _row != "":
        slang_dict[_row[0]] = _row[1]

In [5]:
#defining a list of punctuation symbols

In [3]:
punctuation_list = [",", ".", "!", "?", "$", "%", "^", "&", "*", ")", "(", "=", \
                    ":", "|", ";", "<", ">", "~", "‘", "'", "/", "\\", "’", "-"]

In [7]:
# defining a class that goes through all csv files(1 file per day) within a folder, applies all 
#the cleaning functionswithin it and saves the cleaned tweets in a dataframe, saves each day's dataframe
#in a pickled file in a folder title "cleaned_folder" 

In [4]:
class tweet_cleaner:
    
    def __init__(self, _file, _folder, save_folder, necessary_columns = ["id", "conversation_id", "date", "time", \
                                                            "user_id", "tweet", "retweets_count",\
                                                            "replies_count", "likes_count", "retweet"]):
        self.folder = _folder
        self.file = _file
        self.save_folder = save_folder
        self.necessary_columns = necessary_columns
        self.df = pd.read_csv(os.path.join(os.path.abspath(self.folder), self.file))
        self.cleaned_df = self.clean_tweet()
        self.output = self.save_output()
        
        
    
    @staticmethod
    #find the share of capital letters used in a tweet
    def find_caps_share(tweet):
        caps_length = len(re.findall(r"[A-Z]", tweet))
        tweet_length = len(re.sub(r"\s", "", tweet))
        
        return caps_length/(1 + tweet_length)
    
    @staticmethod
    #replace contractions such as "can't", "won't" etc with "cannot", "will not"
    def remove_contractions(tweet):
        tweet = tweet.lower()
        tweet = re.sub(r"can\s?'\s?t", "cannot", tweet)
        tweet = re.sub(r"won\s?'\s?t", "will not", tweet)
        tweet = re.sub(r"n\s?'\s?t", " not", tweet)
        tweet = re.sub(r"'\s?ve", " have", tweet)
        tweet = re.sub(r"'\s?ll", " will", tweet)
        tweet = re.sub(r"'\s?re", " are", tweet)
        tweet = re.sub(r"'\s?m", " am", tweet)
        tweet = re.sub(r"'\s?d", " would", tweet)
        
        tweet = re.sub(r"can\s?‘\s?t", "cannot", tweet)
        tweet = re.sub(r"won\s?‘\s?t", "will not", tweet)
        tweet = re.sub(r"n\s?‘\s?t", " not", tweet)
        tweet = re.sub(r"‘\s?ve", " have", tweet)
        tweet = re.sub(r"‘\s?ll", " will", tweet)
        tweet = re.sub(r"‘\s?re", " are", tweet)
        tweet = re.sub(r"‘\s?m", " am", tweet)
        tweet = re.sub(r"‘\s?d", " would", tweet)
        
        tweet = re.sub(r"can\s?’\s?t", "cannot", tweet)
        tweet = re.sub(r"won\s?’\s?t", "will not", tweet)
        tweet = re.sub(r"n\s?’\s?t", " not", tweet)
        tweet = re.sub(r"’\s?ve", " have", tweet)
        tweet = re.sub(r"’\s?ll", " will", tweet)
        tweet = re.sub(r"’\s?re", " are", tweet)
        tweet = re.sub(r"’\s?m", " am", tweet)
        tweet = re.sub(r"’\s?d", " would", tweet)
        
        return " ".join(tweet.split())
    
    @staticmethod
    #remove all the punctuation symbols defined before from the tweet
    def remove_punctuation(tweet):
        for _punct in punctuation_list:
            tweet = tweet.replace(_punct, " ")
        return " ".join(tweet.split())
    
    @staticmethod
    #return the hashtag used in the tweet
    def tweet_hashtag(tweet):
        splits = tweet.split()
        splits = [split[1:] for split in splits if split[0] == "#"]
        return " ".join(splits)
    
    @staticmethod
    #return all users mentioned in the tweet
    def tweet_target(tweet):
        splits = tweet.split()
        splits = [split[1:] for split in splits if split[0] == "@"]
        return " ".join(splits)
    
    @staticmethod
    #remove any users mentioned in the tweet
    def remove_targets(tweet):
        tweet = re.sub(r"@_*\w+(_+\w+)*_*", " ", tweet)
        return " ".join(tweet.split())
    
    @staticmethod
    #remove any hashtags in the tweet
    def remove_hashtags(tweet):
        tweet = re.sub(r"#\w+", " ", tweet)
        return " ".join(tweet.split())
    
    @staticmethod
    #remove all urls in the tweet
    def remove_url(tweet):
        tweet = re.sub(r"https?://t.co/\w+", " ", tweet)
        return " ".join(tweet.split())
    
    @staticmethod
    #identify all emojis used in the tweet, find their meaning in string format, return a string made up of all emoji meanings
    def find_emojis(tweet):
        emojis = []
        for _emoji in UNICODE_EMOJI:
            if _emoji in tweet:
                for i in range(len((re.findall(_emoji, tweet)))):
                    emojis.append(UNICODE_EMOJI[_emoji].replace(":", ""))
            else:
                pass
        
        return " ".join(emojis)
    
    @staticmethod
    #remove all emojis in the tweet
    def remove_emojis(tweet):
        for _emoji in UNICODE_EMOJI:
            tweet = tweet.replace(_emoji, " ")
        return " ".join(tweet.split())
    
    @staticmethod
    #replace all slang words in the tweet by their meaning using previously imported list
    def remove_slang(tweet):
        new_text = []
        for _word in tweet.split():
            if _word.upper() in slang_dict.keys():
                new_text.append(slang_dict[_word.upper()])
            else:
                new_text.append(_word)
        
        return " ".join(new_text)
    
    @staticmethod
    #remove remaning punctuation marks
    def make_alphabetic(tweet):
        tweet = tweet.replace("…", " ")
        tweet = tweet.replace("_", " ")
        tweet = tweet.replace("@", " ")
        tweet = tweet.replace("#", " ")
        return " ".join([_word for _word in tweet.split() if _word.isalpha() and len(_word) > 1 \
                        and _word != "rt"])
    
    #create dataframe out of day's data. create the new columns "cleaned_tweet", "caps_share", "target", "hashtag", 
    #"emojis" 
    def clean_tweet(self):
        _cleaned = self.df.copy()
        _cleaned = _cleaned.where(_cleaned.language == "en").dropna(subset = ["language"])
        _cleaned.reset_index(drop = True, inplace = True)
        _cleaned = _cleaned[self.necessary_columns]
        _cleaned["caps_share"] = _cleaned["tweet"].apply(self.find_caps_share)
        _cleaned["cleaned_tweet"] = _cleaned["tweet"].apply(self.remove_contractions)
        _cleaned["cleaned_tweet"] = _cleaned["cleaned_tweet"].apply(self.remove_url)
        _cleaned["cleaned_tweet"] = _cleaned["cleaned_tweet"].apply(self.remove_punctuation)
        _cleaned["target"] = _cleaned["cleaned_tweet"].apply(self.tweet_target)
        _cleaned["hashtag"] = _cleaned["cleaned_tweet"].apply(self.tweet_hashtag)
        _cleaned["cleaned_tweet"] = _cleaned["cleaned_tweet"].apply(self.remove_targets)
        _cleaned["cleaned_tweet"] = _cleaned["cleaned_tweet"].apply(self.remove_hashtags)
        _cleaned["emojis"] = _cleaned["cleaned_tweet"].apply(self.find_emojis)
        _cleaned["cleaned_tweet"] = _cleaned["cleaned_tweet"].apply(self.remove_emojis)
        _cleaned["cleaned_tweet"] = _cleaned["cleaned_tweet"].apply(self.remove_slang)
        _cleaned["cleaned_tweet"] = _cleaned["cleaned_tweet"].apply(self.make_alphabetic)
        _cleaned.dropna(subset = ["cleaned_tweet"], inplace = True)
        _cleaned.reset_index(drop = True, inplace = True)
        return _cleaned
    
    #save the dataframe as pickled file titled "cleaned filename" in a new folder titled "cleaned foldername" with 
    def save_output(self):
        try:
            state_name = self.folder.rpartition("\\")[-1]
            dirname = "cleaned {}".format(state_name)
            folder_name = self.save_folder + "\\" +  dirname
            os.mkdir(folder_name)
            print("Folder {} created".format(dirname))
        except FileExistsError:
            pass
        with open(os.path.join(os.path.abspath(folder_name), "cleaned {}.pkl".format(self.file)), "wb") as f:
            pickle.dump(self.cleaned_df, f)

In [9]:
#running a loop through all states in the Raw Data folder and saving the cleaned output state-wise in the 
#"Cleaned Data" folder

In [6]:
t = time()
for _folder in os.listdir("Raw Data"):
    t_1 = time()
    file_list = [_file for _file in os.listdir("Raw Data\\{}".format(_folder)) if ".csv" in _file]
    for _file in file_list:
        tweet_cleaner(_file, os.path.join("Raw Data", _folder), "Cleaned Data")
    print("{} completed".format(_folder))
    print("time taken: {} mins".format(round((time() - t_1)/60, 2)))
print("time taken: {} mins".format(round((time() - t)/60, 2)))

Folder cleaned Andhra_Pradesh created
Andhra_Pradesh completed
time taken: 3.2 mins


NameError: name 't' is not defined

In [7]:
test = pickle.load(open("Cleaned Data\\cleaned Andhra_Pradesh\\cleaned 2020-01-22.csv.pkl", "rb"))

In [9]:
test.shape

(1345, 15)

In [10]:
test.head()

Unnamed: 0,id,conversation_id,date,time,user_id,tweet,retweets_count,replies_count,likes_count,retweet,caps_share,cleaned_tweet,target,hashtag,emojis
0,1.220133e+18,1.220133e+18,2020-01-23,00:57:35,63411110.0,Follow @tokslabossmua on YouTube: thanks so mu...,0.0,0.0,0.0,False,0.086957,follow on youtube thanks so much dossier perfu...,tokslabossmua,,
1,1.220077e+18,1.220077e+18,2020-01-22,21:12:47,3309983000.0,Enjoy,0.0,0.0,0.0,False,0.166667,enjoy,,,
2,1.220059e+18,1.220059e+18,2020-01-22,19:59:54,1.032304e+18,#JustAskSachin Sir what is u r stands on CAA ...,0.0,0.0,0.0,False,0.212329,sir what is You stands on caa while ago You ha...,,justasksachin,
3,1.220054e+18,1.220054e+18,2020-01-22,19:43:49,88164340.0,THINKING OF A SEA CHANGE ? HERE IS YOUR OPPORT...,0.0,0.0,0.0,False,0.385714,thinking of sea change here is your opportunity,,residentialplot land forsale beachroad visakha...,
4,1.220051e+18,1.220011e+18,2020-01-22,19:30:00,1.160402e+18,@turagasudhakar @BeSriSri @prasana_kumar @saib...,0.0,0.0,0.0,False,0.054795,ok how deciding there in andhra three capital ...,turagasudhakar besrisri prasana_kumar saibolli...,,


In [10]:
#collecting all tweets written in Roman script to train word2vec

In [11]:
class tweet_collecter:
    
    def __init__(self, folder):
        self.folder = folder
        self.file_list = [file for file in os.listdir(self.folder) if ".csv" in file]
        self.tweets = self.extract_tweets()
    
    #take a dataframe as input and save it in a csv file
    @staticmethod
    def push_to_csv(_df):
        _df.to_csv("word2vec_tweets.csv", mode = "a", header = False, index = False)
        with open("word2vec_tweets.csv") as f:
            f.close()
        
    
    #load every file in the folder as a dataframe
    #drop duplicate tweets and any tweets with NaN in the "cleaned_tweet" column
    #use the push_to_csv method to save the "cleaned_tweet" and "id" column in a csv file
    def extract_tweets(self):
        for file in self.file_list:
            df = pickle.load(open(os.path.join(os.path.abspath(self.folder), file), "rb"))
            df = df.dropna(subset = ["cleaned_tweet"])
            df.drop_duplicates(subset = ["cleaned_tweet"], inplace = True)
            df.reset_index(inplace = True, drop = True)
            self.push_to_csv(df.loc[:, ["cleaned_tweet", "id"]])

In [None]:
#iterate through every folder in "Cleaned Data" folder and apply the above class to it

In [12]:
t_1 = time()
for _folder in os.listdir("Cleaned Data"):
    t_2 = time()
    _path = os.path.join("Cleaned Data", _folder)
    tweet_collecter(_path)
    print("completed {}".format(_folder))
    print("time taken to complete: {} mins".format(round((time() - t_2)/60, 2)))
print("time taken to complete program: {} mins".format(round((time() - t_1)/60, 2)))

completed cleaned Andhra_Pradesh
time taken to complete: 0.07 mins
time taken to complete program: 0.07 mins


In [12]:
#reading the file of collected tweets and removing duplicate tweets from them

In [13]:
tweets = pd.read_csv("word2vec_tweets.csv", header = None)

In [14]:
tweets.shape

(195870, 2)

In [15]:
tweets.drop_duplicates(subset = [0], inplace = True)
tweets.drop_duplicates(subset = [1], inplace = True)

In [16]:
tweets.shape

(175192, 2)

In [13]:
#writing the tweets to a txt file for feeding into the word2vec 

In [17]:
with open("word2vec_tweets.txt", "w", encoding = "utf-8") as f:
    for _row in tweets[0]:
        f.write(str(_row))
        f.write("\n")

In [15]:
#saving tweet ids to a pickle file

In [18]:
pickle.dump(tweets[1].values, open("unique tweet ids.pkl", "wb"))

In [16]:
#finding tweet ids of all tweets across all states, taking care to drop all duplicates within the same day and
#then adding all these ids to a list. Only the unique values in this list are retained as the final ids of
#state

In [19]:
t = time()
state_list = [state for state in os.listdir("Cleaned Data")]
for state in state_list:
    ids = np.array(0)
    t_1 = time()
    _path = os.path.join("Cleaned Data", state)
    file_list = [file for file in os.listdir(_path) if ".csv" in file]
    for file in file_list:
        df = pickle.load(open(os.path.join(_path, file), "rb"))
        df = df.where(df["id"].isin(tweets[1].values)).dropna(subset = ["id"])
        df.drop_duplicates(subset = ["id"], inplace = True)
        df.drop_duplicates(subset = ["cleaned_tweet"], inplace = True)
        ids = np.append(ids, df["id"].values)
    ids = np.unique(ids)
    pickle.dump(ids, open(os.path.join(_path, "{} ids.pkl".format(state.split()[-1])), "wb"))
print("time taken: {} mins".format(round((time() - t)/60, 2)))

time taken: 0.03 mins


In [None]:
#eliminating duplicate tweets across states. code snippet compares the ids list of two states, keeping only the
#unique values of list 2 in list 2, and saving list 1 as is

In [20]:
state_list = os.listdir("Cleaned Data")
for state in state_list:
    search_list = state_list[state_list.index(state) + 1: ]
    _path = os.path.join("Cleaned Data", state)
    state_ids = pickle.load(open(os.path.join(_path, "{} ids.pkl".format(state.split()[-1])), "rb"))
    for _state in search_list:
        _path_2 = os.path.join("Cleaned Data", _state)
        _state_ids = pickle.load(open(os.path.join(_path_2, "{} ids.pkl".format(_state.split()[-1])), "rb"))
        mask = np.isin(_state_ids, state_ids, invert = True)
        _state = _state_ids[mask]
        pickle.dump(_state_ids, open(os.path.join(_path_2, "{} ids.pkl".format(_state.split()[-1])), "wb"))
    pickle.dump(state_ids, open(os.path.join(_path, "{} ids.pkl".format(state.split()[-1])), "wb"))

In [2]:
#tweets were scraped from EU timezone, so fixing the date and time mismatch such that each tweet carries the
#correct time

In [3]:
#first we convert the timezone from EU to Asia/Kolkata such that each tweet shows its India time

In [4]:
class timezone_converter:
    
    def __init__(self, _folder):
        self.folder = _folder
        self.file_list = [_file for _file in os.listdir(self.folder) if ".csv" in _file]
        self.convert = self.convert_timezone()
        
    def convert_timezone(self):
        for _file in self.file_list:
            _filepath = os.path.join(self.folder, _file)
            with open(_filepath, "rb") as f:
                _df = pickle.load(f)
            _df["date_time"] = pd.to_datetime(_df["date"] + " " + _df["time"], utc = True)
            _df["date_time"] = _df["date_time"].dt.tz_convert("Asia/Kolkata").dt.tz_localize(None)
            with open(_filepath, "wb") as f:
                pickle.dump(_df, f)

In [6]:
t = time()
state_list = os.listdir("Cleaned Data")
for _state in state_list:
    t_1 = time()
    _path = os.path.join("Cleaned Data", _state)
    timezone_converter(_path)
    print("{} finished".format(_state.split()[-1]))
    print("time taken: {} mins".format(round((time() - t_1)/60, 2)))
print("time taken: {} mins".format(round((time() - t)/60, 2)))

Andhra_Pradesh finished
time taken: 0.05 mins
time taken: 0.05 mins


In [7]:
test = pickle.load(open("Cleaned Data\\cleaned Andhra_Pradesh\\cleaned 2020-01-22.csv.pkl", "rb"))

In [8]:
test.shape

(1195, 18)

In [13]:
test.head()

Unnamed: 0,id,conversation_id,date,time,user_id,tweet,retweets_count,replies_count,likes_count,retweet,caps_share,cleaned_tweet,target,hashtag,emojis,polarity_new,muslim,date_time
0,1.220133e+18,1.220133e+18,2020-01-23,00:57:35,63411110.0,Follow @tokslabossmua on YouTube: thanks so mu...,0.0,0.0,0.0,False,0.086957,follow on youtube thanks so much dossier perfu...,tokslabossmua,,,1.0,0,2020-01-23 06:27:35
1,1.220077e+18,1.220077e+18,2020-01-22,21:12:47,3309983000.0,Enjoy,0.0,0.0,0.0,False,0.166667,enjoy,,,,1.0,0,2020-01-23 02:42:47
2,1.220059e+18,1.220059e+18,2020-01-22,19:59:54,1.032304e+18,#JustAskSachin Sir what is u r stands on CAA ...,0.0,0.0,0.0,False,0.212329,sir what is You stands on caa while ago You ha...,,justasksachin,,1.0,0,2020-01-23 01:29:54
3,1.220054e+18,1.220054e+18,2020-01-22,19:43:49,88164340.0,THINKING OF A SEA CHANGE ? HERE IS YOUR OPPORT...,0.0,0.0,0.0,False,0.385714,thinking of sea change here is your opportunity,,residentialplot land forsale beachroad visakha...,,1.0,0,2020-01-23 01:13:49
4,1.220051e+18,1.220011e+18,2020-01-22,19:30:00,1.160402e+18,@turagasudhakar @BeSriSri @prasana_kumar @saib...,0.0,0.0,0.0,False,0.054795,ok how deciding there in andhra three capital ...,turagasudhakar besrisri prasana_kumar saibolli...,,,1.0,0,2020-01-23 01:00:00


In [10]:
#going through each state in "Cleaned Data" folder. loading each file and the next day's file. 
#keep 1st day's tweets in 1st day's file and transfer the next day's tweets to the 2nd file

In [11]:
class arrange_dates:
    
    def __init__(self, _folder):
        self.folder = _folder
        self.file_list = [_file for _file in os.listdir(self.folder) if ".csv" in _file]
        self.fix_dates = self.transfer_data()
        
    def transfer_data(self):
        for _file in self.file_list:
            try:
                _date = _file.rpartition(".csv")[0].rpartition(" ")[2]
                with open(os.path.join(os.path.abspath(self.folder), _file), "rb") as f:
                    same_day_df = pickle.load(f)
                same_day_df.drop_duplicates(subset = ["id"], inplace = True)
                same_day_df.reset_index(drop = True, inplace = True)
                day_end_timestamp = pd.to_datetime(_date, format = "%Y-%m-%d") + pd.Timedelta("1 days")
                next_day = str(day_end_timestamp).rpartition(" ")[0]
                with open(os.path.join(os.path.abspath(self.folder), "cleaned {}.csv.pkl".format(next_day)), "rb") as f:
                    next_day_df = pickle.load(f)
                next_day_df.drop_duplicates(subset = ["id"], inplace = True)
                next_day_df.reset_index(drop = True, inplace = True)
                next_day_df = pd.concat([next_day_df, same_day_df[same_day_df["date_time"] >= day_end_timestamp]], axis = 0, ignore_index = True)
                next_day_df.drop_duplicates(subset = ["id"], inplace = True)
                next_day_df.reset_index(drop = True, inplace = True)
                same_day_df = same_day_df[same_day_df["date_time"] <= day_end_timestamp]
                same_day_df.drop_duplicates(subset = ["id"], inplace = True)
                same_day_df.reset_index(drop = True, inplace = True)
                with open(os.path.join(os.path.abspath(self.folder), _file), "wb") as f:
                    pickle.dump(same_day_df, f)
                with open(os.path.join(os.path.abspath(self.folder), "cleaned {}.csv.pkl".format(next_day)), "wb") as f:
                    pickle.dump(next_day_df, f)
            except:
                pass
                

In [16]:
#iterating through all the states in the "Cleaned Data" folder and applying the defined class to each folder

In [15]:
t = time()
state_list = os.listdir("Cleaned Data")
for state in state_list:
    t_1 = time()
    _path = os.path.join("Cleaned Data", state)
    arrange_dates(_path)
    print("{} complete".format(state.split()[-1]))
    print("time taken: {} mins".format(round((time() - t_1)/60, 2)))
print("time taken: {} mins".format(round((time() - t)/60, 2)))

Andhra_Pradesh complete
time taken: 0.09 mins
time taken: 0.09 mins


In [17]:
test = pickle.load(open("Cleaned Data\\cleaned Andhra_Pradesh\\cleaned 2020-01-22.csv.pkl", "rb"))

In [18]:
test.shape

(802, 18)

In [19]:
test.head()

Unnamed: 0,id,conversation_id,date,time,user_id,tweet,retweets_count,replies_count,likes_count,retweet,caps_share,cleaned_tweet,target,hashtag,emojis,polarity_new,muslim,date_time
0,1.220025e+18,1.220025e+18,2020-01-22,17:47:44,485463800.0,"Even after watching 5th time, felt emotional i...",0.0,0.0,2.0,False,0.116279,even after watching time felt emotional in pre...,anilravipudi anilsunkara1 urstrulymahesh,sarileruneekevvaru blockbustersarileruneekevva...,folded_hands thumbs_up,1.0,0,2020-01-22 23:17:44
1,1.220021e+18,1.219905e+18,2020-01-22,17:32:09,8.768553e+17,@DishPatani just amazing https://t.co/dq2t76yhRr,0.0,0.0,0.0,False,0.065217,just amazing,dishpatani,,,1.0,0,2020-01-22 23:02:09
2,1.220021e+18,1.220021e+18,2020-01-22,17:31:06,1.059703e+18,@sexeducation why this girl have little screen...,0.0,0.0,0.0,False,0.068376,why this girl have little screen time in both ...,sexeducation,sexeducations2,,1.0,0,2020-01-22 23:01:06
3,1.220021e+18,1.220021e+18,2020-01-22,17:29:31,101431200.0,@airindiain @HardeepSPuri Air India loses my l...,0.0,1.0,0.0,False,0.081301,air india loses my luggage and is clueless abo...,airindiain hardeepspuri,,,1.0,0,2020-01-22 22:59:31
4,1.22002e+18,1.22002e+18,2020-01-22,17:26:30,153823100.0,Lit the fire 🔥 to drive away the chill. @ Hari...,0.0,0.0,0.0,False,0.095238,lit the fire to drive away the chill haritha h...,,,fire,2.0,0,2020-01-22 22:56:30
