In [3]:
import pandas as pd
import os
import re
import numpy as np
import csv
from time import time
import pickle
import logging
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)
from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO
from gensim.models import Word2Vec
import multiprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [2]:
#loading the trained classifier

In [5]:
# Load the trained classifier that the scoring loop passes to sentiment_analyzer.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("classifier.pkl", "rb") as classifier_file:  # closes the handle deterministically
    cls = pickle.load(classifier_file)

In [20]:
#loading the word2vec model

In [6]:
# Load the pickled word2vec model and keep a handle on its keyed vectors (`wv`).
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("wv_model.pkl", "rb") as wv_model_file:  # closes the handle deterministically
    wv_model = pickle.load(wv_model_file)
wv = wv_model.wv

In [5]:
#loading the emoji vectorizer

In [8]:
# Load the fitted emoji vectorizer used to turn each tweet's emojis into a count vector.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("emoji vectorizer.pkl", "rb") as vectorizer_file:  # closes the handle deterministically
    vectorizer = pickle.load(vectorizer_file)

In [4]:
#predicting sentiment for all the scraped tweets

In [21]:
#creating a class that accepts the state folder, the file name, the unique ids of the state, and the classifier
#each instance loads one file, creates the emoji matrix for the dataframe, uses the word2vec model to vectorize
#all the tweets in the df, then predicts and stores the sentiment of every tweet under the "polarity_new" column

In [9]:
class sentiment_analyzer:
    """Score the sentiment of every tweet in one pickled day file and re-save it.

    Constructing an instance runs the whole pipeline as a side effect:
    load the pickled dataframe, keep only rows whose ``id`` is in ``ids_list``,
    build the feature matrix (averaged word vectors + emoji counts +
    ``caps_share``), predict with the classifier, store the predictions in a
    ``polarity_new`` column and overwrite the input file with the result.

    Parameters
    ----------
    _folder : str
        Folder containing the file (made absolute internally).
    _file : str
        Name of the pickled dataframe inside ``_folder``.
    ids_list : iterable
        Tweet ids to keep; all other rows are dropped.
    _classifier : object
        Fitted estimator exposing ``predict(X)``.
    _vectorizer : object, optional
        Fitted emoji vectorizer exposing ``transform``. Defaults to the
        notebook-level ``vectorizer``.
    _wv : object, optional
        Word vectors exposing ``key_to_index``, ``vector_size`` and
        ``__getitem__``. Defaults to the notebook-level ``wv``.
    """

    def __init__(self, _folder, _file, ids_list, _classifier, _vectorizer=None, _wv=None):
        self.folder = _folder
        self.file = _file
        self.filepath = os.path.join(os.path.abspath(self.folder), self.file)
        self.pipe = _classifier
        self.ids_list = ids_list
        # Fall back to the notebook globals so existing four-argument calls keep working.
        self.vectorizer = vectorizer if _vectorizer is None else _vectorizer
        self.wv = wv if _wv is None else _wv
        self.df = self.load_df()
        self.emojis_array = self.emoji_vectorize()
        self.corpus = self.vectorize_data()
        self.predicted = self.predict_score()
        self.output = self.save_output()  # save_output returns None

    def load_df(self):
        """Load the pickled dataframe and keep only the rows whose id is in ``ids_list``."""
        # NOTE: pickle.load can execute arbitrary code - only load files you trust.
        with open(self.filepath, "rb") as f:
            _df = pickle.load(f)
        # Rows with an unwanted id are blanked to NaN by .where() and then dropped.
        # NOTE(review): .where() may upcast columns to hold NaN (e.g. ints -> floats);
        # kept as-is so re-saved files keep the dtypes they already have.
        _df = _df.where(_df["id"].isin(self.ids_list)).dropna(subset=["id"])
        return _df.reset_index(drop=True)

    def emoji_vectorize(self):
        """Return a dense (n_tweets, n_emoji_features) matrix of emoji counts.

        The original notes give the width as 162 columns for the trained vectorizer.
        """
        _emojis = self.vectorizer.transform(self.df["emojis"].values)
        return _emojis.toarray()

    def vectorize_data(self):
        """Build the feature matrix for the file.

        Each tweet is represented by the mean of its token vectors, where
        out-of-vocabulary tokens contribute a zero vector. A tweet with no
        tokens (empty or non-string) gets an all-zero vector instead of the
        NaN row produced by averaging an empty matrix. The emoji matrix and
        the ``caps_share`` column are appended as extra columns.
        """
        dim = self.wv.vector_size  # 300 for the model described in the original notes
        vocab = self.wv.key_to_index
        zero_vec = np.zeros(dim)

        tweet_vectors = []
        for _sentence in self.df["cleaned_tweet"]:
            tokens = _sentence.split() if isinstance(_sentence, str) else []
            if not tokens:
                tweet_vectors.append(zero_vec)
                continue
            token_vectors = [self.wv[_word] if _word in vocab else zero_vec for _word in tokens]
            # Average in float64, matching the precision of the previous stacking code.
            tweet_vectors.append(np.mean(np.asarray(token_vectors, dtype=np.float64), axis=0))

        # Stack once instead of re-copying the whole matrix for every tweet.
        corpus_mat = np.vstack(tweet_vectors) if tweet_vectors else np.empty((0, dim))
        return np.column_stack([corpus_mat, self.emojis_array, self.df["caps_share"].values])

    def predict_score(self):
        """Add the ``polarity_new`` column with the classifier's predictions.

        Prediction stays best-effort: on failure the error is logged and the
        dataframe is returned without the column, so one bad file does not
        abort a long batch run.
        """
        if len(self.corpus) == 0:
            logging.info("%s: no tweets to score, skipping prediction", self.filepath)
            return self.df
        try:
            self.df["polarity_new"] = self.pipe.predict(self.corpus)
        except Exception:  # not a bare except: KeyboardInterrupt must still stop the run
            logging.exception(
                "sentiment prediction failed for %s; saving without 'polarity_new'",
                self.filepath,
            )
        return self.df

    def save_output(self):
        """Overwrite the input file with the (possibly) scored dataframe."""
        with open(self.filepath, "wb") as f:
            pickle.dump(self.predicted, f)

In [None]:
#apply the class above to every file, in every state in the "Cleaned Data" folder

In [10]:
# Score every day file of every state folder under "Cleaned Data".
# NOTE: each sentiment_analyzer call overwrites its input file in place.
t = time()
state_list = os.listdir("Cleaned Data")
for state in state_list:
    path = os.path.join("Cleaned Data", state)
    if not os.path.isdir(path):
        continue  # stray files next to the state folders would crash os.listdir(path)
    t_1 = time()
    state_name = state.split()[-1]  # last whitespace-separated token of the folder name
    file_list = [file for file in os.listdir(path) if ".csv" in file]
    # NOTE: pickle.load can execute arbitrary code - only load files you trust.
    with open(os.path.join(path, "{} ids.pkl".format(state_name)), "rb") as ids_file:
        ids = pickle.load(ids_file)
    for file in file_list:
        sentiment_analyzer(path, file, ids, cls)
    print("{} completed".format(state_name))
    print("time taken: {} mins".format(round((time() - t_1)/60, 2)))
print("time taken: {} mins".format(round((time() - t)/60, 2)))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(


Andhra_Pradesh completed
time taken: 40.08 mins
time taken: 40.08 mins


In [12]:
# Spot-check one processed day file. The path is built with os.path.join so it
# resolves to the same file on Windows and also works on other platforms.
with open(os.path.join("Cleaned Data", "cleaned Andhra_Pradesh", "cleaned 2020-01-22.csv.pkl"), "rb") as test_file:
    test = pickle.load(test_file)

In [13]:
# (rows, columns) of the spot-checked day file
test.shape

(1195, 16)

In [14]:
# preview the first rows; "polarity_new" is the column written by sentiment_analyzer.predict_score
test.head()

Unnamed: 0,id,conversation_id,date,time,user_id,tweet,retweets_count,replies_count,likes_count,retweet,caps_share,cleaned_tweet,target,hashtag,emojis,polarity_new
0,1.220133e+18,1.220133e+18,2020-01-23,00:57:35,63411110.0,Follow @tokslabossmua on YouTube: thanks so mu...,0.0,0.0,0.0,False,0.086957,follow on youtube thanks so much dossier perfu...,tokslabossmua,,,1.0
1,1.220077e+18,1.220077e+18,2020-01-22,21:12:47,3309983000.0,Enjoy,0.0,0.0,0.0,False,0.166667,enjoy,,,,1.0
2,1.220059e+18,1.220059e+18,2020-01-22,19:59:54,1.032304e+18,#JustAskSachin Sir what is u r stands on CAA ...,0.0,0.0,0.0,False,0.212329,sir what is You stands on caa while ago You ha...,,justasksachin,,1.0
3,1.220054e+18,1.220054e+18,2020-01-22,19:43:49,88164340.0,THINKING OF A SEA CHANGE ? HERE IS YOUR OPPORT...,0.0,0.0,0.0,False,0.385714,thinking of sea change here is your opportunity,,residentialplot land forsale beachroad visakha...,,1.0
4,1.220051e+18,1.220011e+18,2020-01-22,19:30:00,1.160402e+18,@turagasudhakar @BeSriSri @prasana_kumar @saib...,0.0,0.0,0.0,False,0.054795,ok how deciding there in andhra three capital ...,turagasudhakar besrisri prasana_kumar saibolli...,,,1.0
