In [1]:
import csv
import logging
import multiprocessing
import os
import pickle
import re
from time import time

logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

import numpy as np
import pandas as pd
from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [2]:
#loading the trained classifier

In [None]:
# Load the trained classifier; the context manager closes the file handle
# as soon as unpickling finishes.
# NOTE: pickle can execute arbitrary code on load - only open trusted artifacts.
with open("classifier.pkl", "rb") as classifier_file:
    cls = pickle.load(classifier_file)

In [20]:
#loading the word2vec model

In [None]:
# Load the pickled word2vec model; the context manager closes the file handle
# as soon as unpickling finishes.
# NOTE: pickle can execute arbitrary code on load - only open trusted artifacts.
with open("wv_model.pkl", "rb") as wv_model_file:
    wv_model = pickle.load(wv_model_file)
# Keep a direct handle on the model's `.wv` attribute for word lookups.
wv = wv_model.wv

In [5]:
#loading the emoji vectorizer

In [None]:
# Load the fitted emoji vectorizer; the context manager closes the file handle
# as soon as unpickling finishes.
# NOTE: pickle can execute arbitrary code on load - only open trusted artifacts.
with open("emoji vectorizer.pkl", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

In [4]:
#predicting sentiment for all the scraped tweets

In [21]:
#defining a class that accepts a state's folder path, the name of a file inside it, the list of tweet ids
#to keep for that state, and the trained classifier.
#for each file it loads the pickled dataframe, keeps only the rows whose id is in the list, builds the emoji
#matrix, averages the word2vec vectors of the words in every tweet, predicts the sentiment of every tweet,
#stores it under the "polarity_new" column and writes the dataframe back to the same file

In [6]:
class sentiment_analyzer:
    """Score every tweet of one pickled file and write the result back to it.

    Instantiating the class runs the whole pipeline immediately:
    load -> emoji features -> word2vec features -> predict -> save.

    The class reads two notebook-level globals that must exist before it is
    instantiated: ``vectorizer`` (used by ``emoji_vectorize``) and ``wv``
    (used by ``vectorize_data``).

    Parameters
    ----------
    _folder : str
        Folder that contains the file; resolved to an absolute path.
    _file : str
        Name of the pickled DataFrame inside ``_folder``.
    ids_list : list-like
        Values of the ``"id"`` column to keep; all other rows are dropped.
    _classifier : object
        Object exposing ``predict(X)``; called once on the feature matrix.
    """

    # Width of a single word embedding. Hard-coded to 300 in the original
    # implementation; presumably the vector size of the loaded word2vec
    # model - TODO confirm against the model.
    EMBEDDING_DIM = 300

    def __init__(self, _folder, _file, ids_list, _classifier):
        self.folder = _folder
        self.file = _file
        self.filepath = os.path.join(os.path.abspath(self.folder), self.file)
        self.pipe = _classifier
        self.ids_list = ids_list
        # Each stage reads attributes set by the previous one, so the order
        # of the following assignments matters.
        self.df = self.load_df()
        self.emojis_array = self.emoji_vectorize()
        self.corpus = self.vectorize_data()
        self.predicted = self.predict_score()
        self.output = self.save_output()

    def load_df(self):
        """Unpickle the file and keep only rows whose ``id`` is in ``ids_list``.

        Returns a new DataFrame with a fresh 0..n-1 index. Rows with a missing
        ``id`` are dropped.
        """
        # NOTE: pickle can execute arbitrary code on load - only open trusted files.
        with open(self.filepath, "rb") as f:
            _df = pickle.load(f)
        # Boolean indexing instead of DataFrame.where(...).dropna(...):
        # `where` fills rejected rows with NaN, which upcasts integer columns
        # to float64 and silently rounds large integer ids.
        keep = _df["id"].notna() & _df["id"].isin(self.ids_list)
        return _df.loc[keep].reset_index(drop=True)

    def emoji_vectorize(self):
        """Transform the ``"emojis"`` column with the global ``vectorizer``.

        Returns the result of ``transform`` converted to a dense array.
        """
        _emojis = vectorizer.transform(self.df["emojis"].values)
        return _emojis.toarray()

    def vectorize_data(self):
        """Build the feature matrix that is passed to the classifier.

        Each row is the concatenation of:
        * the mean of the word vectors of the whitespace-separated words in
          ``"cleaned_tweet"`` (a word missing from ``wv`` counts as a zero
          vector, so it still lowers the mean),
        * the emoji features of that row (``self.emojis_array``),
        * the row's ``"caps_share"`` value.
        """
        known_words = wv.key_to_index
        tweets = self.df["cleaned_tweet"]
        # Pre-allocate once instead of re-stacking per word / per tweet,
        # which copied the growing arrays on every iteration.
        corpus_mat = np.zeros((len(tweets), self.EMBEDDING_DIM))
        for row, _sentence in enumerate(tweets):
            words = _sentence.split()
            if not words:
                # A tweet with no words used to give an all-NaN row (mean over
                # zero columns); keep the zero row, consistent with how
                # unknown words are treated.
                continue
            hits = [wv[_word] for _word in words if _word in known_words]
            if hits:
                # Divide by the total word count, not the hit count, because
                # unknown words contribute zero vectors to the mean.
                corpus_mat[row] = np.sum(hits, axis=0, dtype=np.float64) / len(words)
        return np.column_stack(
            [corpus_mat, self.emojis_array, self.df["caps_share"].values]
        )

    def predict_score(self):
        """Predict on ``self.corpus`` and store the labels in ``"polarity_new"``.

        Best effort: if prediction fails the error is logged and the frame is
        returned without the new column, so one bad file does not abort a
        batch run. Only ``Exception`` is caught, so KeyboardInterrupt and
        SystemExit still propagate.
        """
        try:
            _predict = self.pipe.predict(self.corpus)
            self.df["polarity_new"] = _predict
        except Exception:
            logging.exception(
                "prediction failed for %s; 'polarity_new' was not added",
                self.filepath,
            )
        return self.df

    def save_output(self):
        """Pickle ``self.predicted`` over the file it was loaded from.

        Overwrites the input file in place and returns ``None``.
        """
        with open(self.filepath, "wb") as f:
            pickle.dump(self.predicted, f)

In [None]:
# Run the analyzer over every state folder under "Cleaned Data".
# Each sentiment_analyzer(...) call loads, scores and overwrites one file.
t = time()
state_list = os.listdir("Cleaned Data")
for state in state_list:
    path = os.path.join("Cleaned Data", state)
    if not os.path.isdir(path):
        # Skip stray files (e.g. OS metadata) that would make os.listdir fail.
        continue
    t_1 = time()
    file_list = [file for file in os.listdir(path) if ".csv" in file]
    # The ids file is named after the last whitespace-separated token of the folder name.
    # NOTE: pickle can execute arbitrary code on load - only open trusted files.
    ids_path = os.path.join(path, "{} ids.pkl".format(state.split()[-1]))
    with open(ids_path, "rb") as ids_file:
        ids = pickle.load(ids_file)
    for file in file_list:
        sentiment_analyzer(path, file, ids, cls)
    print("{} completed".format(state.split()[-1]))
    print("time taken: {} mins".format(round((time() - t_1)/60, 2)))
print("time taken: {} mins".format(round((time() - t)/60, 2)))