In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from wordcloud import WordCloud
import pickle
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import Counter

In [6]:
# Load the VADER model
sid = SentimentIntensityAnalyzer()

# Load the trained RNN model
rnn_model = load_model('../models/sentiment_model.h5')



In [7]:
nltk.download('stopwords')
nltk.download('wordnet')

# Text preprocessing function
def preprocess_text(text):
    # Check if text is a string
    if isinstance(text, str):
        # Convert text to lowercase
        text = text.lower()
        # Remove punctuation
        text = re.sub(r'[^\w\s]', '', text)
        # Remove URLs
        text = re.sub(r'https?://\S+|www\.\S+', '', text)
        # Remove emojis (if needed)
        text = text.encode('ascii', 'ignore').decode('ascii')
        # Tokenization and removal of stopwords
        stop_words = set(stopwords.words('english'))
        words = [word for word in text.split() if word not in stop_words]
        # Lemmatization
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]
        return " ".join(words)
    else:
        return ""  # Return empty string for non-string values

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\debac\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\debac\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# Load the GloVe word embeddings

embeddings_index = {}
embedding_dim = 100

with open('../datasets/glove.twitter.27B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 1193515 word vectors.


In [9]:
# Tokenize the text data

MAX_WORDS = 280
MAX_WORD_INDEX = 50000
embedding_dim = 100

def tokenize_text(text):

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text)

    tokenized_text = tokenizer.texts_to_sequences(text)
    padded_text = pad_sequences(tokenized_text, maxlen=MAX_WORDS)

    embedding_matrix = np.zeros((MAX_WORD_INDEX, embedding_dim))
    for word, i in tokenizer.word_index.items():
        if i < MAX_WORD_INDEX:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                # Words not found in the embedding index will be all zeros
                embedding_matrix[i] = embedding_vector

    return padded_text

In [10]:
# Function to analyze sentiment using VADER
def analyze_sentiment_vader(text):
    ss = sid.polarity_scores(text)
    print(ss['compound'])
    return ss['compound']

    # if ss['compound'] >= 0.05:
    #     return 1
    # elif ss['compound'] <= -0.05:
    #     return -1
    # else:
    #     return 0

In [11]:
# Function to analyze sentiment using RNN
def analyze_sentiment_rnn(padded_text):
    # Predict the sentiment
    pred = rnn_model.predict(padded_text)
    normalized_predictions = (pred - 0.5) * 2
    return normalized_predictions

    # if pred >= 0.5:
    #     return 1
    # else:
    #     return -1

In [12]:
def combine_predictions(text):
    rnn_weight = 0.5
    vader_weight = 1-rnn_weight
    text = preprocess_text(text)
    padded_text = tokenize_text([text])
    prediction_rnn = analyze_sentiment_rnn(padded_text)
    prediction_vader = analyze_sentiment_vader(text)
    prediction = (rnn_weight * prediction_rnn) + (vader_weight * prediction_vader)
    return prediction

In [15]:
QUERY = 'bjp'
df = pd.read_csv('../datasets/IndianElection19TwitterData.csv')

df.drop(['Unnamed: 0'], axis=1, inplace=True)
df = df.sample(frac=1)
df_filtered = df[df['Tweet'].str.contains(QUERY)]
df_filtered.reset_index(drop=True, inplace=True)
df_filtered.head()

Unnamed: 0,Date,User,Tweet
0,2019-04-14 10:27:32+00:00,krishind07,@sagarikaghose @BJP4India @sanjaycsds If every...
1,2019-03-07 11:23:56+00:00,Chintandjoshi,When your team gets appreciated by member of p...
2,2019-03-22 13:56:28+00:00,akj_1981,@GautamGambhir We are very Happy on your decis...
3,2019-03-28 14:41:53+00:00,leenasriv,@abhisar_sharma #bjp seeks votes n #congress b...
4,2019-04-01 14:55:05+00:00,SinhaVikaz1,@Patriot18074824 @spendurti @muglikar_ @RahulG...


In [24]:
pos_count = 0
neg_count = 0
neu_count = 0
for i in range(min(1000, len(df_filtered))):
    text = df_filtered['Tweet'][i]
    
    prediction = combine_predictions(text)
    if prediction > 0:
        pos_count = pos_count + 1
    elif prediction < 0:
        neg_count = neg_count + 1
    else:
        neu_count = neu_count + 1

print('Positive tweets:', pos_count)
print('Negative tweets:', neg_count)
print('Neutral tweets:', neu_count)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step
0.1779
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
0.8519
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
0.7096
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
0.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
0.9081
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
0.5719
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
0.3612
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
-0.3818
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
0.2584
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
-0.7003
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
0.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
0.7269
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [26]:

usernames = list(df_filtered['User'][:i])
username_counts = Counter(usernames)
top_usernames = username_counts.most_common(10)
usernames, counts = zip(*top_usernames) if top_usernames else ([], [])
usernames, counts

(('GomathiRaghava4',
  'JKV71',
  'Scorpion1007',
  'bengapinto',
  'bharathbunny27',
  'pramodlunia',
  'RamUK_R',
  'shotsbyvishal',
  'drnitinchaube',
  'mannan_pathan'),
 (50, 20, 20, 9, 5, 5, 5, 4, 4, 4))