In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\debac\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\debac\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
df_modi = pd.read_csv("../datasets/ModiRelatedTweetsWithSentiment.csv")
df_rahul = pd.read_csv("../datasets/RahulRelatedTweetsWithSentiment.csv")
df_modi = df_modi.drop(df_modi.columns[0], axis=1)
df_rahul = df_rahul.drop(df_rahul.columns[0], axis=1)

print(df_modi.head())
print(df_rahul.head())

                        Date             User  \
0  2019-05-18 23:50:47+00:00  advosushildixit   
1  2019-05-18 23:00:03+00:00           jiaeur   
2  2019-05-18 22:53:54+00:00    PVenkatGandhi   
3  2019-05-18 22:20:48+00:00      TheNirbhay1   
4  2019-05-18 21:22:29+00:00      ShakeChilli   

                                               Tweet Emotion  
0  @anjanaomkashyap I am seeing you as future #bj...     neg  
1  #LokSabhaElections2019 \n23rd May 2019 will re...     neg  
2  #LokSabhaElections2019 \n23rd May 2019 will re...     neg  
3  PM Modi creates a new record of being the only...     pos  
4  @abhijitmajumder Appointment of Successor! \n\...     pos  
                        Date             User  \
0  2019-05-18 19:49:52+00:00     Sunnysweet16   
1  2019-05-18 18:56:52+00:00    drnitinchaube   
2  2019-05-18 18:54:01+00:00        mrvivek07   
3  2019-05-18 18:52:03+00:00    JosephPravinP   
4  2019-05-18 18:31:10+00:00  VandanaMegastar   

                                

In [4]:
def preprocess_text(text):
    # Check if text is a string
    if isinstance(text, str):
        # Convert text to lowercase
        text = text.lower()
        # Remove punctuation
        text = re.sub(r'[^\w\s]', '', text)
        # Remove URLs
        text = re.sub(r'https?://\S+|www\.\S+', '', text)
        # Remove emojis (if needed)
        text = text.encode('ascii', 'ignore').decode('ascii')
        # Tokenization and removal of stopwords
        stop_words = set(stopwords.words('english'))
        words = [word for word in text.split() if word not in stop_words]
        # Lemmatization
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]
        return " ".join(words)
    else:
        return ""  # Return empty string for non-string values

In [5]:
# Apply text preprocessing to each tweet
df_modi['Cleaned Tweet'] = df_modi['Tweet'].fillna('').apply(preprocess_text)
df_rahul['Cleaned Tweet'] = df_rahul['Tweet'].fillna('').apply(preprocess_text)

# Encode sentiment labels (positive: 1, negative: 0)
df_modi['Encoded Emotion'] = df_modi['Emotion'].map({'pos': 1, 'neg': 0})
df_rahul['Encoded Emotion'] = df_rahul['Emotion'].map({'pos': 1, 'neg': 0})

#print(df_modi.head())
#print(df_rahul.head())

In [6]:
df_modi['Source'] = 'Modi'
df_rahul['Source'] = 'Rahul'
df_combined = pd.concat([df_modi, df_rahul], ignore_index=True)
df_combined.dropna(inplace=True)
print(df_combined.head())

                        Date             User  \
0  2019-05-18 23:50:47+00:00  advosushildixit   
1  2019-05-18 23:00:03+00:00           jiaeur   
2  2019-05-18 22:53:54+00:00    PVenkatGandhi   
3  2019-05-18 22:20:48+00:00      TheNirbhay1   
4  2019-05-18 21:22:29+00:00      ShakeChilli   

                                               Tweet Emotion  \
0  @anjanaomkashyap I am seeing you as future #bj...     neg   
1  #LokSabhaElections2019 \n23rd May 2019 will re...     neg   
2  #LokSabhaElections2019 \n23rd May 2019 will re...     neg   
3  PM Modi creates a new record of being the only...     pos   
4  @abhijitmajumder Appointment of Successor! \n\...     pos   

                                       Cleaned Tweet  Encoded Emotion Source  
0  anjanaomkashyap seeing future bjp spokesperson...              0.0   Modi  
1  loksabhaelections2019 23rd may 2019 reveal eve...              0.0   Modi  
2  loksabhaelections2019 23rd may 2019 reveal eve...              0.0   Modi  
3  p

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df_combined['Cleaned Tweet'], df_combined['Encoded Emotion'], test_size=0.2, random_state=42)

print(X_train.head())
print(y_train.head())
print(X_test.head())
print(y_test.head())

38239    shashitharoor atleast rahulgandhi couldve forw...
17462    smart utilization communication channel touch ...
15440    campaign loksabhaelections2019 coming end toda...
30419    pun_starr irony many people confused rahulgand...
22459    visited 200 house last 2 day love n affection ...
Name: Cleaned Tweet, dtype: object
38239    0.0
17462    1.0
15440    1.0
30419    0.0
22459    1.0
Name: Encoded Emotion, dtype: float64
6524     modi thanos gathbandhan godamn avenger releasi...
30909    mkstalin addressal kanyakumari said rahul gand...
36466    already u lost seat madhubani loksabhaelection...
9801     one way modi saying anyone fight corruption ch...
25645     thank ani narendramodi ji interview narendramodi
Name: Cleaned Tweet, dtype: object
6524     1.0
30909    0.0
36466    0.0
9801     0.0
25645    1.0
Name: Encoded Emotion, dtype: float64


In [8]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(type(X_train_tfidf))
print(type(X_test_tfidf))
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)

<class 'scipy.sparse._csr.csr_matrix'>
<class 'scipy.sparse._csr.csr_matrix'>
(31953, 5000)
(7989, 5000)


In [9]:
from itertools import product
from math import prod
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Define a list of parameter combinations to test
C= [0.001, 0.01, 0.1, 1, 10, 100]
penalty= ['l1', 'l2']
max_iter= [100, 1000, 10000]

best_accuracy = 0
best_params = None

# Train multiple models with different parameter combinations
for C, penalty, max_iter in product(C, penalty, max_iter):   
    print("Training model with parameters:", C, penalty, max_iter)
    clf = LogisticRegression(C=C, penalty=penalty, max_iter=max_iter, solver='liblinear')
    clf.fit(X_train_tfidf, y_train)
    y_pred = clf.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    print("Parameters:", C, penalty, max_iter)
    print("Accuracy:", accuracy)
    
    # Update best parameters if the current model has higher accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = (C, penalty, max_iter)

print("Best parameters:", best_params)
print("Best accuracy:", best_accuracy)


Training model with parameters: 0.001 l1 100
Parameters: 0.001 l1 100
Accuracy: 0.5786706721742396
Training model with parameters: 0.001 l1 1000
Parameters: 0.001 l1 1000
Accuracy: 0.5786706721742396
Training model with parameters: 0.001 l1 10000
Parameters: 0.001 l1 10000
Accuracy: 0.5786706721742396
Training model with parameters: 0.001 l2 100
Parameters: 0.001 l2 100
Accuracy: 0.5786706721742396
Training model with parameters: 0.001 l2 1000
Parameters: 0.001 l2 1000
Accuracy: 0.5786706721742396
Training model with parameters: 0.001 l2 10000
Parameters: 0.001 l2 10000
Accuracy: 0.5786706721742396
Training model with parameters: 0.01 l1 100
Parameters: 0.01 l1 100
Accuracy: 0.5836775566403806
Training model with parameters: 0.01 l1 1000
Parameters: 0.01 l1 1000
Accuracy: 0.5836775566403806
Training model with parameters: 0.01 l1 10000
Parameters: 0.01 l1 10000
Accuracy: 0.5836775566403806
Training model with parameters: 0.01 l2 100
Parameters: 0.01 l2 100
Accuracy: 0.6178495431217924


In [10]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from itertools import product

# Define the hyperparameters to tune
C_values = [10]
kernel = ['rbf']

best_accuracy = 0
best_params = {}

# Iterate over each combination of hyperparameters
for C, kernel in product(C_values, kernel):
    print("Training model with parameters:", C, kernel)
    clf = SVC(C=C, kernel=kernel)
    clf.fit(X_train_tfidf, y_train)  # Train the model on the training data
    y_pred = clf.predict(X_test_tfidf)  # Predict the labels for the validation set
    accuracy = accuracy_score(y_test, y_pred)  # Calculate the accuracy on the validation set
    print("Parameters:", C, kernel)
    print("Accuracy:", accuracy)
    # Check if the current model has the highest accuracy so far
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = {'C': C, 'kernel': kernel}

# Print the best parameters and corresponding accuracy
print("Best Parameters:", best_params)
print("Best Accuracy:", best_accuracy)

Training model with parameters: 10 rbf
Parameters: 10 rbf
Accuracy: 0.7897108524220804
Best Parameters: {'C': 10, 'kernel': 'rbf'}
Best Accuracy: 0.7897108524220804


In [11]:
from sklearn.metrics import classification_report

# Assuming that `y_test` are your true labels and `model` is your trained model
y_pred = clf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.80      0.84      0.82      4623
         1.0       0.77      0.71      0.74      3366

    accuracy                           0.79      7989
   macro avg       0.79      0.78      0.78      7989
weighted avg       0.79      0.79      0.79      7989



In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from itertools import product

# Define the hyperparameters to tune
n_estimators = [50, 100, 200]
max_depth = [None, 10, 20, 30]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

best_accuracy = 0
best_params = {}

# Iterate over each combination of hyperparameters
for n_estimators, max_depth, min_samples_split, min_samples_leaf in product(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    print("Training model with parameters:", n_estimators, max_depth, min_samples_split, min_samples_leaf)
    clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
    clf.fit(X_train_tfidf, y_train)  # Train the model on the training data
    y_pred = clf.predict(X_test_tfidf)  # Predict the labels for the validation set
    accuracy = accuracy_score(y_test, y_pred)  # Calculate the accuracy on the validation set
    print("Parameters:", n_estimators, max_depth, min_samples_split, min_samples_leaf)
    print("Accuracy:", accuracy)
    
    # Check if the current model has the highest accuracy so far
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = {'n_estimators': n_estimators, 'max_depth': max_depth, 'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf}

# Print the best parameters and corresponding accuracy
print("Best Parameters:", best_params)
print("Best Accuracy:", best_accuracy)


Training model with parameters: 50 None 2 1
Parameters: 50 None 2 1
Accuracy: 0.7612967830767305
Training model with parameters: 50 None 2 2
Parameters: 50 None 2 2
Accuracy: 0.7470271623482289
Training model with parameters: 50 None 2 4
Parameters: 50 None 2 4
Accuracy: 0.7268744523720115
Training model with parameters: 50 None 5 1
Parameters: 50 None 5 1
Accuracy: 0.7605457504068094
Training model with parameters: 50 None 5 2
Parameters: 50 None 5 2
Accuracy: 0.7441482037801977
Training model with parameters: 50 None 5 4
Parameters: 50 None 5 4
Accuracy: 0.7306296157216172
Training model with parameters: 50 None 10 1
Parameters: 50 None 10 1
Accuracy: 0.7566654149455502
Training model with parameters: 50 None 10 2
Parameters: 50 None 10 2
Accuracy: 0.748028539241457
Training model with parameters: 50 None 10 4
Parameters: 50 None 10 4
Accuracy: 0.7326323695080736
Training model with parameters: 50 10 2 1
Parameters: 50 10 2 1
Accuracy: 0.6165978220052573
Training model with parameter

In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer

embeddings_index = {}
embedding_dim = 100  # Change this to match the embedding size you downloaded

with open('../datasets/glove.twitter.27B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


# Define maximum number of words in each tweet and maximum word index
MAX_WORDS = 280
MAX_WORD_INDEX = 50000

# Tokenize and pad sequences
#X_train_tfidf = X_train_tfidf.tolist()
#X_test_tfidf = X_test_tfidf.tolist()
tokenizer = Tokenizer(num_words=MAX_WORD_INDEX)
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_train_padded = pad_sequences(X_train_sequences, maxlen=MAX_WORDS)
X_test_padded = pad_sequences(X_test_sequences, maxlen=MAX_WORDS)

# Create an embedding matrix
embedding_matrix = np.zeros((MAX_WORD_INDEX, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < MAX_WORD_INDEX:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Words not found in the embedding index will be all zeros
            embedding_matrix[i] = embedding_vector

# Define the model
model = Sequential()
model.add(Embedding(input_dim=MAX_WORD_INDEX, output_dim=embedding_dim, weights=[embedding_matrix], input_length=MAX_WORDS, trainable=True))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model with different parameters
for num_epochs in [5]:
    for batch_size in [128]:
        print(f"Training with num_epochs={num_epochs} and batch_size={batch_size}")
        model.fit(X_train_padded, y_train, epochs=num_epochs, batch_size=batch_size, validation_data=(X_test_padded, y_test), verbose=0)
        _, accuracy = model.evaluate(X_test_padded, y_test, verbose=0)
        print(f"Accuracy: {accuracy}")
        
model.summary()



Training with num_epochs=5 and batch_size=128
Accuracy: 0.7775691747665405


In [14]:
from sklearn.metrics import classification_report
import numpy as np
y_pred = model.predict(X_test_padded)
y_pred = (y_pred > 0.5).astype(int)
y_pred = y_pred.flatten()
y_test = np.array(y_test)
print(classification_report(y_test, y_pred))


[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 90ms/step
              precision    recall  f1-score   support

         0.0       0.78      0.85      0.82      4623
         1.0       0.77      0.68      0.72      3366

    accuracy                           0.78      7989
   macro avg       0.78      0.76      0.77      7989
weighted avg       0.78      0.78      0.78      7989

