In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the stock-sentiment CSV into the working DataFrame.
data = pd.read_csv(filepath_or_buffer='Sentiment_Stock_data (1).csv')

In [None]:
# Preview the first five rows.
data.head(n=5)

In [None]:
# Print the frame's column summary (dtypes and non-null counts).
data.info()

In [None]:
# Remove, in place, every row whose 'Sentence' is exactly the empty string.
data.drop(index=data.index[data['Sentence'].eq('').to_numpy()], inplace=True)

In [None]:
# (rows, columns) of the frame at this point in the notebook.
data.shape

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Drop, in place, every row that has a missing value in any column.
data.dropna(axis='index', how='any', inplace=True)

In [None]:
# Count rows that repeat an earlier row across all columns.
data.duplicated(keep='first').sum()

In [None]:
# Drop the 'Unnamed: 0' column. errors='ignore' keeps the cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
# (`axis=1` was redundant next to `columns=` and has been removed.)
data.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

In [None]:
#remove punctuations
import string
def remove_punctuations(text):
    """Return ``text`` with every character listed in ``string.punctuation`` removed.

    All other characters (letters, digits, whitespace, non-ASCII symbols) are
    kept in their original order.
    """
    return ''.join(char for char in text if char not in string.punctuation)
# Strip punctuation from every sentence.
data['Sentence'] = data['Sentence'].map(remove_punctuations)

In [None]:
# Normalise case: lowercase every sentence.
data['Sentence'] = data['Sentence'].map(str.lower)

In [None]:
#remove emojis
def remove_emojis(text):
    """Return ``text`` with every non-ASCII character removed.

    Despite the name, this drops *all* non-ASCII characters, not only emojis:
    accented letters and symbols such as currency signs outside ASCII are
    removed as well.
    """
    # Encoding with errors='ignore' silently discards anything outside ASCII.
    return text.encode('ascii', 'ignore').decode('ascii')
# Strip non-ASCII characters (emojis included) from every sentence.
data['Sentence'] = data['Sentence'].map(remove_emojis)

In [None]:
#remove stopwords
# import nltk
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
# stopwords=set(stopwords.words('english'))
# def tokenize(text):
#     words=text.split()
#     cleaned=[]
#     for i in words:
#         if i not in stopwords:
#             cleaned.append(i)
#     return ' '.join(cleaned)
# data['Sentence']=data['Sentence'].apply(tokenize)
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK stopword corpus (only needs to succeed once per environment).
nltk.download('stopwords')

# English stopwords, held in a set for constant-time membership tests.
stop_words = set(stopwords.words('english'))

def tokenize(text):
    """Remove English stopwords from ``text``.

    The text is split on whitespace, each token is compared case-insensitively
    against ``stop_words``, and the surviving tokens are re-joined with single
    spaces (original casing preserved).

    NOTE(review): punctuation is stripped in an earlier cell, so stopwords that
    contain an apostrophe presumably no longer match here - confirm whether
    that matters for this dataset.
    """
    kept = [token for token in text.split() if token.lower() not in stop_words]
    return " ".join(kept)

data['Sentence'] = data['Sentence'].map(tokenize)


In [None]:
# 'wordnet' backs WordNetLemmatizer. 'punkt' is downloaded too, although
# lemmatize() below splits on whitespace and does not use a punkt tokenizer.
nltk.download('punkt')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    """Lemmatize each whitespace-separated word of ``text``.

    Every word goes through ``lemmatizer.lemmatize`` with its default
    arguments; the results are re-joined with single spaces.
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in text.split())

data['Sentence'] = data['Sentence'].map(lemmatize)

print(data.head())

In [None]:
# Number of whitespace-delimited tokens in each cleaned sentence.
data['no of words'] = data['Sentence'].str.split().str.len()

In [None]:
#now convert text into vectors
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer


In [None]:
# Class balance: number of rows per Sentiment label.
sns.countplot(data=data, x='Sentiment')


In [None]:
# Distribution of sentence lengths (in words) with a KDE overlay.
sns.histplot(data=data, x='no of words', kde=True)


In [None]:
# Annotated correlation matrix of the numeric columns.
sns.heatmap(data=data.corr(numeric_only=True), annot=True)

In [None]:
# Only the last expression of a cell is rendered, so the count was previously
# computed and silently discarded. Print it explicitly, then show the preview.
short_sentence_count = int((data['no of words'] < 20).sum())
print(f"Sentences with fewer than 20 words: {short_sentence_count}")
data.head(20)

In [None]:
# Hold out 25% of the cleaned sentences as the test set (seeded, shuffled split).
X_train, X_test, y_train, y_test = train_test_split(
    data['Sentence'],
    data['Sentiment'],
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

In [None]:
# Plain bag-of-words counts with the vectorizer's default settings.
bow = CountVectorizer()

# TF-IDF over unigrams, bigrams and trigrams with a pruned vocabulary.
tfidf = TfidfVectorizer(
    ngram_range=(1, 3),     # n-grams of length 1 through 3 (not just bigrams)
    min_df=3,               # ignore terms seen in fewer than 3 documents
    max_df=0.9,             # ignore terms seen in more than 90% of documents
    max_features=60000,     # cap the vocabulary size
)

In [None]:
# Learn each vocabulary on the training sentences only, then reuse the fitted
# vectorizer for the test sentences (tuple elements evaluate left to right).
X_train_bow, X_test_bow = bow.fit_transform(X_train), bow.transform(X_test)

X_train_tfidf, X_test_tfidf = tfidf.fit_transform(X_train), tfidf.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [None]:
# Classical baselines trained on the bag-of-words features.
models_bow = dict(
    logistic_regression=LogisticRegression(max_iter=2000),
    naive_bayes=MultinomialNB(),
    SVM_bow=LinearSVC(),
)

# The same three baselines, as separate instances, for the TF-IDF features.
models_tfidf = dict(
    logistic_regression=LogisticRegression(max_iter=2000),
    naive_bayes=MultinomialNB(),
    SVM_tfidf=LinearSVC(),
)


In [None]:
# Test-set predictions, keyed by model name, for each feature representation.
y_predict_bow = {}
y_predict_tfidf = {}

# Fit every bag-of-words model, then predict on the held-out test features.
for name, clf in models_bow.items():
    clf.fit(X_train_bow, y_train)
    y_predict_bow[name] = clf.predict(X_test_bow)

# Same procedure for the TF-IDF models.
for name, clf in models_tfidf.items():
    clf.fit(X_train_tfidf, y_train)
    y_predict_tfidf[name] = clf.predict(X_test_tfidf)


In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Report test accuracy (as a percentage) per model, TF-IDF first, then bag of words.
for header, predictions in (
    ("\n---- TF-IDF Accuracies ----", y_predict_tfidf),
    ("\n\n---- Bag of Words Accuracies ----", y_predict_bow),
):
    print(header)
    for model_name, y_pred in predictions.items():
        print(f"{model_name}: {accuracy_score(y_test, y_pred)*100:.2f}%")



In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Input
from tensorflow.keras.utils import to_categorical


In [None]:
# Feed-forward network on the sparse bag-of-words features.
# The single sigmoid output with binary cross-entropy is a binary-classification head.
ANN = Sequential()
ANN.add(Input(shape=(X_train_bow.shape[1],), sparse=True))
ANN.add(Dense(64, activation='relu'))
ANN.add(Dropout(0.2))
ANN.add(Dense(32, activation='relu'))
ANN.add(Dropout(0.1))
ANN.add(Dense(1, activation='sigmoid'))

ANN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# `validation_split` slices the inputs by position and is only supported for
# NumPy arrays / tensors; X_train_bow is a SciPy sparse matrix, which tf.keras
# 2.x rejects with a ValueError. Carve out the 25% validation set explicitly
# and pass it through `validation_data` instead.
X_fit_bow, X_val_bow, y_fit_bow, y_val_bow = train_test_split(
    X_train_bow, y_train, test_size=0.25, random_state=42
)

history = ANN.fit(
    X_fit_bow, y_fit_bow,
    epochs=40,
    verbose=1,
    validation_data=(X_val_bow, y_val_bow),
    batch_size=122,
)

In [None]:
# ANN_tfidf = Sequential([
#     Input(shape=(X_train_tfidf.shape[1],), sparse=True),
#     Dense(128, activation='relu'),
#     Dropout(0.3),
#     Dense(64, activation='relu'),
#     Dropout(0.6),
#     Dense(32, activation='relu'),
#     Dropout(0.4),
#     Dense(16, activation='relu'),
#     Dropout(0.5),
#     Dense(1, activation='sigmoid'),
# ])
# ANN_tfidf = Sequential([
#     Input(shape=(X_train_tfidf.shape[1],), sparse=True),
#     Dense(64, activation='relu'),
#     Dropout(0.3),
#     Dense(1, activation='sigmoid')
# ])
from tensorflow.keras import regularizers
# Single-hidden-layer network on the sparse TF-IDF features, with L2 weight
# decay on the hidden layer and dropout before the sigmoid output.
ANN_tfidf = Sequential()
ANN_tfidf.add(Input(shape=(X_train_tfidf.shape[1],), sparse=True))
ANN_tfidf.add(Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
ANN_tfidf.add(Dropout(0.3))
ANN_tfidf.add(Dense(1, activation='sigmoid'))

ANN_tfidf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once validation loss has not improved for 2 consecutive epochs and roll
# back to the best weights seen.
es = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)

# Early stopping selects the model, so it must not look at the test set:
# validating on (X_test_tfidf, y_test) leaks test information into training
# and inflates the reported test accuracy. Validate on a hold-out carved from
# the training rows instead; the test set stays untouched for final evaluation.
X_fit_tfidf, X_val_tfidf, y_fit_tfidf, y_val_tfidf = train_test_split(
    X_train_tfidf, y_train, test_size=0.25, random_state=42
)

history_tfidf = ANN_tfidf.fit(
    X_fit_tfidf, y_fit_tfidf,
    epochs=20,
    verbose=1,
    validation_data=(X_val_tfidf, y_val_tfidf),
    batch_size=128,
    callbacks=[es],
)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# ---------------------------
# 1. Tokenize and convert to sequences
# ---------------------------

max_words = 20000       # Vocabulary size
max_len = 100           # Maximum length of each sequence

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(data['Sentence'])

X = tokenizer.texts_to_sequences(data['Sentence'])
X = pad_sequences(X, maxlen=max_len)

# Label column.
# NOTE(review): the sigmoid / binary_crossentropy head below assumes the
# labels are 0/1 - confirm against the dataset.
y = data['Sentiment']

# Train-test split.
# random_state is fixed so the split (and every metric computed on it) is
# reproducible across kernel restarts; previously it changed on every run.
# NOTE: X_train / X_test / y_train / y_test are reassigned here and therefore
# replace the text-based split created earlier in the notebook. The *_seq
# aliases give later cells an unambiguous handle on the sequence split.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_seq, X_test_seq, y_train_seq, y_test_seq = X_train, X_test, y_train, y_test

# ---------------------------
# 2. Build RNN model
# ---------------------------

model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    LSTM(64, return_sequences=False),
    Dropout(0.4),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # sigmoid for binary classification
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

# ---------------------------
# 3. Train
# ---------------------------

# `history` is reassigned here too (it previously held the bag-of-words ANN
# run); `history_lstm` is an additional, unambiguous name for this run.
history = model.fit(
    X_train, y_train,
    validation_split=0.25,
    batch_size=32,
    epochs=10,
    verbose=1
)
history_lstm = history


In [None]:
# Score the LSTM on the held-out sequences without the progress bar.
loss, accuracy = model.evaluate(x=X_test, y=y_test, verbose=0)
print("Test Accuracy:", accuracy)


In [None]:
pip install transformers torch datasets accelerate


# Task
Improve the performance of the existing sentiment analysis models (Logistic Regression, Naive Bayes, SVM, ANN, LSTM) on the "Sentiment_Stock_data.csv" dataset by analyzing their current performance with detailed metrics, enhancing text preprocessing, implementing pre-trained word embeddings, performing hyperparameter tuning, addressing class imbalance, exploring advanced neural network architectures, using ensemble methods, and employing cross-validation for robust evaluation.

## Analyze Current Model Performance

### Subtask:
Evaluate the current models (Logistic Regression, Naive Bayes, SVM, ANN, LSTM) comprehensively using metrics like precision, recall, F1-score, and confusion matrices for each class.


**Reasoning**:
I need to evaluate each of the trained models (Logistic Regression, Naive Bayes, SVM, ANN, ANN_tfidf, and LSTM) using classification reports and confusion matrices. This involves iterating through the dictionary of models, making predictions, converting probabilities to binary labels for neural networks, and then printing the required metrics.



**Reasoning**:
The previous code failed because the `y_test` variable was overwritten by the LSTM's train-test split, leading to inconsistent sample sizes when evaluating models from the earlier (traditional) split. To fix this, I will re-create the `y_test` specifically for the traditional models (BoW/TF-IDF and their ANNs) using the same parameters as the original split, and use the existing global `y_test` for the LSTM model, ensuring consistent sample sizes for evaluation.

