In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('/content/Preprocessed_data.csv', index_col=0)
df.tail(30)

Unnamed: 0,text,original_text,listed_emotions,emotion_count,labels,encoded_labels
206916,“ unspeakable scientifical horror ” lol people,You had me at “unspeakable scientifical horror...,neutral,1,neutral,1
206919,“ use work remember policy ” nail,“I use to work here and I remember our policy ...,neutral,1,neutral,1
206922,“ verge exploding pant ” “ hehe youre cute ”,“I am on the verge of exploding in my pants” “...,neutral,1,neutral,1
206927,“ want denver ” hooray hire best guy instead,So he “wants to be in Denver”. Hooray. Can we ...,admiration,1,positive,2
206932,“ want kid ” given ovary lol,“But [NAME] wants you to have kids!” [NAME] sh...,amusement,1,positive,2
206935,“ want stay night late mom would worry since d...,"“I want to stay the night, but it’s late and m...",neutral,1,neutral,1
206938,“ weird thing shes got tattoo company back ” l...,“The weird thing is she’s got a tattoo of our ...,neutral,1,neutral,1
206943,“ welcome show here lunch ” probably,“Welcome to the show. Here’s your lunch” - [NA...,excitement,1,positive,2
206948,“ whine ” emotive term use would ironic youve ...,“Whine” is such an *emotive* term to use. It w...,amusement,1,positive,2
206953,“ wish way know good old day actually left ”,“I wish there was a way to know you're in the ...,desire,1,negative,0


In [3]:
df.isna().sum()

text               0
original_text      0
listed_emotions    0
emotion_count      0
labels             0
encoded_labels     0
dtype: int64

It appears that the original dataframe had texts that were entirely made of stopwords, so after cleaning, the rows with such texts were left with missing values, therefore we proceed with dropping the missing values.

In [4]:
df.duplicated().sum()

0

In [5]:
final_df = df[['text', 'encoded_labels']]
final_df.head()

Unnamed: 0,text,encoded_labels
0,aa b whole meritocracy gtfo,1
3,aaaaaaaaaaaaaahhh imagine,1
8,aaaaaaaaaaaaand boop,0
11,aaaaaand feeling morning,1
14,aaaaaand soccer hopefully,1


In [6]:
import pandas as pd

# Load AFINN lexicon into a Python dictionary
def load_afinn_lexicon(file_path):
    afinn = {}
    with open(file_path, 'r') as file:
        for line in file:
            term, score = line.strip().split('\t')
            afinn[term] = int(score)
    return afinn

# Function to calculate the sentiment score of a sentence using the AFINN lexicon
def calculate_sentiment_score(sentence, afinn_lexicon):
    words = sentence.split()
    sentiment_score = sum(afinn_lexicon.get(word, 0) for word in words)
    return sentiment_score

# Load the AFINN lexicon
afinn_file_path = '/content/AFINN-en-165.txt'
afinn_lexicon = load_afinn_lexicon(afinn_file_path)

# Create a new column 'sentiment_score' to store the sentiment score for each text
final_df['sentiment_score'] = final_df['text'].apply(lambda x: calculate_sentiment_score(x, afinn_lexicon))

# Create a new column 'sentiment_label' based on the sentiment score
final_df.loc[:, 'sentiment_label'] = final_df['sentiment_score'].apply(lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutral')

# Display the DataFrame with the newly added features
print(final_df.head())




                           text  encoded_labels  sentiment_score  \
0   aa b whole meritocracy gtfo               1                0   
3     aaaaaaaaaaaaaahhh imagine               1                0   
8          aaaaaaaaaaaaand boop               0                0   
11     aaaaaand feeling morning               1                1   
14    aaaaaand soccer hopefully               1                2   

   sentiment_label  
0          neutral  
3          neutral  
8          neutral  
11        positive  
14        positive  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['sentiment_score'] = final_df['text'].apply(lambda x: calculate_sentiment_score(x, afinn_lexicon))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.loc[:, 'sentiment_label'] = final_df['sentiment_score'].apply(lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutral')


In [7]:
final_df['sentiment_label'].value_counts()

positive    24886
negative    16406
neutral     14576
Name: sentiment_label, dtype: int64

In [10]:
# Addressing class imbalance using SMOTE
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
import scipy.sparse as sp
import pickle
# Preprocess the text (if needed) and split the data into training and testing sets
X_text = final_df['text']
X_sentiment = final_df['sentiment_score']  # Use the sentiment scores as additional feature
y = final_df['sentiment_label']

# Split the data into 70% training and 30% testing
X_text_train, X_text_test, X_sentiment_train, X_sentiment_test, y_train, y_test = train_test_split(
    X_text, X_sentiment, y, test_size=0.3, random_state=42
)

# Vectorize the text data using TF-IDF (Term Frequency-Inverse Document Frequency) vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1001)  # You can adjust the number of features as needed
X_text_train_tfidf = tfidf_vectorizer.fit_transform(X_text_train)
X_text_test_tfidf = tfidf_vectorizer.transform(X_text_test)


# Save the trained TF-IDF vectorizer using joblib.dump()
tfidf_vectorizer_filename = 'tf-idf_vectorizer'
pickle.dump(tfidf_vectorizer, open(tfidf_vectorizer_filename, ('wb')))

# Combine TF-IDF features with sentiment scores as additional features
X_train = sp.hstack([X_text_train_tfidf, X_sentiment_train.values.reshape(-1, 1)], format='csr')
X_test = sp.hstack([X_text_test_tfidf, X_sentiment_test.values.reshape(-1, 1)], format='csr')

# Apply SMOTE to handle class imbalances
sm = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

# Train an SGD classifier (instead of SVM) on the resampled data
sgd_classifier = SGDClassifier(loss='hinge', random_state=42)  # Use hinge loss for linear SVM
sgd_classifier.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = sgd_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

classification_report_result = classification_report(y_test, y_pred)
print("Classification Report:\n", classification_report_result)


Accuracy: 0.96
Precision: 0.97
Recall: 0.96
F1-Score: 0.96
Classification Report:
               precision    recall  f1-score   support

    negative       0.90      1.00      0.95      4894
     neutral       0.99      0.88      0.93      4403
    positive       1.00      0.99      1.00      7464

    accuracy                           0.96     16761
   macro avg       0.96      0.96      0.96     16761
weighted avg       0.97      0.96      0.96     16761



In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Make predictions on the train set
y_train_pred = sgd_classifier.predict(X_train_resampled)

# Evaluate the model on the train set
train_accuracy = accuracy_score(y_train_resampled, y_train_pred)
train_precision = precision_score(y_train_resampled, y_train_pred, average='weighted')
train_recall = recall_score(y_train_resampled, y_train_pred, average='weighted')
train_f1 = f1_score(y_train_resampled, y_train_pred, average='weighted')

print("Train Set Evaluation:")
print(f"Accuracy: {train_accuracy:.2f}")
print(f"Precision: {train_precision:.2f}")
print(f"Recall: {train_recall:.2f}")
print(f"F1-Score: {train_f1:.2f}")

classification_report_train = classification_report(y_train_resampled, y_train_pred)
print("Classification Report (Train Set):\n", classification_report_train)


Train Set Evaluation:
Accuracy: 0.97
Precision: 0.97
Recall: 0.97
F1-Score: 0.97
Classification Report (Train Set):
               precision    recall  f1-score   support

    negative       0.92      1.00      0.96     17422
     neutral       0.99      0.91      0.95     17422
    positive       1.00      0.99      1.00     17422

    accuracy                           0.97     52266
   macro avg       0.97      0.97      0.97     52266
weighted avg       0.97      0.97      0.97     52266



In [12]:
import pickle
filename = 'SVM_model'
pickle.dump(sgd_classifier, open(filename, 'wb'))