In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, classification_report



In [3]:
# Fetch the NLTK stop-word corpus so that `stopwords.words` can read it.
nltk.download('stopwords')

# English stop words, held in a set for fast membership checks; they are
# filtered out of every text in the data set during preprocessing.
stop_words = {*stopwords.words('english')}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Directory holding the three ';'-separated split files (train/val/test).
# Kept in one place so the location only has to be changed once.
DATA_DIR = 'C:/Users/USER/Desktop/personal/learning/sentiment-analysis/api/ai/archive (1)'


def load_split(split, data_dir=DATA_DIR):
    """Read one split file into a DataFrame with "Text" and "Emotion" columns.

    Parameters
    ----------
    split : str
        File stem of the split, e.g. "train" for "<data_dir>/train.txt".
    data_dir : str
        Directory that contains the split files.

    Returns
    -------
    pandas.DataFrame
        The parsed file. The file is read without a header row, so the two
        column names are supplied here.
    """
    file_path = f"{data_dir}/{split}.txt"
    return pd.read_csv(file_path, sep=";", header=None, names=["Text", "Emotion"])


# csv to df: one frame per split
train_df = load_split("train")
val_df = load_split("val")
test_df = load_split("test")


In [5]:
from collections import Counter

# Tally how often each emotion label occurs in every split
# (order: validation, train, test — the same order they are printed in).
val_counts, train_counts, test_counts = (
    Counter(frame['Emotion']) for frame in (val_df, train_df, test_df)
)
print(val_counts, train_counts, test_counts)



Counter({'joy': 704, 'sadness': 550, 'anger': 275, 'fear': 212, 'love': 178, 'surprise': 81}) Counter({'joy': 5362, 'sadness': 4666, 'anger': 2159, 'fear': 1937, 'love': 1304, 'surprise': 572}) Counter({'joy': 695, 'sadness': 581, 'anger': 275, 'fear': 224, 'love': 159, 'surprise': 66})


In [7]:
# Drop the "surprise" and "love" rows from every split; these are the two
# least frequent labels in the counts printed above.
val_df, train_df, test_df = (
    frame.query('Emotion != "surprise" and Emotion != "love"')
    for frame in (val_df, train_df, test_df)
)

In [8]:
def preprocess_text(text):
    """Normalise a piece of text for the bag-of-words model.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and split on whitespace, and tokens found in the
    module-level ``stop_words`` set are discarded.

    Returns the surviving tokens joined by single spaces.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    tokens = letters_only.lower().split()
    return ' '.join(token for token in tokens if token not in stop_words)

In [26]:
# NOTE(review): this cell's execution count (26) is later than that of the
# preprocessing cell below (9), and its recorded output is identical to the
# post-preprocessing print further down. It therefore appears to show already
# cleaned text; on a fresh top-to-bottom run it would print the frame as it is
# before preprocessing.
print(train_df)

                                                    Text  Emotion
0                                  didnt feel humiliated  sadness
1      go feeling hopeless damned hopeful around some...  sadness
2              im grabbing minute post feel greedy wrong    anger
4                                        feeling grouchy    anger
5          ive feeling little burdened lately wasnt sure  sadness
...                                                  ...      ...
15995      brief time beanbag said anna feel like beaten  sadness
15996  turning feel pathetic still waiting tables sub...  sadness
15997                           feel strong good overall      joy
15998                     feel like rude comment im glad    anger
15999                       know lot feel stupid portray  sadness

[14124 rows x 2 columns]


In [9]:
# Clean the text column of every split with preprocess_text.
# `assign` returns a new frame instead of writing into the existing one in
# place; the frames come from a row filter, and in-place writes on such frames
# can trigger pandas' SettingWithCopyWarning.
train_df = train_df.assign(Text=train_df['Text'].apply(preprocess_text))
val_df = val_df.assign(Text=val_df['Text'].apply(preprocess_text))
test_df = test_df.assign(Text=test_df['Text'].apply(preprocess_text))

In [27]:
# Show the training frame after preprocessing. pandas truncates the printout
# to the first and last rows, as seen in the recorded output.
print(train_df)

                                                    Text  Emotion
0                                  didnt feel humiliated  sadness
1      go feeling hopeless damned hopeful around some...  sadness
2              im grabbing minute post feel greedy wrong    anger
4                                        feeling grouchy    anger
5          ive feeling little burdened lately wasnt sure  sadness
...                                                  ...      ...
15995      brief time beanbag said anna feel like beaten  sadness
15996  turning feel pathetic still waiting tables sub...  sadness
15997                           feel strong good overall      joy
15998                     feel like rude comment im glad    anger
15999                       know lot feel stupid portray  sadness

[14124 rows x 2 columns]


In [10]:
# Bag-of-words features with the vocabulary capped at 3000 entries.
vectorizer = CountVectorizer(max_features=3000)

# The two base classifiers; both are seeded so that runs are repeatable.
svm = SVC(
    C=0.5,
    kernel="linear",
    random_state=42,
)
logistic = LogisticRegression(
    max_iter=1000,
    random_state=42,
)

In [11]:
# Each pipeline turns raw text into token counts and feeds them to one classifier.
svm_pipeline = Pipeline(
    steps=[
        ("vectorizer", vectorizer),
        ("svm", svm),
    ]
)
logistic_pipeline = Pipeline(
    steps=[
        ("vectorizer", vectorizer),
        ("logistic", logistic),
    ]
)

In [12]:
# Ensemble of the two text pipelines; with voting='hard' the ensemble combines
# the class labels predicted by its members rather than their probabilities.
voting_classifier = VotingClassifier(
    voting='hard',
    estimators=[("svm", svm_pipeline), ("logistic", logistic_pipeline)],
)

In [13]:
# Train the ensemble on the preprocessed training texts and their emotion labels.
voting_classifier.fit(X=train_df['Text'], y=train_df['Emotion'])

In [14]:
def evaluate_model(model, X_train, y_train, X_val, y_val, X_test, y_test):
    """Report accuracy on the train, validation and test splits.

    ``model`` must expose ``predict``. Each split's features are passed to it
    and the predictions are scored against the matching labels with
    ``accuracy_score``. One line per split is printed, rounded to two decimals.

    Returns a tuple ``(train_pred, val_pred, test_pred)`` of the raw predictions.
    """
    splits = (
        ("Train Accuracy", X_train, y_train),
        ("Validation Accuracy", X_val, y_val),
        ("Test Accuracy", X_test, y_test),
    )

    # Predict all splits first, then score them, then report.
    predictions = [model.predict(features) for _, features, _ in splits]
    scores = [
        accuracy_score(targets, predicted)
        for (_, _, targets), predicted in zip(splits, predictions)
    ]

    for (label, _, _), score in zip(splits, scores):
        print(f"{label}: {score:.2f}")

    return tuple(predictions)

# Score the fitted VotingClassifier on all three splits and keep its
# predictions for the later cells.
train_pred, val_pred, test_pred = evaluate_model(
    model=voting_classifier,
    X_train=train_df['Text'],
    y_train=train_df['Emotion'],
    X_val=val_df['Text'],
    y_val=val_df['Emotion'],
    X_test=test_df['Text'],
    y_test=test_df['Emotion'],
)

Train Accuracy: 0.98
Validation Accuracy: 0.93
Test Accuracy: 0.94


In [17]:
# Ground-truth emotion labels of each split. The validation and test variables
# previously pointed at the 'Text' column, which holds the inputs, not the labels.
train_true = train_df['Emotion']
val_true = val_df['Emotion']
test_true = test_df['Emotion']

# Class labels as exposed by the fitted ensemble.
labels = voting_classifier.classes_


0        sadness
1        sadness
2          anger
4          anger
5        sadness
          ...   
15995    sadness
15996    sadness
15997        joy
15998      anger
15999    sadness
Name: Emotion, Length: 14124, dtype: object


In [20]:

# Sample a subset for faster visualization. The size is capped at the number
# of available rows: DataFrame.sample draws without replacement by default and
# raises when asked for more rows than the frame contains.
subset_test_df = test_df.sample(n=min(1000, len(test_df)), random_state=42)

# Predictions for subset
subset_test_preds = voting_classifier.predict(subset_test_df['Text'])

In [22]:


# Unrounded accuracy on the test split, using the predictions returned by
# evaluate_model earlier in the notebook.
test_acc = accuracy_score(y_true=test_df['Emotion'], y_pred=test_pred)
print(test_acc)

0.9352112676056338


In [23]:
# Per-class report for the test split. `labels` fixes which classes are
# reported and in what order, so the display names in `target_names` stay
# aligned even if a class is missing from this split.
class_report = classification_report(
    test_df['Emotion'],
    test_pred,
    labels=labels,
    target_names=labels,
)
# The report was previously computed but never shown; print it.
print(class_report)

In [25]:


# Ad-hoc sentences to sanity-check the trained ensemble.
texts = [
    "I had a bad day at work",
    "I miss my ex"
]

# Clean the sentences the same way as the training data, then predict them in
# one batch and report each result next to the original sentence.
processed_texts = [preprocess_text(sentence) for sentence in texts]
predicted_emotions = voting_classifier.predict(processed_texts)

for custom_text, predicted_emotion in zip(texts, predicted_emotions):
    print(f"Text: {custom_text}")
    print(f"Predicted Emotion: {predicted_emotion}")

Text: I had a bad day at work
Predicted Emotion: sadness
Text: I miss my ex
Predicted Emotion: joy
