# Feature Engineering

In [1]:
import pandas as pd
# Show full column contents (no truncation) when DataFrames are displayed
pd.options.display.max_colwidth = None

In [2]:
# Load "sentiment_results.csv" and preview 10 randomly chosen rows.
# The sample is seeded (same seed as the rest of the notebook) so the preview
# is identical on every Restart & Run All.
results_df = pd.read_csv('../dataset/sentiment_results.csv')
results_df.sample(10, random_state=42)

Unnamed: 0,corpus_name,raw_sentence,clean_text,textblob_polarity,sentiment_textblob
440404,sentiment140,@liiaszta abisan si brian nge-reply ke nicky nyebut2 westlife li hehehe iya kno ya dia harus keluar,abisan si brian nge reply ke nicky nyebut westlife li hehehe iya kno ya dia harus keluar,0.0,Neutral
382527,sentiment140,waiting for @shanedawson 's tweets,waiting tweet,0.0,Neutral
1228749,sentiment140,"@mgmyself Doing great, beautiful, thank you I do have a lot of work to do, but sort of conflicted as to what to tackle first.",great beautiful thank lot work sort conflicted tackle first,0.63,Positive
1649840,sentiment140,"good morning to you, good morning to you--we're all in our places with bright shining faces...Good morning tweeters!",good morning place bright shining face tweeter,0.7,Positive
840313,sentiment140,I will never again go to see a movie without first reading matt mungle's review. Transformers...,never go see movie without first reading matt mungle review transformer,0.25,Moderately Positive
1411533,sentiment140,@IDRtakeover bahaha thanks pastor!! @joshohaire thanks!,bahaha thanks pastor,0.2,Moderately Positive
1058423,sentiment140,@5toSucceed wheres my present?,wheres present,0.0,Neutral
180627,sentiment140,I hate daytime telly! And theres nothing to eat.And Jenni put me in the mood for a pizza,hate daytime telly there nothing eat jenni put mood pizza,-0.8,Negative
880910,sentiment140,Good night everyone! See u at the pool,good night everyone see pool,0.7,Positive
262452,sentiment140,"Needs a little trip , Orlando maybe , wish I could see my family and dog in GA",need little trip orlando maybe wish could see family dog ga,-0.19,Moderately Negative


In [3]:
# Number of rows per TextBlob sentiment label
results_df.loc[:, "sentiment_textblob"].value_counts()

sentiment_textblob
Neutral                637477
Moderately Positive    464854
Positive               243252
Moderately Negative    202678
Negative               103233
Name: count, dtype: int64

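The sentiment categories are heavily imbalanced: Neutral has 637,477 rows, while Negative has only 103,233. To address this, we downsample every category to the size of the smallest one, so that all five categories end up with the same number of instances.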

In [4]:
# Balance the classes by downsampling every sentiment category to the size of
# the smallest one. The labels and the minority size are read from the data
# instead of being hard-coded, so the cell keeps working if the label
# distribution (or the set of labels) changes.
class_counts = results_df["sentiment_textblob"].value_counts()
minority_count = class_counts.min()

# Backward-compatible alias for the name this cell used before; on the data
# shown above the minority class is "Negative".
negative_count = minority_count

balanced_parts = []
for label in class_counts.index:
    class_rows = results_df[results_df["sentiment_textblob"] == label]
    # Only classes larger than the minority are sampled (without replacement,
    # fixed seed); a class already at the minority size is kept whole.
    if len(class_rows) > minority_count:
        class_rows = class_rows.sample(n=minority_count, random_state=42)
    balanced_parts.append(class_rows)

# Concatenate the per-class frames, shuffle the rows, and rebuild a 0..n-1 index
balanced_df = (
    pd.concat(balanced_parts)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Check the count of each sentiment category in the balanced dataframe
balanced_counts = balanced_df["sentiment_textblob"].value_counts()
assert balanced_counts.nunique() == 1, "classes are not balanced after downsampling"
print(balanced_counts)

sentiment_textblob
Moderately Negative    103233
Neutral                103233
Positive               103233
Moderately Positive    103233
Negative               103233
Name: count, dtype: int64


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [6]:
# Features are the cleaned text; the target is the TextBlob sentiment label.
# pd.read_csv parses empty fields as NaN, so rows whose cleaned text was empty
# may come back as NaN (TODO confirm on this dataset). Replace them with ''
# so that text vectorizers only ever receive strings.
X = balanced_df['clean_text'].fillna('')
y = balanced_df['sentiment_textblob']

# Hold out 20% of the rows for testing. Stratifying on the label keeps every
# sentiment class equally represented in both the train and the test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)