In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score


## Data Ingestion

In [2]:
DATA_URL = "https://raw.githubusercontent.com/entbappy/Branching-tutorial/refs/heads/master/tweet_emotions.csv"

df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [3]:
# delete tweet id
df.drop(columns=['tweet_id'], inplace=True)
df

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...
...,...,...
39995,neutral,@JohnLloydTaylor
39996,love,Happy Mothers Day All my love
39997,love,Happy Mother's Day to all the mommies out ther...
39998,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [4]:
# select sentiment categories
label_categories = ['happiness', 'sadness']
final_df = df[df['sentiment'].isin(label_categories)]

In [5]:
final_df.sample(5)

Unnamed: 0,sentiment,content
27061,happiness,@djnvs LoL! there u go..that's the spirit haha
7055,sadness,Regrettin some of the decisions I made
21502,happiness,THE best job in the world &amp; yes its in Aus...
2130,sadness,@OpheliaPunk i wish i wasn't all bummed but i...
33767,happiness,"@tabithalynnne lmao, yep"


In [6]:
# encode sentiment labels
final_df['sentiment'].replace({'happiness': 1, 'sadness': 0}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_df['sentiment'].replace({'happiness': 1, 'sadness': 0}, inplace=True)
  final_df['sentiment'].replace({'happiness': 1, 'sadness': 0}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['sentiment'].replace({'happiness': 1, 'sadness': 0}, inplace=True)


In [7]:
final_df.head()

Unnamed: 0,sentiment,content
1,0,Layin n bed with a headache ughhhh...waitin o...
2,0,Funeral ceremony...gloomy friday...
6,0,"I should be sleep, but im not! thinking about ..."
8,0,@charviray Charlene my love. I miss you
9,0,@kelcouch I'm sorry at least it's Friday?


In [8]:
train_data, test_data = train_test_split(final_df, test_size=0.2, random_state=42)  

## Data Preprocessing

In [9]:
nltk.download('stopwords', download_dir='/nltk_data/')
nltk.download('wordnet', download_dir='/nltk_data/')
nltk.download("punkt", download_dir='/nltk_data/')
nltk.download('punkt_tab', download_dir='/nltk_data/')

def lemmatization(text):
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

def remove_stop_words(text):
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token.lower() not in stop_words]
    return ' '.join(tokens)


def remove_numbers(text):
    return re.sub(r'\d+', '', text)


def lowercase_text(text):
    ## Convert text to lowercase
    return text.lower()


def remove_punctuation(text):
    ## Remove punctuation
    return text.translate(str.maketrans('', '', string.punctuation))


def remove_urls(text):
    ## Remove URLs
    return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

def remove_small_sentences(df, min_length=3):
    ## Remove sentences shorter than min_length
    df['content'] = df['content'].apply(lambda x: ''.join(x.split()) if len(x.split()) >= min_length else '')
    
    return df[df['content'].str.strip().astype(bool)]  # Remove empty rows

def normalize_text(df):
    ## Normalize contents of the DataFrame
    df['content'] = df['content'].apply(lowercase_text)
    df['content'] = df['content'].apply(remove_numbers)
    df['content'] = df['content'].apply(remove_punctuation)
    df['content'] = df['content'].apply(remove_urls)
    df['content'] = df['content'].apply(remove_stop_words)
    df['content'] = df['content'].apply(lemmatization)
    return df

def normalize_sentence(sentence):
    ## Normalize a single sentence
    sentence = lowercase_text(sentence)
    sentence = remove_numbers(sentence)
    sentence = remove_punctuation(sentence)
    sentence = remove_urls(sentence)
    sentence = remove_stop_words(sentence)
    sentence = lemmatization(sentence)
    return sentence



[nltk_data] Downloading package stopwords to /nltk_data/...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /nltk_data/...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /nltk_data/...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /nltk_data/...
[nltk_data]   Package punkt_tab is already up-to-date!


In [10]:
test_string = "I am number 1 in the world! #1 ? Let's celebrate 2023 with joy and happiness! :) 100% guaranteed success."
cleaned_string = remove_numbers(test_string)
print(cleaned_string)

cleaned_string = ''.join([i for i in test_string if not i.isdigit()])
print(cleaned_string)

test_string = test_string.translate(str.maketrans('', '', string.punctuation))
print(test_string)

norm_string = normalize_sentence(test_string)
print(norm_string)

I am number  in the world! # ? Let's celebrate  with joy and happiness! :) % guaranteed success.
I am number  in the world! # ? Let's celebrate  with joy and happiness! :) % guaranteed success.
I am number 1 in the world 1  Lets celebrate 2023 with joy and happiness  100 guaranteed success
number world let celebrate joy happiness guaranteed success


In [11]:
train_data = normalize_text(train_data)
test_data = normalize_text(test_data)

In [16]:
train_data["content"].iloc[100]

'many farewell party sad see people leaving'

In [17]:
train_data

Unnamed: 0,sentiment,content
23531,0,quotmy problem isnt miss cause dontquot
8051,0,thats done already one proof there nothing fai...
11499,0,hungry food steal
31288,1,foot hurtfinally bedwill forget crunch overver...
18561,0,really ill atm
...,...,...
21697,1,chocolatesuze yes yes especially wine mushroom...
19445,0,kickzfadayz boy better get tonight
20216,1,tafe actually quite good
3258,0,minute boarding hour home window seat


## Feature Engineering

In [18]:
x_train = train_data['content'].values
y_train = train_data['sentiment'].values

x_test = test_data['content'].values
y_test = test_data['sentiment'].values

In [19]:
# Apply Bag of words (CountVectorizer)

vectorizer = CountVectorizer()

# Fit the vectorizer on the training data and transform both train and test data
X_train_bow = vectorizer.fit_transform(x_train)
X_test_bow = vectorizer.transform(x_test)


In [23]:
train_df = pd.DataFrame(X_train_bow.toarray())
#train_df.columns = vectorizer.get_feature_names_out()

train_df["label"] = y_train
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14937,14938,14939,14940,14941,14942,14943,14944,14945,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Model Building

In [26]:
# Define and train the model XGBoost model

xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

xgb_model.fit(X_train_bow, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test_bow)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_report = classification_report(y_test, y_pred, target_names=['sadness', 'happiness'])
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.7730120481927711
Classification Report:
              precision    recall  f1-score   support

     sadness       0.81      0.72      0.76      1060
   happiness       0.74      0.83      0.78      1015

    accuracy                           0.77      2075
   macro avg       0.78      0.77      0.77      2075
weighted avg       0.78      0.77      0.77      2075



## Model Prediction

In [32]:
y_test

array([0, 0, 0, ..., 1, 0, 1], dtype=int64)

In [29]:
# Make predictions on new data
x_pred = xgb_model.predict(X_test_bow)
y_pred_proba = xgb_model.predict_proba(X_test_bow)

#  Calculate precision, recall, and ROC AUC
precision = precision_score(y_test, x_pred)
recall = recall_score(y_test, x_pred)
f1 = f1_score(y_test, x_pred)
auc = roc_auc_score(y_test, y_pred_proba[:, 1])

In [30]:
y_pred_proba

array([[0.56454545, 0.43545455],
       [0.8691249 , 0.1308751 ],
       [0.97665673, 0.02334325],
       ...,
       [0.45226395, 0.54773605],
       [0.8400142 , 0.15998581],
       [0.035272  , 0.964728  ]], dtype=float32)

In [31]:
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"ROC AUC: {auc}")

Precision: 0.7394366197183099
Recall: 0.8275862068965517
F1 Score: 0.7810320781032078
ROC AUC: 0.8616790593921368
