In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')   # suppress warnings

In [2]:
# train data
df_train = pd.read_csv('Train.csv')
print(df_train.shape)
df_train.head()

(10001, 4)


Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.0
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.0
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.0
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.0
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.0


In [3]:
# test data
df_test = pd.read_csv('Test.csv')
print(df_test.shape)
df_test.head()

(5177, 2)


Unnamed: 0,tweet_id,safe_text
0,00BHHHP1,<user> <user> ... &amp; 4 a vaccine given 2 he...
1,00UNMD0E,Students starting school without whooping coug...
2,01AXPTJF,"I'm kinda over every ep of <user> being ""rippe..."
3,01HOEQJW,How many innocent children die for lack of vac...
4,01JUKMAO,"CDC eyeing bird flu vaccine for humans, though..."


In [4]:
# fill missing values
# label and aggreement are floating numbers, use mean and round
cols = ['label', 'agreement']
for col in cols:
    print(f'Column: {col}')
    mean_rounded = round(df_train[col].mean())
    print(f'Mean round: {mean_rounded}')
    df_train[col].fillna(mean_rounded, inplace=True)

Column: label
Mean round: 0
Column: agreement
Mean round: 1


In [5]:
# there are some inconsistencies, label values should either be 1, 0, -1

# Check the values in the target variable and handle any inconsistencies
df_train['label'] = df_train['label'].round().astype(int)

# Verify the counts of the target variable after handling inconsistencies
label_counts = df_train['label'].value_counts()
print(label_counts)

label
 0    4909
 1    4054
-1    1038
Name: count, dtype: int64


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score

In [7]:
# Text Preprocessing
# (You may need to install nltk and download its data using nltk.download() for stopwords and WordNet)
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [8]:
# downloading nltk resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# initialize wordnet lemmatizer
lemmatizer = WordNetLemmatizer()

In [10]:
def preprocess_text(text):
    tokens = text.split()
    stop_words = set(stopwords.words('english'))
    tokens = [token.lower() for token in tokens if token.lower() not in stop_words]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

In [11]:
df_train['clean_text'] = df_train['safe_text'].apply(preprocess_text)

In [12]:
# Convert 'safe_text' column to strings
df_test['safe_text'] = df_test['safe_text'].astype(str)

# preprocess texts in 'safe_text' column in test_data
df_test['safe_text'] = df_test['safe_text'].apply(preprocess_text)

In [13]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_train['clean_text'], df_train['label'], test_size=0.2, random_state=42)

In [14]:
# Feature Extraction
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [15]:
X_val_tfidf = vectorizer.transform(df_test["safe_text"])

In [16]:
# Model Training - Gradient Boosting Machine
gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbm.fit(X_train_tfidf, y_train)

In [17]:
# Model Evaluation
y_pred_val = gbm.predict(X_test_tfidf)
print("Validation Set Performance: ")
print(classification_report(y_test, y_pred_val))

Validation Set Performance: 
              precision    recall  f1-score   support

          -1       0.80      0.17      0.28       231
           0       0.79      0.79      0.79       988
           1       0.63      0.77      0.69       782

    accuracy                           0.71      2001
   macro avg       0.74      0.58      0.59      2001
weighted avg       0.73      0.71      0.70      2001



In [18]:
# model accuracy score
accuracy = accuracy_score(y_test, y_pred_val)
print("Validation Accuracy:", accuracy*100)

Validation Accuracy: 71.41429285357322


In [19]:
# model deployment: predict sentiments of test data
#y_pred_test = gbm.predict(X_val_tfidf)

In [20]:
# save predictions to submission.csv
#submission_df = pd.DataFrame(
#    {
#        'tweet_id': df_test['tweet_id'],
#        'label': y_pred_test
#    }
#)
#submission_df.to_csv('submission_t1.csv', index=False)
#print("Submission file save successfully.")

#### Hyperparameter Tunning

In [21]:
import optuna
from xgboost import XGBClassifier

##### XGBoostClassifier

In [28]:
# Define objective function for Optuna
def objective(trial):
    params = {
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_loguniform('min_child_weight', 0.1, 10),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'gamma': trial.suggest_loguniform('gamma', 0.1, 10),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.1, 10),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.1, 10)
    }

    model = XGBClassifier(**params)
    model.fit(X_train_tfidf, y_train.map({-1: 0, 0: 1, 1: 2}))  # Map classes to [0, 1, 2]

    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [29]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2024-03-25 00:39:14,418] A new study created in memory with name: no-name-3ed378d9-e1f0-4bd0-bf15-6649faecfbd2
[I 2024-03-25 00:39:25,466] Trial 0 finished with value: 0.09395302348825588 and parameters: {'learning_rate': 0.08072747633406846, 'max_depth': 4, 'min_child_weight': 0.40256743416589136, 'subsample': 0.6897215217704735, 'colsample_bytree': 0.5178869788111966, 'n_estimators': 158, 'gamma': 1.6292676518910794, 'reg_alpha': 6.20654556791987, 'reg_lambda': 9.666452382617248}. Best is trial 0 with value: 0.09395302348825588.
[I 2024-03-25 00:39:36,766] Trial 1 finished with value: 0.09845077461269365 and parameters: {'learning_rate': 0.010626233444156075, 'max_depth': 3, 'min_child_weight': 2.148717819210923, 'subsample': 0.6882481640982886, 'colsample_bytree': 0.6773917739160715, 'n_estimators': 181, 'gamma': 4.548626824229091, 'reg_alpha': 1.8524142341554055, 'reg_lambda': 0.2633547354844357}. Best is trial 1 with value: 0.09845077461269365.
[I 2024-03-25 00:39:41,821] Trial

In [30]:
best_params = study.best_params
print("Best Parameters:", best_params)

Best Parameters: {'learning_rate': 0.011976896013040781, 'max_depth': 3, 'min_child_weight': 0.13713808756386464, 'subsample': 0.5602787791222266, 'colsample_bytree': 0.6182962895656772, 'n_estimators': 199, 'gamma': 3.320155884046977, 'reg_alpha': 1.3428287639007912, 'reg_lambda': 0.2659256220953253}


In [31]:
# Train the final model with the best hyperparameters
best_model = XGBClassifier(**best_params)
best_model.fit(X_train_tfidf, y_train.map({-1: 0, 0: 1, 1: 2}))

In [32]:
# Model Evaluation
y_pred_val = best_model.predict(X_test_tfidf)
print("Validation Set Performance after tuning: ")
print(classification_report(y_test, y_pred_val))

Validation Set Performance after tuning: 
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       231
           0       0.11      0.00      0.00       988
           1       0.23      0.31      0.26       782
           2       0.00      0.00      0.00         0

    accuracy                           0.12      2001
   macro avg       0.08      0.08      0.07      2001
weighted avg       0.14      0.12      0.10      2001



In [33]:
# model accuracy score
accuracy = accuracy_score(y_test, y_pred_val)
print("Validation Accuracy:", accuracy*100)

Validation Accuracy: 12.043978010994502


##### GradientBoostingClassifier

In [34]:
# Define objective function for Optuna
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10)
    }
    
    gb_classifier = GradientBoostingClassifier(**params)
    gb_classifier.fit(X_train_tfidf, y_train)
    
    y_pred_val = gb_classifier.predict(X_test_tfidf)
    accuracy = (y_pred_val == y_test).mean()
    
    return accuracy

In [35]:
# Perform hyperparameter optimization using Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[I 2024-03-25 01:17:58,272] A new study created in memory with name: no-name-c3395c2e-d4a2-4978-911d-645f55cbb9d5
[I 2024-03-25 01:21:35,808] Trial 0 finished with value: 0.7296351824087957 and parameters: {'n_estimators': 121, 'learning_rate': 0.049697269869257835, 'max_depth': 9, 'min_samples_split': 11, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.7296351824087957.
[I 2024-03-25 01:25:10,874] Trial 1 finished with value: 0.7151424287856072 and parameters: {'n_estimators': 85, 'learning_rate': 0.015957547736841257, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.7296351824087957.
[I 2024-03-25 01:26:42,595] Trial 2 finished with value: 0.6746626686656672 and parameters: {'n_estimators': 187, 'learning_rate': 0.012000791398614263, 'max_depth': 3, 'min_samples_split': 20, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.7296351824087957.
[I 2024-03-25 01:28:47,681] Trial 3 finished with value: 0.7296351824087957 and paramete

In [36]:
# Get the best hyperparameters
best_params = study.best_params
print("Best Parameters:", best_params)

Best Parameters: {'n_estimators': 189, 'learning_rate': 0.21854354039062007, 'max_depth': 8, 'min_samples_split': 14, 'min_samples_leaf': 6}


In [37]:
# Train the final model with the best hyperparameters
best_gb_model = GradientBoostingClassifier(**best_params)
best_gb_model.fit(X_train_tfidf, y_train)

In [38]:
# Model Evaluation
y_pred_val = best_gb_model.predict(X_test_tfidf)
print("Validation Set Performance: ")
print(classification_report(y_test, y_pred_val))

Validation Set Performance: 
              precision    recall  f1-score   support

          -1       0.64      0.33      0.44       231
           0       0.81      0.80      0.80       988
           1       0.67      0.77      0.72       782

    accuracy                           0.74      2001
   macro avg       0.70      0.63      0.65      2001
weighted avg       0.73      0.74      0.73      2001



In [39]:
# model accuracy score
accuracy = accuracy_score(y_test, y_pred_val)
print("Validation Accuracy:", accuracy*100)

Validation Accuracy: 73.51324337831085
