In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')   # suppress warnings

In [2]:
# train data
df_train = pd.read_csv('Train.csv')
print(df_train.shape)
df_train.head()

(10001, 4)


Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.0
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.0
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.0
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.0
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.0


In [3]:
# test data
df_test = pd.read_csv('Test.csv')
print(df_test.shape)
df_test.head()

(5177, 2)


Unnamed: 0,tweet_id,safe_text
0,00BHHHP1,<user> <user> ... &amp; 4 a vaccine given 2 he...
1,00UNMD0E,Students starting school without whooping coug...
2,01AXPTJF,"I'm kinda over every ep of <user> being ""rippe..."
3,01HOEQJW,How many innocent children die for lack of vac...
4,01JUKMAO,"CDC eyeing bird flu vaccine for humans, though..."


In [4]:
# fill missing values
# label and aggreement are floating numbers, use mean and round
cols = ['label', 'agreement']
for col in cols:
    print(f'Column: {col}')
    mean_rounded = round(df_train[col].mean())
    print(f'Mean round: {mean_rounded}')
    df_train[col].fillna(mean_rounded, inplace=True)

Column: label
Mean round: 0
Column: agreement
Mean round: 1


In [5]:
# there are some inconsistencies, label values should either be 1, 0, -1

# Check the values in the target variable and handle any inconsistencies
df_train['label'] = df_train['label'].round().astype(int)

# Verify the counts of the target variable after handling inconsistencies
label_counts = df_train['label'].value_counts()
print(label_counts)

label
 0    4909
 1    4054
-1    1038
Name: count, dtype: int64


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [7]:
# Text Preprocessing
# (You may need to install nltk and download its data using nltk.download() for stopwords and WordNet)
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [8]:
# downloading nltk resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# initialize wordnet lemmatizer
lemmatizer = WordNetLemmatizer()

In [10]:
def preprocess_text(text):
    tokens = text.split()
    stop_words = set(stopwords.words('english'))
    tokens = [token.lower() for token in tokens if token.lower() not in stop_words]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

In [11]:
df_train['clean_text'] = df_train['safe_text'].apply(preprocess_text)

In [12]:
# Convert 'safe_text' column to strings
df_test['safe_text'] = df_test['safe_text'].astype(str)

# preprocess texts in 'safe_text' column in test_data
df_test['safe_text'] = df_test['safe_text'].apply(preprocess_text)

In [13]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_train['clean_text'], df_train['label'], test_size=0.2, random_state=42)

In [14]:
# Feature Extraction
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [15]:
X_val_tfidf = vectorizer.transform(df_test["safe_text"])

In [16]:
# Model Training (Random Forest)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_tfidf, y_train)

In [17]:
# Model Evaluation
y_pred = model.predict(X_test_tfidf)
print("Validation Set Performance: ")
print(classification_report(y_test, y_pred))

Validation Set Performance: 
              precision    recall  f1-score   support

          -1       0.79      0.15      0.25       231
           0       0.80      0.80      0.80       988
           1       0.64      0.80      0.71       782

    accuracy                           0.72      2001
   macro avg       0.75      0.58      0.59      2001
weighted avg       0.74      0.72      0.70      2001



In [18]:
# model accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Validation Accuracy:", accuracy*100)

Validation Accuracy: 72.36381809095452


In [19]:
# model deployment: predict sentiments of test data
#y_pred_test = model.predict(X_val_tfidf)

In [20]:
# save predictions to submission.csv
#submission_df = pd.DataFrame(
#    {
#        'tweet_id': df_test['tweet_id'],
#        'label': y_pred_test
#    }
#)
#submission_df.to_csv('submission_t1.csv', index=False)
#print("Submission file save successfully.")

#### Hyperparameter tunning

In [21]:
import optuna

In [22]:
# Define objective function for Optuna
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5)
    }

    model = RandomForestClassifier(**params)
    model.fit(X_train_tfidf, y_train)

    y_pred_val = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred_val)
    return accuracy

In [23]:
# Perform hyperparameter optimization using Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2024-03-24 22:10:44,559] A new study created in memory with name: no-name-80bbeb52-f1ac-4a90-9c2a-e68bb8459d2d
[I 2024-03-24 22:10:48,838] Trial 0 finished with value: 0.6816591704147926 and parameters: {'n_estimators': 199, 'max_depth': 22, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.6816591704147926.
[I 2024-03-24 22:10:51,955] Trial 1 finished with value: 0.6621689155422289 and parameters: {'n_estimators': 151, 'max_depth': 16, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.6816591704147926.
[I 2024-03-24 22:10:54,401] Trial 2 finished with value: 0.6806596701649176 and parameters: {'n_estimators': 88, 'max_depth': 24, 'min_samples_split': 9, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.6816591704147926.
[I 2024-03-24 22:10:57,078] Trial 3 finished with value: 0.6591704147926037 and parameters: {'n_estimators': 144, 'max_depth': 15, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 0 with value:

In [24]:
best_params = study.best_params
print("Best Parameters:", best_params)

Best Parameters: {'n_estimators': 69, 'max_depth': 28, 'min_samples_split': 8, 'min_samples_leaf': 1}


In [25]:
# Model Training with best parameters
best_model = RandomForestClassifier(**best_params)
best_model.fit(X_train_tfidf, y_train)

In [28]:
# Model Evaluation
y_pred_val = best_model.predict(X_test_tfidf)
print("Validation Set Performance after tuning: ")
print(classification_report(y_test, y_pred_val))

Validation Set Performance after tuning: 
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       231
           0       0.72      0.86      0.78       988
           1       0.65      0.68      0.66       782

    accuracy                           0.69      2001
   macro avg       0.46      0.51      0.48      2001
weighted avg       0.61      0.69      0.65      2001



In [29]:
# model accuracy score
accuracy = accuracy_score(y_test, y_pred_val)
print("Validation Accuracy:", accuracy*100)

Validation Accuracy: 68.91554222888556
