In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')   # suppress warnings

In [2]:
# train data
df_train = pd.read_csv('Train.csv')
print(df_train.shape)
df_train.head()

(10001, 4)


Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.0
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.0
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.0
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.0
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.0


In [3]:
# test data
df_test = pd.read_csv('Test.csv')
print(df_test.shape)
df_test.head()

(5177, 2)


Unnamed: 0,tweet_id,safe_text
0,00BHHHP1,<user> <user> ... &amp; 4 a vaccine given 2 he...
1,00UNMD0E,Students starting school without whooping coug...
2,01AXPTJF,"I'm kinda over every ep of <user> being ""rippe..."
3,01HOEQJW,How many innocent children die for lack of vac...
4,01JUKMAO,"CDC eyeing bird flu vaccine for humans, though..."


In [4]:
# fill missing values
# label and aggreement are floating numbers, use mean and round
cols = ['label', 'agreement']
for col in cols:
    print(f'Column: {col}')
    mean_rounded = round(df_train[col].mean())
    print(f'Mean round: {mean_rounded}')
    df_train[col].fillna(mean_rounded, inplace=True)

Column: label
Mean round: 0
Column: agreement
Mean round: 1


In [5]:
# there are some inconsistencies, label values should either be 1, 0, -1

# Check the values in the target variable and handle any inconsistencies
df_train['label'] = df_train['label'].round().astype(int)

# Verify the counts of the target variable after handling inconsistencies
label_counts = df_train['label'].value_counts()
print(label_counts)

label
 0    4909
 1    4054
-1    1038
Name: count, dtype: int64


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [7]:
# Text Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# downloading nltk resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [9]:
# initialize wordnet lemmatizer
lemmatizer = WordNetLemmatizer()

In [10]:
def preprocess_text(text):
    tokens = text.split()
    stop_words = set(stopwords.words('english'))
    tokens = [token.lower() for token in tokens if token.lower() not in stop_words]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

In [11]:
df_train['clean_text'] = df_train['safe_text'].apply(preprocess_text)

In [12]:
# Convert 'safe_text' column to strings
df_test['safe_text'] = df_test['safe_text'].astype(str)

# preprocess texts in 'safe_text' column in test_data
df_test['safe_text'] = df_test['safe_text'].apply(preprocess_text)

In [13]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df_train['clean_text'], 
    df_train['label'], 
    test_size=0.2, 
    random_state=42
)

In [14]:
# Feature Extraction
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [15]:
X_val_tfidf = vectorizer.transform(df_test["safe_text"])

In [16]:
# Model 1 Training
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

In [17]:
# Model Evaluation
y_pred = model.predict(X_test_tfidf)
print("Validation Set Performance: ")
print(classification_report(y_test, y_pred))

Validation Set Performance: 
              precision    recall  f1-score   support

          -1       0.69      0.19      0.30       231
           0       0.78      0.82      0.80       988
           1       0.67      0.77      0.72       782

    accuracy                           0.73      2001
   macro avg       0.71      0.59      0.61      2001
weighted avg       0.73      0.73      0.71      2001



In [18]:
# model accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Validation Accuracy:", accuracy*100)

Validation Accuracy: 72.86356821589204


In [19]:
# model deployment: predict sentiments of test data
y_pred_test = model.predict(X_val_tfidf)

In [None]:
# save predictions to submission.csv
'''
submission_df = pd.DataFrame(
    {
        'tweet_id': df_test['tweet_id'],
        'label': y_pred_test
    }
)'''
#submission_df.to_csv('submission_t1.csv', index=False)
#print("Submission file save successfully.")

#### Hyperparameter tunning

In [20]:
import optuna

In [29]:
# Define objective function for Optuna
def objective(trial):
    # Define hyperparameters to search
    params = {
        'C': trial.suggest_loguniform('C', 0.001, 100),
        'penalty': trial.suggest_categorical('penalty', ['l2'])
    }

    # Initialize and train the model with the suggested hyperparameters
    model = LogisticRegression(max_iter=1000, solver='lbfgs', **params)
    model.fit(X_train_tfidf, y_train)

    # Predict on validation set and calculate accuracy
    y_pred_val = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred_val)

    return accuracy

In [38]:
# Perform hyperparameter optimization using Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500)

[I 2024-03-24 19:06:07,563] A new study created in memory with name: no-name-7a41ff93-516e-4da2-a8a8-a33f9d78dec5
[I 2024-03-24 19:06:07,750] Trial 0 finished with value: 0.496751624187906 and parameters: {'C': 0.0019592611450806475, 'penalty': 'l2'}. Best is trial 0 with value: 0.496751624187906.
[I 2024-03-24 19:06:09,217] Trial 1 finished with value: 0.719640179910045 and parameters: {'C': 8.875887112603387, 'penalty': 'l2'}. Best is trial 1 with value: 0.719640179910045.
[I 2024-03-24 19:06:11,050] Trial 2 finished with value: 0.7006496751624188 and parameters: {'C': 23.665987465330012, 'penalty': 'l2'}. Best is trial 1 with value: 0.719640179910045.
[I 2024-03-24 19:06:11,274] Trial 3 finished with value: 0.6981509245377311 and parameters: {'C': 0.08120126779771857, 'penalty': 'l2'}. Best is trial 1 with value: 0.719640179910045.
[I 2024-03-24 19:06:13,859] Trial 4 finished with value: 0.6851574212893553 and parameters: {'C': 57.20478742259906, 'penalty': 'l2'}. Best is trial 1 wi

In [39]:
# Get the best hyperparameters
best_params = study.best_params
print("Best Parameters:", best_params)

Best Parameters: {'C': 2.3206854904217935, 'penalty': 'l2'}


In [40]:
# Train the final model with the best hyperparameters
best_model = LogisticRegression(max_iter=1000, solver='lbfgs', **best_params)
best_model.fit(X_train_tfidf, y_train)

In [41]:
# Model Evaluation
y_pred_val = best_model.predict(X_test_tfidf)
print("Validation Set Performance after tuning: ")
print(classification_report(y_test, y_pred_val))

Validation Set Performance after tuning: 
              precision    recall  f1-score   support

          -1       0.65      0.29      0.40       231
           0       0.79      0.81      0.80       988
           1       0.69      0.78      0.73       782

    accuracy                           0.74      2001
   macro avg       0.71      0.62      0.64      2001
weighted avg       0.73      0.74      0.73      2001



In [42]:
# model accuracy score
accuracy = accuracy_score(y_test, y_pred_val)
print("Validation Accuracy:", accuracy*100)

Validation Accuracy: 73.7631184407796


In [43]:
# Model deployment: predict sentiments of test data
y_pred_test = best_model.predict(X_val_tfidf)

In [44]:
# Save predictions to submission.csv
submission_df = pd.DataFrame({
    'tweet_id': df_test['tweet_id'],
    'label': y_pred_test
})

In [45]:
submission_df.to_csv('submission_optuna.csv', index=False)
print("Submission file saved successfully.")

Submission file saved successfully.
