# Importing Dependencies 

In [35]:
import pandas as pd  
import numpy as np 
import re  # regex to preprocess data 
import matplotlib.pyplot as plt  
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression  
from sklearn.naive_bayes import MultinomialNB  
from sklearn.ensemble import RandomForestClassifier  
from sklearn.svm import SVC  
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

# Load Data

In [36]:
# load the training data
df = pd.read_csv('data/train.csv')

# load the testing data
testing = pd.read_csv('data/test.csv')

# Preprocessing Text

In [None]:
def preprocess_text(text):
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'@\S+', '', text)  # remove mentions
    text = re.sub(r'#\S+', '', text)  # remove hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove special characters
    text = text.lower() 
    text = ' '.join([word for word in text.split() if word not in ENGLISH_STOP_WORDS])  # remove stopwords
    return text

# clean the training data
df['cleaned_text'] = df['text'].apply(preprocess_text)

# clean the testing data
testing['cleaned_text'] = testing['text'].apply(preprocess_text)

# Preparing Data 

In [None]:
# use TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_text'])
y = df['target']

# split the data into training/testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Prepare Models For Testing

In [None]:
# start the models for testing
models = {
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "SVM": SVC()
}

# Evaluation 

In [None]:
# keep track of the best model and its F1 score
best_model = None
best_f1 = 0

# train the models and evaluate their performance 
for name, model in models.items():

    # training the model
    model.fit(X_train, y_train)
    
    # eval
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(f"{name} F1 Score: {f1:.4f}")
    
    # update the best model if needed
    if f1 > best_f1:
        best_f1 = f1
        best_model = model

Logistic Regression F1 Score: 0.7310
Naive Bayes F1 Score: 0.7341
Random Forest F1 Score: 0.7155
SVM F1 Score: 0.7309


# Use Best Model on Test Data

In [None]:
# use the testing data now 
X_test_cleaned = vectorizer.transform(testing['cleaned_text'])
y_pred_test = best_model.predict(X_test_cleaned)

submission = pd.DataFrame({
    'id': testing['id'], 
    'target': y_pred_test
})

# save the output to the data folder as the 'test_submission'
submission.to_csv('data/test_submission.csv', index=False)