# Notebook to test BoW and GloVe methods

#### Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, cross_validate
import time
from datetime import timedelta
from scipy.sparse import vstack
import joblib 

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

#### Config

In [None]:
# Seed reused for shuffling, splitting and every estimator below
random_state = 42

# Embedding scheme: "BoW" or "GloVe"
method = "BoW"

# Training mode: "full" or "val"
train_type = "full"

# Run hyper-parameter tuning instead of plain training: True or False
hparams_tuning = False

# Locations of the training data and the GloVe vectors
input_path = '../data/processed/train_full_policy5.csv'
glove_path = '../src/models/glove.twitter.27B/glove.twitter.27B.200d.txt'

#### Functions

In [None]:
def load_glove_embeddings(file_path):
    """
    Loads GloVe embeddings from a file and returns a dictionary of word vectors.

    Each non-empty line is expected to hold a token followed by its
    space-separated coefficients. Lines without a token or without any
    coefficient (e.g. blank lines) are skipped.

    Parameters
    ----------
    file_path : str
        The path to the GloVe embeddings file.

    Returns
    -------
    dict
        A dictionary where the keys are words and the values are the corresponding word vectors
        (float32 numpy arrays).
    """
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf8') as f:
        for line in f:
            # Cut at the first ASCII space only. A bare `line.split()` splits on
            # *any* Unicode whitespace, so a token that is itself a whitespace
            # character (e.g. U+0085) would vanish and the first coefficient
            # would be mistaken for the word.
            word, _, vector_text = line.rstrip('\n').partition(' ')
            if not word or not vector_text.strip():
                continue
            embeddings_index[word] = np.asarray(vector_text.split(), dtype='float32')
    return embeddings_index

def tweet_to_glove_vector(tweet, embeddings, vector_size=200):
    """
    Converts a tweet to a GloVe vector by averaging the vectors of the words in the tweet.

    The tweet is lower-cased and split on whitespace; words missing from
    ``embeddings`` are ignored. A tweet with no known word, or a tweet that
    is not a string (e.g. ``None`` or a NaN coming from pandas), yields the
    zero vector.

    Parameters
    ----------
    tweet : str
        The tweet to be converted into a vector.
    embeddings : dict
        A dictionary of word vectors.
    vector_size : int, optional
        The size of the word vectors, by default 200.

    Returns
    -------
    np.ndarray
        A numpy array representing the averaged vector of the tweet.
    """
    tweet_vec = np.zeros(vector_size)

    # Missing values have no tokens; treat them like a tweet without known words
    # instead of failing on `.lower()`.
    if not isinstance(tweet, str):
        return tweet_vec

    count = 0
    for word in tweet.lower().split():
        vector = embeddings.get(word)
        if vector is not None:
            tweet_vec += vector
            count += 1

    # Guard against division by zero when no word was found
    if count:
        tweet_vec /= count
    return tweet_vec

### Pipeline

#### 1. Embeddings Generation

In [None]:
# Fail fast on a mistyped configuration: otherwise X would silently stay raw text
if method not in ("BoW", "GloVe"):
    raise ValueError(f"Unknown method {method!r}; expected 'BoW' or 'GloVe'.")
if not hparams_tuning and train_type not in ("val", "full"):
    raise ValueError(f"Unknown train_type {train_type!r}; expected 'val' or 'full'.")

df = pd.read_csv(input_path)

# Shuffle the DataFrame
df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
print(df.head())

# Remove NaNs due to pre-processing pipeline
df = df.dropna(subset=['tweet'])

# Label encoding
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Check the mapping
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(label_mapping)

X = df['tweet']
y = df['label']

# A hold-out split is only needed for plain "val" training. Hyper-parameter
# tuning relies on k-fold cross-validation and "full" training uses all data,
# so both of those vectorise X as a whole.
use_validation_split = (not hparams_tuning) and train_type == "val"

if use_validation_split:
    # Split data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=random_state)

if method == "BoW":
    # Bag of Words embeddings
    vectorizer = CountVectorizer(max_features=5000)
    if use_validation_split:
        # Vocabulary is learnt on the training part only
        X_train = vectorizer.fit_transform(X_train)
        X_val = vectorizer.transform(X_val)
    else:
        X = vectorizer.fit_transform(X)
else:
    # GloVe embeddings
    glove_embeddings = load_glove_embeddings(glove_path)
    print(f"Loaded {len(glove_embeddings)} word vectors.")
    if use_validation_split:
        X_train = np.array([tweet_to_glove_vector(tweet, glove_embeddings) for tweet in X_train])
        X_val = np.array([tweet_to_glove_vector(tweet, glove_embeddings) for tweet in X_val])
    else:
        X = np.array([tweet_to_glove_vector(tweet, glove_embeddings) for tweet in X])
    

#### 2. Training

In [None]:
def _build_tuned_models():
    """Return fresh, unfitted estimators configured with the best hyper-parameters."""
    return [
        LogisticRegression(random_state=random_state, C=1, solver='saga'),
        LinearSVC(random_state=random_state, C=0.1, loss='squared_hinge'),
        RidgeClassifier(random_state=random_state, alpha=10),
        SGDClassifier(random_state=random_state, loss='hinge', alpha=0.0001, penalty='l2'),
        ExtraTreesClassifier(random_state=random_state, n_estimators=100, min_samples_split=5, criterion='gini'),
        MLPClassifier(verbose=False, random_state=random_state, hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.001),
    ]


if hparams_tuning:
    # Hyper-parameters tuning with k-fold cross-validation using GridSearchCV

    # Models definition
    models = [
        LogisticRegression(random_state=random_state),
        LinearSVC(random_state=random_state),
        RidgeClassifier(random_state=random_state),
        SGDClassifier(random_state=random_state),
        ExtraTreesClassifier(random_state=random_state),
        MLPClassifier(verbose=False, random_state=random_state),
    ]
    models_names = [
        'Logistic Regression',
        'Support Vector Machine',
        'Ridge Classifier',
        'SGD Classifier',
        'Extra Trees',
        'Multi Layer Perceptron',
    ]

    # Models hparams (one grid per model, in the same order as `models`)
    models_hparams = [
        # LogisticRegression
        {'C': [0.01, 0.1, 1, 10], 'solver': ['lbfgs', 'saga']},

        # LinearSVC
        {'C': [0.01, 0.1, 1, 10], 'loss': ['hinge', 'squared_hinge']},

        # RidgeClassifier
        {'alpha': [0.01, 0.1, 1, 10]},

        # SGDClassifier
        {'loss': ['hinge', 'log_loss'], 'alpha': [0.0001, 0.01, 0.1], 'penalty': ['l2', 'l1', 'elasticnet']},

        # ExtraTreesClassifier
        {'n_estimators': [50, 100], 'min_samples_split': [2, 5, 10], 'criterion': ['gini']},

        # MLPClassifier (one-element tuples need the trailing comma; `(50)` is just the int 50)
        {'hidden_layer_sizes': [(50,), (100,), (50, 50)], 'activation': ['relu'], 'solver': ['adam'], 'alpha': [0.001, 0.01, 1]},
    ]

    chosen_hparams = []
    estimators = []
    results = []

    # Training loop
    for model, model_name, hparams in zip(models, models_names, models_hparams):
        print(f"\n########       {model_name}       ########")
        starting_time = time.time()
        clf = GridSearchCV(estimator=model, param_grid=hparams, scoring='accuracy', cv=5)
        clf.fit(X, y)
        ending_time = time.time()
        chosen_hparams.append(clf.best_params_)
        estimators.append((model_name, clf.best_score_, clf.best_estimator_))

        for hparam in hparams:
            print(f'\t--> best value for hyperparameter "{hparam}": ', clf.best_params_.get(hparam))

        mean_accuracy = clf.cv_results_['mean_test_score'][clf.best_index_]
        std_score = clf.cv_results_['std_test_score'][clf.best_index_]

        # Save models with respective accuracy. Keep the tuned, refitted
        # estimator: `model` itself is only the untouched template handed to
        # GridSearchCV and carries neither the best params nor a fit.
        results.append((model_name, clf.best_estimator_, mean_accuracy, std_score))

        print('\t--> best model mean accuracy:', mean_accuracy)
        print('\t--> best model std:', std_score)
        print('\tElapsed time for GridSearch: ', timedelta(seconds=ending_time - starting_time))

    # Find the best model based on accuracy
    best_model_name, best_model, best_accuracy, _ = max(results, key=lambda x: x[2])
    print(f"\nBest Model: {best_model_name} with accuracy {best_accuracy}")

elif train_type == "val":
    # Training models with best hyper-parameters

    # Models definition
    models = _build_tuned_models()
    models_names = [
        'Logistic Regression',
        'Support Vector Machine',
        'Ridge Classifier',
        'SGD Classifier',
        'Extra Trees',
        'Multi Layer Perceptron',
    ]

    results = []

    # Training loop
    for model, model_name in zip(models, models_names):
        print(f"\n########       {model_name}       ########")
        starting_time = time.time()

        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        ending_time = time.time()
        print(f'Elapsed time: {timedelta(seconds=ending_time - starting_time)}')

        # Calculate accuracy
        accuracy = accuracy_score(y_val, y_pred)
        print(f'Accuracy: {accuracy:.4f}')

        # Save models with respective accuracy
        results.append((model_name, model, accuracy))

    # Find the best model based on accuracy
    best_model_name, best_model, best_accuracy = max(results, key=lambda x: x[2])
    print(f"\nBest Model: {best_model_name} with accuracy {best_accuracy}")

    # Combine the training and validation sets for final training.
    # GloVe features are dense ndarrays while BoW features are sparse, so pick
    # the stacking routine that keeps the representation unchanged.
    if isinstance(X_train, np.ndarray):
        X = np.vstack([X_train, X_val])
    else:
        X = vstack([X_train, X_val])
    y = np.concatenate((y_train, y_val))

    # Final training on entire data
    best_model.fit(X, y)
    # Save the trained best model to disk (not `model`, which is merely the
    # last estimator visited by the loop above)
    joblib.dump(best_model, f'../models/model_{method}_{best_model_name}.pkl')

elif train_type == "full":
    # Models definition
    models = _build_tuned_models()
    models_names = [
        'Logistic_Regression',
        'Support_Vector_Machine',
        'Ridge_Classifier',
        'SGD_Classifier',
        'Extra_Trees',
        'Multi_Layer_Perceptron',
    ]

    # Final training on entire data
    for model, model_name in zip(models, models_names):
        print(f"\n########       Full training: {model_name}       ########")
        starting_time = time.time()
        model.fit(X, y)
        ending_time = time.time()
        print(f'Elapsed time: {timedelta(seconds=ending_time - starting_time)}')

        # Save the trained model to disk
        joblib.dump(model, f'../models/model_{method}_{model_name}.pkl')

else:
    # Without this, a mistyped train_type would train nothing and say nothing
    raise ValueError(f"Unknown train_type {train_type!r}; expected 'val' or 'full'.")

#### 3. Make Predictions on Test Set

In [None]:
# Test set
df_test = pd.read_csv('../data/processed/test.csv')

# NOTE(review): the training code in this notebook saves models as
# '../models/model_{method}_{name}.pkl'; nothing visible here writes
# '../models/model.pkl'. Confirm which file is expected at this path.
best_model = joblib.load('../models/model.pkl')

# Every test row needs a prediction, so missing tweets cannot be dropped the
# way they are for training. Map them to empty strings instead: CountVectorizer
# rejects NaN documents and the GloVe path would fail on `.lower()`.
X_test = df_test['tweet'].fillna('')

# Transform the test set using the same vectorizer and make predictions
if method == "BoW":
    X_test_vec = vectorizer.transform(X_test)
elif method == "GloVe":
    X_test_vec = np.array([tweet_to_glove_vector(tweet, glove_embeddings) for tweet in X_test])
else:
    raise ValueError(f"Unknown method {method!r}; expected 'BoW' or 'GloVe'.")

y_test_pred = best_model.predict(X_test_vec)

# Create the final DataFrame with Id and Prediction columns
df_test['prediction'] = y_test_pred
# Encoded class 0 is written out as -1
df_test['prediction'] = df_test['prediction'].replace(0, -1)
df_final = df_test[['id', 'prediction']]
df_final = df_final.rename(columns={'id': 'Id', 'prediction': 'Prediction'})

# Save the final DataFrame to a CSV file
df_final.to_csv(f'../results/predictions_{method}.csv', index=False)
print(df_final.head())