In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV, cross_validate
import time
from datetime import timedelta
from gensim.models import FastText
from gensim.models.keyedvectors import KeyedVectors

In [None]:
# Text-representation strategy used by the feature-extraction cell.
# Accepted values: "BoW", "FastText" or "GloVe"; any other value makes that cell raise ValueError.
method = "BoW"

In [None]:
# Seed shared by the shuffle, the train/validation split and the estimators defined later.
random_state = 42

# Read the processed training set and shuffle all rows reproducibly.
df = (
    pd.read_csv('../data/processed/train.csv')
    .sample(frac=1, random_state=random_state)
    .reset_index(drop=True)
)
print(df.head())

# Careful: rows whose tweet is NaN trigger errors further down, so drop them.
df = df.dropna(subset=['tweet'])

# Replace the raw labels with integer codes.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Show which original label was assigned which integer code.
known_labels = label_encoder.classes_
label_mapping = dict(zip(known_labels, label_encoder.transform(known_labels)))
print(label_mapping)

X, y = df['tweet'], df['label']

# Keep 10% of the rows aside as a validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=random_state,
)

# Turn the raw tweets into numeric feature matrices according to `method`.
if method == "BoW":
    # Token-count features; the vocabulary (capped at 5000 entries) is learned
    # from the training split only and then reused for the validation split.
    vectorizer = CountVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(X_train)
    X_val = vectorizer.transform(X_val)

elif method == "FastText":
    # One dimension shared by the trained model and the zero fallback, so the
    # two cannot drift apart and produce ragged feature rows.
    fasttext_dim = 100

    def get_fasttext_embeddings(tweet, model, embedding_dim=fasttext_dim):
        """Average the FastText vectors of the whitespace-separated words in `tweet`.

        Returns a zero vector of length `embedding_dim` when no word yields a vector.
        """
        vectors = [model.wv[word] for word in tweet.split() if word in model.wv]
        if vectors:
            return np.mean(vectors, axis=0)
        return np.zeros(embedding_dim)

    # The model is trained on the tokenised training tweets only, before
    # `X_train` is replaced by its embedded form.
    fasttext_model = FastText(sentences=X_train.apply(lambda x: x.split()), vector_size=fasttext_dim, window=5, min_count=1, workers=4, sg=1)
    X_train = np.array([get_fasttext_embeddings(tweet, fasttext_model) for tweet in X_train])
    X_val = np.array([get_fasttext_embeddings(tweet, fasttext_model) for tweet in X_val])

elif method == "GloVe":
    def load_glove_model(glove_file):
        """Parse a GloVe text file into a dict mapping each token to its vector.

        Every line is expected to be `token v1 v2 ... vN`, separated by single
        ASCII spaces. Lines without at least one value are skipped.
        """
        model = {}
        with open(glove_file, 'r', encoding='utf-8') as f:
            for line in f:
                # Split on the ASCII space only: a bare split() also breaks on
                # Unicode whitespace (e.g. NBSP, NEL) that may belong to the
                # token itself, which would shift the columns.
                fields = line.rstrip().split(' ')
                if len(fields) < 2:
                    continue  # blank or truncated line: nothing to store
                model[fields[0]] = np.array([float(val) for val in fields[1:]])
        return model

    glove_path = '../src/data/glove.twitter.27B/glove.twitter.27B.200d.txt'
    glove_model = load_glove_model(glove_path)

    def get_glove_embeddings(tweet, model, embedding_dim=200):
        """Average the GloVe vectors of the words in `tweet` that are present in `model`.

        Returns a zero vector of length `embedding_dim` when none of the words is known.
        """
        embeddings = [model[word] for word in tweet.split() if word in model]
        if embeddings:
            return np.mean(embeddings, axis=0)
        return np.zeros(embedding_dim)

    X_train = np.array([get_glove_embeddings(tweet, glove_model) for tweet in X_train])
    X_val = np.array([get_glove_embeddings(tweet, glove_model) for tweet in X_val])

else:
    raise ValueError("Method not allowed.")



In [None]:
# Baseline estimators paired with their display names. Declaring each pair on
# one line keeps `models` and `models_names` aligned when an entry is added,
# removed or commented out.
_named_models = [
    ('Logistic Regression', LogisticRegression(random_state=random_state)),
    ('Support Vector Machine', LinearSVC(random_state=random_state)),
    # ('K Nearest Neighbors', KNeighborsClassifier(n_jobs=-1)),  # disabled
    ('Multi Layer Perceptron', MLPClassifier(verbose=False, random_state=random_state)),
    ('Random Forest', RandomForestClassifier(random_state=random_state, n_jobs=-1)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=random_state)),
    ('AdaBoost', AdaBoostClassifier(random_state=random_state)),
    ('Extra Trees', ExtraTreesClassifier(random_state=random_state)),
    ('Ridge Classifier', RidgeClassifier(random_state=random_state)),
    ('SGD Classifier', SGDClassifier(random_state=random_state)),
    ('Gaussian Naive Bayes', GaussianNB()),
    # NOTE(review): `use_label_encoder` is reported as deprecated by recent
    # xgboost releases — confirm against the installed version.
    ('XGBoost', XGBClassifier(random_state=random_state, use_label_encoder=False, eval_metric='logloss')),
    ('LightGBM', LGBMClassifier(random_state=random_state)),
]

# The two parallel lists consumed by the cells below.
models = [estimator for label, estimator in _named_models]
models_names = [label for label, estimator in _named_models]

In [None]:
# Fit every baseline estimator on the training split, then report its
# validation accuracy and the wall-clock time spent on fit + predict.
for model, model_name in zip(models, models_names):
    print(f"\n########       {model_name}       ########")
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments.
    starting_time = time.perf_counter()

    try:
        # Fit the model, then predict on the validation set
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
    except TypeError as error:
        # Dense-only estimators (GaussianNB among them) raise TypeError when
        # given the sparse matrix produced by the BoW vectorizer. Report it and
        # carry on so the remaining models are still evaluated. A TypeError on
        # dense input is unexpected and is re-raised.
        if not hasattr(X_train, "toarray"):
            raise
        print(f'Skipped: {error}')
        continue

    # Calculate accuracy
    accuracy = accuracy_score(y_val, y_pred)
    print(f'Accuracy: {accuracy:.4f}')

    # For per-class precision/recall, uncomment:
    #print(classification_report(y_val, y_pred))

    ending_time = time.perf_counter()
    print(f'Elapsed time: {timedelta(seconds=ending_time - starting_time)}')

In [None]:
# Disabled cell: everything below is wrapped in a string literal, so none of it
# executes and `models`, `models_names` and `models_hparams` are NOT (re)defined here.
# The text holds 12 estimators, their 12 display names and 12 hyperparameter grids,
# listed in the same positional order.
# NOTE(review): the grids use 'auto' for `max_features` and 'log' for the SGD `loss`;
# recent scikit-learn releases appear to reject both — confirm before re-enabling.
'''
models = [
    LogisticRegression(random_state=random_state),
    LinearSVC(random_state=random_state),
    #KNeighborsClassifier(n_jobs=-1),
    MLPClassifier(verbose=False, random_state=random_state),
    RandomForestClassifier(random_state=random_state, n_jobs=-1),
    GradientBoostingClassifier(random_state=random_state),
    AdaBoostClassifier(random_state=random_state),
    ExtraTreesClassifier(random_state=random_state),
    RidgeClassifier(random_state=random_state),
    SGDClassifier(random_state=random_state),
    GaussianNB(),
    XGBClassifier(random_state=random_state, use_label_encoder=False, eval_metric='logloss'),
    LGBMClassifier(random_state=random_state)
]

          
models_names = [
    'Logistic Regression',
    'Support Vector Machine',
    #'K Nearest Neighbors',
    'Multi Layer Perceptron',
    'Random Forest',
    'Gradient Boosting',
    'AdaBoost',
    'Extra Trees',
    'Ridge Classifier',
    'SGD Classifier',
    'Gaussian Naive Bayes',
    'XGBoost',
    'LightGBM'
]


models_hparams = [
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': [1e7], 'max_iter':[1000]},
    #{'solver': ['saga'], 'penalty': ['l2'], 'C': [9e-2], 'fit_intercept':[True]},
    
    {'penalty': ['l2'], 'C': [3.75e-2], 'fit_intercept':[True]},
    
    #{'n_neighbors': [10], 'weights':['uniform']},
    
    {'hidden_layer_sizes': [(20)], 'max_iter': [100], 'activation': ['relu'], 'solver': ['adam'], 'alpha': [1]},
    
    {'n_estimators': [50, 100, 200], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [None, 10, 20, 30], 'criterion': ['gini', 'entropy']},
    
    {'learning_rate': [0.01, 0.1, 0.2, 0.3], 'n_estimators': [50, 100, 200], 'max_depth': [3, 4, 5, 7, 10]},
    
    {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.5, 1]},
    
    {'n_estimators': [50, 100, 200], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [None, 10, 20, 30], 'criterion': ['gini', 'entropy']},
    
    {'alpha': [0.1, 1.0, 10.0], 'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg']},
    
    {'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge'], 'penalty': ['l2', 'l1', 'elasticnet'], 'alpha': [0.0001, 0.001, 0.01, 0.1]},
    
    {},  # GaussianNB has no hyperparameters to tune

    {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 4, 5, 7, 10]},
    
    {'num_leaves': [31, 50, 100], 'learning_rate': [0.01, 0.1, 0.2], 'n_estimators': [50, 100, 200], 'max_depth': [-1, 3, 5, 7, 10]}
]
'''


In [None]:
# Disabled cell: this grid-search loop sits inside a string literal and never runs.
# As written, it would run a 5-fold, accuracy-scored GridSearchCV for every
# (model, name, grid) triple, collecting the best parameters in `chosen_hparams` and
# (name, best CV score, best estimator) tuples in `estimators`.
# NOTE(review): later cells read `estimators`, which is only assigned inside this
# disabled text — on a fresh kernel they fail with NameError until this is re-enabled.
'''chosen_hparams = list()
estimators = list()

for model, model_name, hparams in zip(models, models_names, models_hparams):
    
        print("\n########       {}       ########".format(model_name))
        starting_time = time.time()
        clf = GridSearchCV(estimator=model, param_grid=hparams, scoring='accuracy', cv=5)
        clf.fit(X_train, y_train)
        ending_time = time.time()
        chosen_hparams.append(clf.best_params_)
        estimators.append((model_name, clf.best_score_, clf.best_estimator_))
        
        for hparam in hparams:
            print(f'\t--> best value for hyperparameter "{hparam}": ', clf.best_params_.get(hparam))
        
        mean_test_score = clf.cv_results_['mean_test_score'][clf.best_index_]
        std_test_score = clf.cv_results_['std_test_score'][clf.best_index_]
    
        print(f'\t--> best model mean accuracy:', mean_test_score)
        print(f'\t--> best model std:', std_test_score)
        print(f'\tElapsed time for GridSearch: ', timedelta(seconds=ending_time - starting_time))
'''

In [None]:
### Ensemble: Stacking Classifier with top 3 weak learners

# `estimators` is expected to hold (name, accuracy, estimator) tuples. In this
# notebook it is only assigned by the grid-search cell, which is currently
# disabled, so fail with an actionable message instead of a bare NameError.
if "estimators" not in globals():
    raise NameError(
        "`estimators` is not defined: run the grid-search cell that builds it "
        "(currently disabled) before creating the stacking ensemble."
    )

# Rank the candidates in place, best accuracy first.
estimators.sort(key=lambda entry: entry[1], reverse=True)

# StackingClassifier takes (name, estimator) pairs, so drop the score of the top 3.
top3_clfs = [(entry[0], entry[2]) for entry in estimators[:3]]

# Stack the top 3 learners under a logistic-regression meta-learner.
clf_stack = StackingClassifier(estimators=top3_clfs, final_estimator=LogisticRegression())

In [None]:
# Re-score every candidate with 5-fold cross-validated accuracy on the training split.
perf_eval_estimators = []
for candidate in estimators:
    model_name, model = candidate[0], candidate[2]
    scores = cross_validate(model, X_train, y_train, cv=5, scoring='accuracy')
    mean_accuracy = np.mean(scores['test_score'])
    print('\n')
    print('The cross-validated Accuracy of {} is: '.format(model_name), mean_accuracy)
    perf_eval_estimators.append((model_name, mean_accuracy, model))

# Score the stacking ensemble the same way and add it to the candidates.
scores = cross_validate(clf_stack, X_train, y_train, cv=5, scoring='accuracy')
stacking_accuracy = np.mean(scores['test_score'])
print('\n')
print('The cross-validated Accuracy of Stacking Model is ', stacking_accuracy)
perf_eval_estimators.append(('Stacking Classifier', stacking_accuracy, clf_stack))

# Best mean accuracy first; the stable sort keeps insertion order among ties.
perf_eval_estimators.sort(key=lambda entry: entry[1], reverse=True)
final_model_name, final_model_accuracy, final_model = perf_eval_estimators[0]

print("\n######## The Final Model selected is: ########")
print(final_model_name)
print('The cross-validated Accuracy is: ', final_model_accuracy)

In [None]:
# Refit the selected model on the full training split.
final_model.fit(X=X_train, y=y_train)

In [None]:
# Hyperparameter grids that were tried, one dict per estimator.
# NOTE(review): except for GaussianNB, the estimator labels below are inferred from
# the parameter names only — confirm them against the model list these grids were paired with.
models_hparams = [
    # presumably LogisticRegression
    dict(
        solver=['liblinear', 'saga'],
        penalty=['l1', 'l2'],
        C=[1e-5, 5e-5, 1e-4, 5e-4, 1],
        fit_intercept=[True, False],
    ),
    # presumably a kernel SVM
    dict(
        C=[1e-4, 1e-2, 1, 1e1, 1e2],
        gamma=['scale', 1e-2, 1e-3, 1e-4, 1e-5],
        kernel=['linear', 'rbf'],
    ),
    # presumably KNeighborsClassifier (odd neighbour counts below 10)
    dict(n_neighbors=[1, 3, 5, 7, 9]),
    # presumably DecisionTreeClassifier
    dict(
        max_depth=[3, 4, 5, 7, 10],
        criterion=['gini', 'entropy'],
    ),
    # presumably MLPClassifier (layer sizes are plain ints, not tuples)
    dict(
        hidden_layer_sizes=[40, 50, 80],
        max_iter=[100],
        activation=['logistic', 'relu'],
        solver=['lbfgs', 'sgd', 'adam'],
        alpha=[1e-4, 1e-2, 1, 1e1, 1e2],
    ),
    # presumably RandomForestClassifier
    dict(
        n_estimators=[50, 100, 200],
        max_features=['auto', 'sqrt', 'log2'],
        max_depth=[None, 10, 20, 30],
        criterion=['gini', 'entropy'],
    ),
    # presumably GradientBoostingClassifier
    dict(
        learning_rate=[0.01, 0.1, 0.2, 0.3],
        n_estimators=[50, 100, 200],
        max_depth=[3, 4, 5, 7, 10],
    ),
    # presumably AdaBoostClassifier
    dict(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.1, 0.5, 1],
    ),
    # presumably ExtraTreesClassifier
    dict(
        n_estimators=[50, 100, 200],
        max_features=['auto', 'sqrt', 'log2'],
        max_depth=[None, 10, 20, 30],
        criterion=['gini', 'entropy'],
    ),
    # presumably RidgeClassifier
    dict(
        alpha=[0.1, 1.0, 10.0],
        solver=['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg'],
    ),
    # presumably SGDClassifier
    dict(
        loss=['hinge', 'log', 'modified_huber', 'squared_hinge'],
        penalty=['l2', 'l1', 'elasticnet'],
        alpha=[0.0001, 0.001, 0.01, 0.1],
    ),
    # GaussianNB has no hyperparameters to tune
    dict(),
    # presumably RadiusNeighborsClassifier
    dict(
        radius=[1.0, 1.5, 2.0, 2.5, 3.0],
        weights=['uniform', 'distance'],
    ),
    # presumably QuadraticDiscriminantAnalysis
    dict(
        reg_param=[0.0, 0.1, 0.5, 1.0],
        tol=[0.0001, 0.001, 0.01, 0.1],
    ),
    # presumably LinearDiscriminantAnalysis
    dict(
        solver=['svd', 'lsqr', 'eigen'],
        shrinkage=['auto', None, 0.1, 0.5, 1.0],
    ),
    # presumably HistGradientBoostingClassifier
    dict(
        learning_rate=[0.01, 0.1, 0.2],
        max_iter=[100, 200],
        max_leaf_nodes=[31, 127, 255],
        max_depth=[None, 3, 5, 7, 10],
    ),
]