In [26]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, cross_validate
import time
from datetime import timedelta
from scipy.sparse import vstack
import joblib 

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

random_state = 42

In [27]:
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

def tweet_to_glove_vector(tweet, embeddings, vector_size=200):
    words = tweet.lower().split()
    tweet_vec = np.zeros(vector_size)
    count = 0
    for word in words:
        if word in embeddings:
            tweet_vec += embeddings[word]
            count += 1
    if count != 0:
        tweet_vec /= count
    return tweet_vec

In [28]:
method = "GloVe" # BoW GloVe
input_path = '../data/processed/train_small.csv'
glove_path = '../data/external/glove.twitter.27B/glove.twitter.27B.200d.txt'
hparams_tuning = True

In [29]:
df = pd.read_csv(input_path)

# Shuffle the DataFrame
df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
print(df.head())

# OCHO MI DA ERRORE PER DEI NAN (causa pre-processing, la parte di cleaning)
df = df.dropna(subset=['tweet'])

# Label encoding
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Check the mapping
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(label_mapping)

X = df['tweet']
y = df['label']

if hparams_tuning == False:
    # Split data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=random_state)

    if method=="BoW":
        vectorizer = CountVectorizer(max_features=5000)
        X_train = vectorizer.fit_transform(X_train)
        X_val = vectorizer.transform(X_val)
        
    elif method=="GloVe":
        glove_embeddings  = load_glove_embeddings(glove_path)
        print(f"Loaded {len(glove_embeddings )} word vectors.")
        X_train = np.array([tweet_to_glove_vector(tweet, glove_embeddings) for tweet in X_train])
        X_val= np.array([tweet_to_glove_vector(tweet, glove_embeddings) for tweet in X_val])
else:
    if method=="BoW":
        vectorizer = CountVectorizer(max_features=5000)
        X = vectorizer.fit_transform(X)
        
    elif method=="GloVe":
        glove_embeddings  = load_glove_embeddings(glove_path)
        print(f"Loaded {len(glove_embeddings )} word vectors.")
        X = np.array([tweet_to_glove_vector(tweet, glove_embeddings) for tweet in X])
    

                                               tweet     label
0                        so nice out but so fkn cold  negative
1  okay i hope you will not fool me , as did one ...  positive
2  assassin's creed limited edition ( video game ...  negative
3    lol qt lmao . hell we can start our day early !  positive
4  me and my friend think you look like stacy sol...  positive
{'negative': 0, 'positive': 1}
Loaded 1193514 word vectors.


In [30]:
models = [
    LogisticRegression(random_state=random_state),
    LinearSVC(random_state=random_state),
    RidgeClassifier(random_state=random_state),
    SGDClassifier(random_state=random_state),
    ExtraTreesClassifier(random_state=random_state),
    MLPClassifier(verbose=False, random_state=random_state),
]

         
models_names = [
    'Logistic Regression',
    'Support Vector Machine',
    'Ridge Classifier',
    'SGD Classifier',
    'Extra Trees',
    'Multi Layer Perceptron',
]

In [31]:
if hparams_tuning == True:
    models_hparams = [
    # LogisticRegression
    {'C': [0.01, 0.1, 1, 10], 'solver': ['lbfgs', 'saga']},
    
    # LinearSVC
    {'C': [0.01, 0.1, 1, 10], 'loss': ['hinge', 'squared_hinge']},
    
    # RidgeClassifier
    {'alpha': [0.01, 0.1, 1, 10]},
    
    # SGDClassifier
    {'loss': ['hinge', 'log'], 'alpha': [0.001, 0.01, 0.1], 'penalty': ['l2', 'l1', 'elasticnet']},
    
    # ExtraTreesClassifier
    #{'n_estimators': [50, 100, 200], 'min_samples_split': [2, 5, 10], 'criterion': ['gini']},
    
    # MLPClassifier
    #{'hidden_layer_sizes': [(50,), (100,), (50, 50)], 'activation': ['relu'], 'solver': ['adam'], 'alpha': [0.001, 0.01, 1]}
    ]
    chosen_hparams = list()
    estimators = list()
    results = list()
    for model, model_name, hparams in zip(models, models_names, models_hparams):
        
            print("\n########       {}       ########".format(model_name))
            starting_time = time.time()
            clf = GridSearchCV(estimator=model, param_grid=hparams, scoring='accuracy', cv=5)
            clf.fit(X_train, y_train)
            ending_time = time.time()
            chosen_hparams.append(clf.best_params_)
            estimators.append((model_name, clf.best_score_, clf.best_estimator_))
            
            for hparam in hparams:
                print(f'\t--> best value for hyperparameter "{hparam}": ', clf.best_params_.get(hparam))
            
            mean_accuracy = clf.cv_results_['mean_test_score'][clf.best_index_]
            std_score = clf.cv_results_['std_test_score'][clf.best_index_]
            
            # Save models with repsective accuracy
            results.append((model_name, model, mean_accuracy, std_score))
        
            print(f'\t--> best model mean accuracy:', mean_accuracy)
            print(f'\t--> best model std:', std_score)
            print(f'\tElapsed time for GridSearch: ', timedelta(seconds=ending_time - starting_time))
            
    # Find the best model based on accuracy
    best_model_name, best_model, best_accuracy, _ = max(results, key=lambda x: x[2])
    print(f"\nBest Model: {best_model_name} with accuracy {best_accuracy}")
            
else:
    results = list()
    for model, model_name in zip(models, models_names):
        print(f"\n########       {model_name}       ########")
        starting_time = time.time()
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        
        ending_time = time.time()
        print(f'Elapsed time: {timedelta(seconds=ending_time - starting_time)}')
        
        # Calculate accuracy
        accuracy = accuracy_score(y_val, y_pred)
        print(f'Accuracy: {accuracy:.4f}')
        
        # Save models with repsective accuracy
        results.append((model_name, model, accuracy))
        
    # Find the best model based on accuracy
    best_model_name, best_model, best_accuracy = max(results, key=lambda x: x[2])
    print(f"\nBest Model: {best_model_name} with accuracy {best_accuracy}")


########       Logistic Regression       ########
	--> best value for hyperparameter "C":  1
	--> best value for hyperparameter "solver":  saga
	--> best model mean accuracy: 0.786856003915672
	--> best model std: 0.0014278692941728879
	Elapsed time for GridSearch:  0:03:57.999010

########       Support Vector Machine       ########


KeyboardInterrupt: 

In [34]:
# Final training based on best model:
if hparams_tuning == False:
    # Combine the training and validation sets for final training
    X = vstack([X_train, X_val])
    y = np.concatenate((y_train, y_val))

# Final training on entire data
best_model.fit(X, y)

# Save the trained model to disk
joblib.dump(best_model, f'../models/model.pkl')

['../models/model.pkl']

In [10]:
# Test set
df_test = pd.read_csv('../data/processed/test.csv')


best_model = joblib.load('../models/model.pkl')
X_test = df_test['tweet']

# Transform the test set using the same vectorizer and make predictions
if method == "BoW":
    X_test_vec = vectorizer.transform(X_test)
elif method == "GloVe":
    X_test_vec= np.array([tweet_to_glove_vector(tweet, glove_embeddings) for tweet in X_test])

y_test_pred = best_model.predict(X_test_vec)

# Create the final DataFrame with Id and Prediction columns
df_test['prediction'] = y_test_pred
df_test['prediction'] = df_test['prediction'].replace(0, -1)
df_final = df_test[['id', 'prediction']]
df_final = df_final.rename(columns={'id': 'Id', 'prediction': 'Prediction'})

# Save the final DataFrame to a CSV file
df_final.to_csv(f'../results/predictions.csv', index=False)

# Print the first few rows of the final DataFrame
print(df_final.head())

   Id  Prediction
0   1          -1
1   2          -1
2   3          -1
3   4           1
4   5          -1


# OLD

In [None]:
# Instantiate the Stacking Classifier with the top 3 weak learners
top_weak_learners = [(model_name, model) for model_name, model, _ in top_2_models]
clf_stack = StackingClassifier(estimators = top_weak_learners, final_estimator = LogisticRegression())

# Fit the StackingClassifier on the training data
starting_time = time.time()
clf_stack.fit(X_train, y_train)
ending_time = time.time()
print(f'Elapsed time: {timedelta(seconds=ending_time - starting_time)}')
# Predict on validation set
y_pred_stack = clf_stack.predict(X_val)
# Calculate accuracy
accuracy_stack = accuracy_score(y_val, y_pred_stack)
# Add the StackingClassifier result
results.append(('Stacking Classifier', clf_stack, accuracy_stack))
print(f'\nStacking Classifier Accuracy: {accuracy_stack}')

In [None]:
# Find the best model based on accuracy
best_model_name, best_model, best_accuracy = max(results, key=lambda x: x[2])

print(f"\nBest Model: {best_model_name} with accuracy {best_accuracy}")

# Final training on entire data
best_model.fit(X, y)

# Save the trained model to disk
import joblib
joblib.dump(best_model, f'{method}_best_model.pkl')

### GridSearchCV

In [None]:
'''
models = [
    LogisticRegression(random_state=random_state),
    LinearSVC(random_state=random_state),
    #KNeighborsClassifier(n_jobs=-1),
    MLPClassifier(verbose=False, random_state=random_state),
    RandomForestClassifier(random_state=random_state, n_jobs=-1),
    GradientBoostingClassifier(random_state=random_state),
    AdaBoostClassifier(random_state=random_state),
    ExtraTreesClassifier(random_state=random_state),
    RidgeClassifier(random_state=random_state),
    SGDClassifier(random_state=random_state),
    GaussianNB(),
    XGBClassifier(random_state=random_state, use_label_encoder=False, eval_metric='logloss'),
    LGBMClassifier(random_state=random_state)
]

          
models_names = [
    'Logistic Regression',
    'Support Vector Machine',
    #'K Nearest Neighbors',
    'Multi Layer Perceptron',
    'Random Forest',
    'Gradient Boosting',
    'AdaBoost',
    'Extra Trees',
    'Ridge Classifier',
    'SGD Classifier',
    'Gaussian Naive Bayes',
    'XGBoost',
    'LightGBM'
]


models_hparams = [
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': [1e7], 'max_iter':[1000]},
    #{'solver': ['saga'], 'penalty': ['l2'], 'C': [9e-2], 'fit_intercept':[True]},
    
    {'penalty': ['l2'], 'C': [3.75e-2], 'fit_intercept':[True]},
    
    #{'n_neighbors': [10], 'weights':['uniform']},
    
    {'hidden_layer_sizes': [(20)], 'max_iter': [100], 'activation': ['relu'], 'solver': ['adam'], 'alpha': [1]},
    
    {'n_estimators': [50, 100, 200], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [None, 10, 20, 30], 'criterion': ['gini', 'entropy']},
    
    {'learning_rate': [0.01, 0.1, 0.2, 0.3], 'n_estimators': [50, 100, 200], 'max_depth': [3, 4, 5, 7, 10]},
    
    {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.5, 1]},
    
    {'n_estimators': [50, 100, 200], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [None, 10, 20, 30], 'criterion': ['gini', 'entropy']},
    
    {'alpha': [0.1, 1.0, 10.0], 'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg']},
    
    {'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge'], 'penalty': ['l2', 'l1', 'elasticnet'], 'alpha': [0.0001, 0.001, 0.01, 0.1]},
    
    {},  # GaussianNB has no hyperparameters to tune

    {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 4, 5, 7, 10]},
    
    {'num_leaves': [31, 50, 100], 'learning_rate': [0.01, 0.1, 0.2], 'n_estimators': [50, 100, 200], 'max_depth': [-1, 3, 5, 7, 10]}
]
'''


In [None]:
'''chosen_hparams = list()
estimators = list()

for model, model_name, hparams in zip(models, models_names, models_hparams):
    
        print("\n########       {}       ########".format(model_name))
        starting_time = time.time()
        clf = GridSearchCV(estimator=model, param_grid=hparams, scoring='accuracy', cv=5)
        clf.fit(X_train, y_train)
        ending_time = time.time()
        chosen_hparams.append(clf.best_params_)
        estimators.append((model_name, clf.best_score_, clf.best_estimator_))
        
        for hparam in hparams:
            print(f'\t--> best value for hyperparameter "{hparam}": ', clf.best_params_.get(hparam))
        
        mean_test_score = clf.cv_results_['mean_test_score'][clf.best_index_]
        std_test_score = clf.cv_results_['std_test_score'][clf.best_index_]
    
        print(f'\t--> best model mean accuracy:', mean_test_score)
        print(f'\t--> best model std:', std_test_score)
        print(f'\tElapsed time for GridSearch: ', timedelta(seconds=ending_time - starting_time))
'''

In [None]:
### Ensemble: Stacking Classifier with top 3 weak learners

# Sort estimators by the accuracy metric
estimators.sort(key=lambda i:i[1],reverse=True)

# Get the top 3 classifiers by their accuracy metric
top3_clfs = list()
for clf in estimators[0:3]:
    top3_clfs.append((clf[0], clf[2]))
    
# Instantiate the Stacking Classifier with the top 3 weak learners
clf_stack = StackingClassifier(estimators = top3_clfs, final_estimator = LogisticRegression())

In [None]:
perf_eval_estimators = list()
for model_tuple in estimators:
    model_name = model_tuple[0]
    model = model_tuple[2]
    scores = cross_validate(model, X_train, y_train, cv=5, scoring=('accuracy'))
    print('\n')
    print('The cross-validated Accuracy of {} is: '.format(model_name), np.mean(scores['test_score']))
    perf_eval_estimators.append((model_name, np.mean(scores['test_score']), model))

# Cross Validation for Stacking Ensemble
scores = cross_validate(clf_stack, X_train, y_train, cv=5, scoring=('accuracy'))
print('\n')
print('The cross-validated Accuracy of Stacking Model is ', np.mean(scores['test_score']))

perf_eval_estimators.append( ('Stacking Classifier', np.mean(scores['test_score']), clf_stack) )

perf_eval_estimators.sort(key=lambda i:i[1],reverse=True)
final_model = perf_eval_estimators[0][2]
final_model_accuracy = perf_eval_estimators[0][1]
final_model_name = perf_eval_estimators[0][0]
print("\n######## The Final Model selected is: ########")
print(final_model_name)
print('The cross-validated Accuracy is: ', final_model_accuracy)

In [None]:
# Final training
final_model.fit(X_train, y_train)

In [None]:
# Tested model hparams
models_hparams = [
    {'solver': ['liblinear', 'saga'], 'penalty': ['l1', 'l2'], 'C': [1e-5, 5e-5, 1e-4, 5e-4, 1], 'fit_intercept':[True, False]},
    
    {'C': [1e-4, 1e-2, 1, 1e1, 1e2], 'gamma': ['scale', 1e-2, 1e-3, 1e-4, 1e-5], 'kernel': ['linear', 'rbf']},
    
    {'n_neighbors': list(range(1, 10, 2))},
    
    {'max_depth': [3, 4, 5, 7, 10], 'criterion': ['gini', 'entropy']},
    
    {'hidden_layer_sizes': [(40), (50), (80)], 'max_iter': [100],
     'activation': ['logistic', 'relu'], 'solver': ['lbfgs', 'sgd', 'adam'], 'alpha': [1e-4, 1e-2, 1, 1e1, 1e2]},
    
    {'n_estimators': [50, 100, 200], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [None, 10, 20, 30], 'criterion': ['gini', 'entropy']},
    
    {'learning_rate': [0.01, 0.1, 0.2, 0.3], 'n_estimators': [50, 100, 200], 'max_depth': [3, 4, 5, 7, 10]},
    
    {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.5, 1]},
    
    {'n_estimators': [50, 100, 200], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [None, 10, 20, 30], 'criterion': ['gini', 'entropy']},
    
    {'alpha': [0.1, 1.0, 10.0], 'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg']},
    
    {'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge'], 'penalty': ['l2', 'l1', 'elasticnet'], 'alpha': [0.0001, 0.001, 0.01, 0.1]},
    
    {},  # GaussianNB has no hyperparameters to tune
    
    {'radius': [1.0, 1.5, 2.0, 2.5, 3.0], 'weights': ['uniform', 'distance']},
    
    {'reg_param': [0.0, 0.1, 0.5, 1.0], 'tol': [0.0001, 0.001, 0.01, 0.1]},
    
    {'solver': ['svd', 'lsqr', 'eigen'], 'shrinkage': ['auto', None, 0.1, 0.5, 1.0]},
    
    {'learning_rate': [0.01, 0.1, 0.2], 'max_iter': [100, 200], 'max_leaf_nodes': [31, 127, 255], 'max_depth': [None, 3, 5, 7, 10]}
]