In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV, cross_validate
import time
from datetime import timedelta
from gensim.models import FastText
from gensim.models.keyedvectors import KeyedVectors
from sklearn.utils import shuffle

ModuleNotFoundError: No module named 'fasttext'

In [4]:
method = "GloVe" # GloVe FastText
CV = "CV"
dtype = "small"
stacking = "noStacking"

In [5]:
df = pd.read_csv('../data/processed/train_small.csv')
random_state = 42

# Shuffle the DataFrame
df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
print(df.head())

# OCHO MI DA ERRORE PER DEI NAN (causa pre-processing, la parte di cleaning)
df = df.dropna(subset=['tweet'])

# Label encoding
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Check the mapping
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(label_mapping)

X = df['tweet']
y = df['label']

if method=="BoW":
    vectorizer = CountVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(X)
    
elif method=="FastText":
    fasttext_model = FastText(sentences=X.apply(lambda x: x.split()), vector_size=100, window=5, min_count=1, workers=4, sg=1)
    X = np.array([np.mean([fasttext_model.wv[word] for word in tweet.split() if word in fasttext_model.wv] or [np.zeros(100)], axis=0) for tweet in X])
    
elif method=="GloVe":
    def load_glove_model(glove_file):
        model = {}
        with open(glove_file, 'r', encoding='utf-8') as f:
            for line in f:
                split_line = line.split()
                word = split_line[0]
                embedding = np.array([float(val) for val in split_line[1:]])
                model[word] = embedding
        return model

    glove_path = '../data/external/glove.twitter.27B/glove.twitter.27B.200d.txt'
    glove_model = load_glove_model(glove_path)

    def get_glove_embeddings(tweet, model, embedding_dim=200):
        words = tweet.split()
        embeddings = [model[word] for word in words if word in model]
        if embeddings:
            return np.mean(embeddings, axis=0)
        else:
            return np.zeros(embedding_dim)

    X = np.array([get_glove_embeddings(tweet, glove_model) for tweet in X])
    
else:
    raise ValueError("Method not allowed.")



                                               tweet     label
0                        so nice out but so fkn cold  negative
1  okay i hope you will not fool me , as did one ...  positive
2  assassin's creed limited edition ( video game ...  negative
3    lol qt lmao . hell we can start our day early !  positive
4  me and my friend think you look like stacy sol...  positive
{'negative': 0, 'positive': 1}


In [6]:
models = [
    LogisticRegression(random_state=random_state),
    LinearSVC(random_state=random_state),
    RidgeClassifier(random_state=random_state),
    SGDClassifier(random_state=random_state),
    RandomForestClassifier(random_state=random_state, n_jobs=-1),
    ExtraTreesClassifier(random_state=random_state),
    MLPClassifier(verbose=False, random_state=random_state),
]

          
models_names = [
    'Logistic Regression',
    'Support Vector Machine',
    'Ridge Classifier',
    'SGD Classifier',
    'Random Forest',
    'Extra Trees',
    'Multi Layer Perceptron',
]

models_hparams = [
    # LogisticRegression
    {'C': [0.01, 0.1, 1, 10], 'solver': ['lbfgs', 'saga']},
    
    # LinearSVC
    {'C': [0.01, 0.1, 1, 10], 'loss': ['hinge', 'squared_hinge']},
    
    # RidgeClassifier
    {'alpha': [0.01, 0.1, 1, 10]},
    
    # SGDClassifier
    {'loss': ['hinge', 'log'], 'alpha': [0.001, 0.01, 0.1], 'penalty': ['l2', 'l1', 'elasticnet']},
    
    # RandomForestClassifier
    {'n_estimators': [50, 100, 200], 'min_samples_split': [2, 5, 10], 'criterion': ['gini']},
    
    # ExtraTreesClassifier
    {'n_estimators': [50, 100, 200], 'min_samples_split': [2, 5, 10], 'criterion': ['gini']},
    
    # MLPClassifier
    {'hidden_layer_sizes': [(50,), (100,), (50, 50)], 'activation': ['relu'], 'solver': ['adam'], 'alpha': [0.001, 0.01, 1]}
]

In [7]:
chosen_hparams = list()
estimators = list()
results = list()
for model, model_name, hparams in zip(models, models_names, models_hparams):
    
        print("\n########       {}       ########".format(model_name))
        starting_time = time.time()
        clf = GridSearchCV(estimator=model, param_grid=hparams, scoring='accuracy', cv=5)
        clf.fit(X, y)
        ending_time = time.time()
        chosen_hparams.append(clf.best_params_)
        estimators.append((model_name, clf.best_score_, clf.best_estimator_))
        
        for hparam in hparams:
            print(f'\t--> best value for hyperparameter "{hparam}": ', clf.best_params_.get(hparam))
        
        accuracy = clf.cv_results_['mean_test_score'][clf.best_index_]
        std_ = clf.cv_results_['std_test_score'][clf.best_index_]
        
        # Save models with repsective accuracy for further ensemble
        results.append((model_name, model, accuracy, std_))
    
        print(f'\t--> best model mean accuracy:', accuracy)
        print(f'\t--> best model std:', std_)
        print(f'\tElapsed time for GridSearch: ', timedelta(seconds=ending_time - starting_time))


########       Logistic Regression       ########


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

	--> best value for hyperparameter "C":  10
	--> best value for hyperparameter "solver":  saga
	--> best model mean accuracy: 0.787124356217811
	--> best model std: 0.0025922619399661086
	Elapsed time for GridSearch:  0:04:55.143755

########       Support Vector Machine       ########




KeyboardInterrupt: 

In [None]:
# Save models with repsective accuracy for further ensemble
results.append((model_name, model, accuracy, std_))
    
# Sort the models by accuracy
results.sort(key=lambda x: x[2], reverse=True)
# Get the top 3 models
top_3_models = results[:3]
print("\nTop 3 Models:")
for model_name, model, accuracy in top_3_models:
    print(f"{model_name}: Accuracy = {accuracy}")

In [35]:
from scipy.sparse import vstack

# Find the best model based on accuracy
best_model_name, best_model, best_accuracy = max(results, key=lambda x: x[2])

print(f"\nBest Model: {best_model_name} with accuracy {best_accuracy}")

# Combine the training and validation sets for final training
X_full = vstack([X_train, X_val])
y_full = np.concatenate((y_train, y_val))

# Final training on entire data
best_model.fit(X_full, y_full)

# Save the trained model to disk
import joblib
joblib.dump(best_model, f'../models/{method}_classifiers_CV_noStacking_small.pkl')


Best Model: Extra Trees with accuracy 0.815840792039602


['../models/BoW_classifiers_noCV_noStacking_small.pkl']

In [36]:
# Test set
df_test = pd.read_csv('../data/processed/test.csv')

model_path = f'../models/BoW_classifiers_CV_noStacking_small.pkl'

best_model = joblib.load(model_path)
X_test = df_test['tweet']

# Transform the test set using the same vectorizer and make predictions
if method == "BoW":
    X_test_vec = vectorizer.transform(X_test)
elif method == "FastText":
    X_test_vec = np.array([np.mean([fasttext_model.wv[word] for word in tweet.split() if word in fasttext_model.wv] or [np.zeros(100)], axis=0) for tweet in X_test])
elif method == "GloVe":
    X_test_vec = np.array([get_glove_embeddings(tweet, glove_model) for tweet in X_test])

y_test_pred = best_model.predict(X_test_vec)

# Create the final DataFrame with Id and Prediction columns
df_test['prediction'] = y_test_pred
df_test['prediction'] = df_test['prediction'].replace(0, -1)
df_final = df_test[['id', 'prediction']]
df_final = df_final.rename(columns={'id': 'Id', 'prediction': 'Prediction'})

# Save the final DataFrame to a CSV file
df_final.to_csv('../results/submission_classifiers_noCV_noStacking_small.csv', index=False)

# Print the first few rows of the final DataFrame
print(df_final.head())

   Id  Prediction
0   1          -1
1   2          -1
2   3          -1
3   4           1
4   5          -1


In [None]:
models_hparams = [
    # LogisticRegression
    {'C': [0.5, 1, 10], 'solver': ['lbfgs', 'saga']},
    
    # LinearSVC
    {'C': [0.01, 0.1, 1, 10], 'loss': ['hinge', 'squared_hinge']},
    
    # RidgeClassifier
    {'alpha': [0.01, 0.1, 1, 10]},
    
    # SGDClassifier
    {'loss': ['hinge', 'log'], 'alpha': [0.001, 0.01, 0.1], 'penalty': ['l2', 'l1', 'elasticnet']},
    
    # RandomForestClassifier
    {'n_estimators': [50, 100, 200], 'min_samples_split': [2, 5, 10], 'criterion': ['gini']},
    
    # ExtraTreesClassifier
    {'n_estimators': [50, 100, 200], 'min_samples_split': [2, 5, 10], 'criterion': ['gini']},
    
    # MLPClassifier
    {'hidden_layer_sizes': [(50,), (100,), (50, 50)], 'activation': ['relu'], 'solver': ['adam'], 'alpha': [0.001, 0.01, 1]}
]