# Classificazione Binaria

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from time import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Columns that are skipped while reading the CSV files
colonne_da_ignorare = [
    'author',
    'text',
    'subreddit',
    'time',
    'upper_words',
    'lemmatized_text',
    'severe_toxicity',
    'lista_emoji',
]
file_paths = ['File.csv']  # put your own file(s) here

# Read every input file, keeping only the columns that are not in the ignore list
dataframes = [
    pd.read_csv(file_path, usecols=lambda col: col not in colonne_da_ignorare)
    for file_path in file_paths
]

# Stack all files into a single frame, discarding the per-file row indices
df = pd.concat(dataframes, ignore_index=True)


In [None]:
# Make sure every toxicity value is numeric; entries that cannot be parsed become NaN
toxicity_scores = pd.to_numeric(df['toxicity'], errors='coerce')
df['toxicity'] = toxicity_scores

# Use the 'id' column as the row index (modifies df in place)
df.set_index('id', inplace=True)

In [None]:
# Build the binary target: 1 when the toxicity score is strictly above the
# threshold, 0 otherwise
toxicity_threshold = 0.5
is_toxic = df['toxicity'] > toxicity_threshold
df['toxicity_binary'] = is_toxic.astype(int)

In [None]:
# Train/test split for the binary task
df = df.fillna(0)  # replace every remaining missing value with 0

# Both target columns are removed from the feature matrix
colums_to_remove = ['toxicity_binary', 'toxicity']
y = df['toxicity_binary']
X = df.drop(columns=colums_to_remove)

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train_binary, y_test_binary = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Models for the binary classification task, keyed by the display name that is
# also used in the output file names.
models_binary_class = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Gaussian Naive Bayes': GaussianNB(),
    # random_state is fixed so that tie-breaking between equally good splits is
    # repeatable, in line with the seeded split and the seeded Random Forest.
    'Decision Tree': DecisionTreeClassifier(max_depth=5, min_samples_split=10, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
}

# Wrap every model in a pipeline that standardizes the features first, so the
# scaler is fitted on the training data only when the pipeline is fitted.
pipelines = {
    name: Pipeline([
        ('scaler', StandardScaler()),
        ('model', model),
    ])
    for name, model in models_binary_class.items()
}


In [None]:
# Output location for the result files. The value is concatenated directly in
# front of each file name, so a non-empty path needs its own trailing separator.
outfile_path = ''

In [None]:
# Save the confusion matrix and the classification report of one model
def save_results(name, y_test, y_pred, is_multiclass):
    """Write the confusion matrix (CSV) and classification report (text) to disk.

    Both files are created under the module-level ``outfile_path`` prefix and
    carry the model name in their file name.

    Args:
        name: Model name, used inside the output file names.
        y_test: True labels.
        y_pred: Predicted labels, same length as ``y_test``.
        is_multiclass: Accepted for compatibility with existing callers; it is
            not used by this function.
    """
    # Use the labels seen in either vector, so a class that is predicted but
    # never present in y_test still gets its row/column. This matches what
    # confusion_matrix does when no explicit labels are passed.
    labels = np.union1d(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    cm_df = pd.DataFrame(cm, index=labels, columns=labels)
    cm_filename = f'{outfile_path}confusion_matrix_{name}.csv'  # customize outfile_path to change the destination
    cm_df.to_csv(cm_filename)

    # Classification report, computed once and stored as plain text
    report = classification_report(y_test, y_pred)
    cr_filename = f'{outfile_path}classification_report_{name}.txt'  # customize outfile_path to change the destination
    with open(cr_filename, 'w', encoding='utf-8') as f:
        f.write(report)



### Risultati

In [None]:
# Collected predictions: the true labels first, then one entry per model
predictions_bin = {'real': y_test_binary}

# Train and evaluate every binary-classification pipeline
print("Classificazione Binaria:")
pipeline_progress = tqdm(pipelines.items(), desc="Modelli in esecuzione", unit="modello")
for name, pipeline in pipeline_progress:
    # The measured time covers both fitting and predicting
    start_time = time()
    pipeline.fit(X_train, y_train_binary)
    y_pred = pipeline.predict(X_test)
    elapsed_time = time() - start_time
    print(f"{name} completed in {elapsed_time:.2f} seconds")

    # Keep the predictions for the final CSV
    predictions_bin[name] = y_pred

    # Persist the confusion matrix and the classification report
    save_results(name, y_test_binary, y_pred, is_multiclass=False)

    # Show the evaluation metrics on screen
    print(f"Model: {name}")
    accuracy = accuracy_score(y_test_binary, y_pred)
    print("Accuracy:", accuracy)
    matrix = confusion_matrix(y_test_binary, y_pred)
    print("Confusion Matrix:\n", matrix)
    report = classification_report(y_test_binary, y_pred)
    print("Classification Report:\n", report)
    print("-" * 30)

# One column per model next to the true labels, written out as CSV
df_predictions = pd.DataFrame(predictions_bin)
df_predictions.to_csv(f'{outfile_path}binary_classification_results.csv', index=True)


# Regressione lineare

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import mean_absolute_percentage_error
from time import time


In [None]:
# Split the data into training and test sets for the regression task.
# The features X are reused; the target is the continuous toxicity score.
X_train, X_test, y_train_reg, y_test_reg = train_test_split(
    X,
    df['toxicity'],
    test_size=0.2,
    random_state=42,
)


In [None]:
# Regression models, keyed by the display name that is also used in the
# output file names.
models_regression = {
    'Linear Regression': LinearRegression(),
    'Lasso': Lasso(alpha=0.1),
    'Ridge': Ridge(alpha=1.0),
    # random_state is fixed so that repeated runs build the same tree, in line
    # with the seeded train/test split.
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
}


In [None]:
# Compute, save and print the regression metrics of one model
def save_regression_results(name, y_test, y_pred, n_features=None):
    """Compute regression metrics, write them to a text file and print them.

    The metrics are MSE, MAE, RMSE, MAPE, R-squared and adjusted R-squared.
    They are written to ``regression_metrics_{name}.txt`` in the current
    working directory.

    Args:
        name: Model name, used in the file name and in the printed header.
        y_test: True target values.
        y_pred: Predicted target values, same length as ``y_test``.
        n_features: Number of predictors used for the adjusted R-squared.
            When omitted, the column count of the module-level ``X_train``
            is used.
    """
    # Metric computation
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    # NOTE(review): MAPE divides by the true values, so it can become very
    # large when y_test contains values at or near zero - confirm it is meaningful here.
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Adjusted R-squared; undefined unless there are more samples than
    # predictors + 1, in which case NaN is reported instead of dividing by
    # zero or by a negative number.
    n = len(y_test)
    p = X_train.shape[1] if n_features is None else n_features
    degrees_of_freedom = n - p - 1
    if degrees_of_freedom > 0:
        r2_adj = 1 - (1 - r2) * (n - 1) / degrees_of_freedom
    else:
        r2_adj = float('nan')

    # Save the metrics; UTF-8 is required because the labels contain the
    # non-ASCII character '²'.
    metrics_filename = f'regression_metrics_{name}.txt'
    with open(metrics_filename, 'w', encoding='utf-8') as f:
        f.write(f'Mean Squared Error (MSE): {mse}\n')
        f.write(f'Mean Absolute Error (MAE): {mae}\n')
        f.write(f'Root Mean Squared Error (RMSE): {rmse}\n')
        f.write(f'Mean Absolute Percentage Error (MAPE): {mape}\n')
        f.write(f'R-squared (R²): {r2}\n')
        f.write(f'Adjusted R-squared (Adjusted R²): {r2_adj}\n')

    # Print the results on screen
    print(f"Risultati per il modello: {name}")
    print(f"  Mean Squared Error (MSE): {mse:.4f}")
    print(f"  Mean Absolute Error (MAE): {mae:.4f}")
    print(f"  Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"  Mean Absolute Percentage Error (MAPE): {mape:.4f}")
    print(f"  R-squared (R²): {r2:.4f}")
    print(f"  Adjusted R-squared (Adjusted R²): {r2_adj:.4f}")
    print(f"\n\nSalvato: {metrics_filename}\n\n")
    print("-" * 40)





### Risultati

In [None]:
# Collected predictions: the true values first, then one entry per model
predictions_reg = {'real': y_test_reg}

# Train and evaluate every regression model
print("Regressione:")
model_progress = tqdm(models_regression.items(), desc="Modelli in esecuzione", unit="modello")
for name, model in model_progress:
    # The measured time covers both fitting and predicting
    start_time = time()
    model.fit(X_train, y_train_reg)
    y_pred = model.predict(X_test)
    elapsed_time = time() - start_time
    print(f"{name} completed in {elapsed_time:.2f} seconds")

    # Keep the predictions for the final CSV
    predictions_reg[name] = y_pred

    # Evaluation metrics
    mse = mean_squared_error(y_test_reg, y_pred)
    r2 = r2_score(y_test_reg, y_pred)

    # Show the evaluation metrics on screen
    print(f"Model: {name}")
    print("Mean Squared Error (MSE):", mse)
    print("R^2 Score:", r2)
    print("-" * 30)

# One column per model next to the true values
df_predictions = pd.DataFrame(predictions_reg)

# Write the predictions to a CSV file
outfile_path = './'  # change the path to suit your needs
df_predictions.to_csv(f'{outfile_path}regression_results.csv', index=True)


In [None]:
from time import time
import pandas as pd
import numpy as np
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    mean_absolute_percentage_error,
)

# Collected predictions: the true values first, then one entry per model
predictions_reg = {}
predictions_reg['real'] = y_test_reg

# Sample and predictor counts for the adjusted R-squared; they do not depend
# on the model, so they are computed once before the loop.
n = len(y_test_reg)
p = X_train.shape[1]

# Train and evaluate every regression model
print("Regressione:")
for name, model in tqdm(models_regression.items(), desc="Modelli in esecuzione", unit="modello"):
    start_time = time()

    # Fit the model
    model.fit(X_train, y_train_reg)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Elapsed time for this model (fit + predict)
    elapsed_time = time() - start_time
    print(f"{name} completed in {elapsed_time:.2f} seconds")

    # Keep the predictions for the final CSV
    predictions_reg[name] = y_pred

    # Evaluation metrics
    mse = mean_squared_error(y_test_reg, y_pred)
    mae = mean_absolute_error(y_test_reg, y_pred)
    rmse = np.sqrt(mse)
    # NOTE(review): missing toxicity values were filled with 0 earlier in the
    # notebook; MAPE divides by the true values, so zero targets can make it
    # extremely large - confirm it is meaningful here.
    mape = mean_absolute_percentage_error(y_test_reg, y_pred)
    r2 = r2_score(y_test_reg, y_pred)

    # Adjusted R-squared; undefined unless n > p + 1, in which case NaN is
    # reported instead of dividing by zero or by a negative number.
    if n - p - 1 > 0:
        r2_adj = 1 - (1 - r2) * (n - 1) / (n - p - 1)
    else:
        r2_adj = float('nan')

    # Show the evaluation metrics on screen
    print(f"Risultati per il modello: {name}")
    print(f"  Mean Squared Error (MSE): {mse:.4f}")
    print(f"  Mean Absolute Error (MAE): {mae:.4f}")
    print(f"  Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"  Mean Absolute Percentage Error (MAPE): {mape:.4f}")
    print(f"  R-squared (R²): {r2:.4f}")
    print(f"  Adjusted R-squared (Adjusted R²): {r2_adj:.4f}")
    print("-" * 40)

    # Save the metrics to a text file; UTF-8 is required because the labels
    # contain the non-ASCII character '²'.
    metrics_filename = f'regression_metrics_{name}.txt'
    with open(metrics_filename, 'w', encoding='utf-8') as f:
        f.write(f'Model: {name}\n')
        f.write(f'Mean Squared Error (MSE): {mse:.4f}\n')
        f.write(f'Mean Absolute Error (MAE): {mae:.4f}\n')
        f.write(f'Root Mean Squared Error (RMSE): {rmse:.4f}\n')
        f.write(f'Mean Absolute Percentage Error (MAPE): {mape:.4f}\n')
        f.write(f'R-squared (R²): {r2:.4f}\n')
        f.write(f'Adjusted R-squared (Adjusted R²): {r2_adj:.4f}\n')

    print(f"Salvato: {metrics_filename}\n")

# One column per model next to the true values
df_predictions = pd.DataFrame(predictions_reg)

# Write the predictions to a CSV file
outfile_path = './'  # change the path to suit your needs
df_predictions.to_csv(outfile_path + 'regression_results.csv', index=True)

print(f"Predizioni salvate in: {outfile_path + 'regression_results.csv'}")
