In [28]:
# from google.colab import drive

# drive.mount("/content/drive")
# import pandas as pd

# df = pd.read_csv(
#     "/content/drive/MyDrive/Maestría En Ciencia de Datos/Segundo Trimestre/MachineLearning1/FinalProject/data/classification.csv"
# )

In [29]:
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression

from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)

from sklearn.metrics import accuracy_score

from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.decomposition import PCA

from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    PrecisionRecallDisplay as PRDisplay,
)

import numpy as np

import warnings

# Suppress every warning for the rest of the session. This keeps cell output
# compact, but it also hides deprecation and convergence warnings raised by the
# libraries imported above.
warnings.filterwarnings("ignore")

In [30]:
# Read the raw dataset; the path is relative to the notebook's working directory.
csv_path = "../data/classification.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the raw frame.
display(df.head(n=5))

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
df.info()

In [None]:
# Summary statistics, transposed so that each column of `df` becomes one row.
summary_stats = df.describe().transpose()
display(summary_stats)

In [None]:
# Count missing entries per column and show only the columns that have any.
null_values = df.isna().sum()
print("Number of null values in each column:")
display(null_values.loc[null_values > 0])

In [None]:
# List the distinct values of every column that is (or may be) categorical:
# object-typed columns, plus columns with fewer than 10 distinct values.
display(df.dtypes)
for column in df.columns:
    values = df[column]
    is_object = values.dtype == "object"
    # Skip high-cardinality, non-object columns.
    if not is_object and values.nunique() >= 10:
        continue
    if is_object:
        display(f"Unique values for categorical column '{column}':")
    else:
        display(f"Unique values for potential categorical column '{column}':")
    display(values.unique())

In [None]:
# One histogram per column of `df`, arranged on a 4x4 grid of subplots.
hist_options = dict(
    subplots=True,
    layout=(4, 4),
    bins=30,
    figsize=(16, 12),
    alpha=0.7,
    edgecolor="black",
)
df.plot.hist(**hist_options)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.mixture import GaussianMixture
import numpy as np


def test_gmm_components(x, max_components=5):
    """Pick the number of Gaussian mixture components that minimises BIC.

    Parameters
    ----------
    x : pandas.Series
        One-dimensional sample; missing values are dropped before fitting.
    max_components : int, default 5
        Largest number of components to try (1..max_components inclusive).

    Returns
    -------
    best_k : int
        Component count with the lowest BIC.
    bics : list of float
        BIC of each fitted model, ordered by component count starting at 1.
    """
    # GaussianMixture expects a 2-D array, hence the single-column reshape.
    samples = x.dropna().values.reshape(-1, 1)
    bics = [
        GaussianMixture(n_components=k, random_state=0).fit(samples).bic(samples)
        for k in range(1, max_components + 1)
    ]
    # argmin is zero-based while component counts start at 1.
    best_k = np.argmin(bics) + 1
    return best_k, bics


# Run the BIC-based component search on every column except the target "Y"
# and report the selected component count per column.
gmm_results = {}

for col in [name for name in df.columns if name != "Y"]:
    best_k, bics = test_gmm_components(df[col])
    gmm_results[col] = {"optimal_components": best_k, "bics": bics}

for col in gmm_results:
    result = gmm_results[col]
    print(f"{col}: GMM with {result['optimal_components']} components")

In [None]:
# Pairwise relationships between all columns, coloured by the target "Y".
# Off-diagonal panels are regression plots; the diagonal shows per-class KDEs.
# NOTE(review): `markers` lists three symbols, which presumes "Y" has exactly
# three classes -- confirm against the data.
sns.pairplot(
    df,
    hue="Y",
    kind="reg",
    diag_kind="kde",
    markers=["o", "s", "D"],
    palette="husl",
    height=2.5,
    plot_kws={"line_kws": {"color": "red"}, "scatter_kws": {"alpha": 0.6}},
    # `fill` replaces the deprecated `shade` keyword of seaborn's KDE plots.
    diag_kws={"fill": True},
)
plt.show()

In [None]:
# Annotated correlation matrix of every column in `df` (target included).
plt.figure(figsize=(25, 15))
sns.heatmap(df.corr(), annot=True, fmt="0.2f", cmap="YlGnBu")
plt.title("Correlation matrix")
# Render explicitly, like the other plotting cells, instead of leaving the
# Axes repr as the cell output.
plt.show()

In [None]:
# IterativeImputer is still experimental in scikit-learn: importing it fails
# unless this explicit opt-in import has been executed first.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
import pandas as pd

imputer = IterativeImputer(max_iter=10, random_state=0)

# NOTE(review): the imputer is fitted on every column (including the target
# "Y") and on all rows before any train/test split, so test rows influence the
# imputed values -- confirm this is acceptable for the comparison below.
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Sanity check: every count should be zero after imputation.
print(df_imputed.isnull().sum())

In [None]:
# Baseline: logistic regression on the imputed data, 80/20 hold-out split.
y = df_imputed["Y"]
X = df_imputed.drop(columns=["Y"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

logistic_model = LogisticRegression().fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Logistic Regression with imputed data Accuracy: {accuracy}")

In [None]:
# Same baseline, but rows with any missing value are discarded instead of imputed.
# The split variables defined here (X, y, X_train, ...) overwrite the ones
# created from the imputed data.
df_without_na = df.dropna()
y = df_without_na["Y"]
X = df_without_na.drop(columns=["Y"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

logistic_model = LogisticRegression().fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Logistic Regression without nulls Accuracy: {accuracy}")

In [None]:
# Standardise the features; the scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

logistic_model = LogisticRegression().fit(X_train_scaled, y_train)
y_pred = logistic_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

print(f"Logistic Regression without nulls and with scaled data: {accuracy}")

In [None]:
# Linear Discriminant Analysis on the standardised features.
lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X_train_scaled, y_train)
# Predict on the scaled test set: the model was fitted on scaled features, so
# feeding it the raw X_test would score inputs on a different scale.
y_pred_lda = lda_model.predict(X_test_scaled)
accuracy_lda = accuracy_score(y_test, y_pred_lda)
print(f"Linear Discriminant Analysis Accuracy: {accuracy_lda}")

In [None]:
# Quadratic Discriminant Analysis on the standardised features.
qda_model = QuadraticDiscriminantAnalysis()
qda_model.fit(X_train_scaled, y_train)
# Predict on the scaled test set to match the scale used during fitting.
y_pred_qda = qda_model.predict(X_test_scaled)
accuracy_qda = accuracy_score(y_test, y_pred_qda)
print(f"Quadratic Discriminant Analysis Accuracy: {accuracy_qda}")

In [None]:
# k-nearest neighbours (k=5) on the standardised features.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_scaled, y_train)
# Distances are only meaningful when test points are on the same scale as the
# training points, so predict on the scaled test set.
y_pred_knn = knn_model.predict(X_test_scaled)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"K-Nearest Neighbors Classification Accuracy: {accuracy_knn}")

In [None]:
from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    PrecisionRecallDisplay,
)
import matplotlib.pyplot as plt

# Classifiers fitted on the standardised training features in the cells above.
models = {
    "Logistic Regression": logistic_model,
    "LDA": lda_model,
    "QDA": qda_model,
    "KNN": knn_model,
}

# Report the confusion matrix and class-weighted precision / recall / F1 of
# each model on the held-out split.
for name, model in models.items():
    print(f"Evaluating {name}:")
    # Every model in `models` was trained on X_train_scaled, so it must be
    # evaluated on the scaled test features as well.
    y_pred = model.predict(X_test_scaled)

    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(cm)

    precision = precision_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")
    f1 = f1_score(y_test, y_pred, average="weighted")

    print(f"Precision (weighted): {precision:.4f}")
    print(f"Recall (weighted): {recall:.4f}")
    print(f"F1-score (weighted): {f1:.4f}")
    print("-" * 30)

In [None]:
# Correlation of every column with the target "Y", strongest positive first.
target_corr = df.corr()["Y"]
display(target_corr.sort_values(ascending=False))

In [49]:
# Index of all column names in `df` except the target column "Y".
features = df.columns.drop("Y")

In [50]:
# Grid dimensions for one subplot per feature, three per row.
n_cols = 3
n_features = len(features)
# Ceiling division: enough rows to hold every feature.
n_rows = -(-n_features // n_cols)

In [None]:
# One boxplot per feature, split by the class label "Y".
plt.figure(figsize=(15, 4 * n_rows))
for position, feature in enumerate(features, start=1):
    # Subplot positions are 1-based.
    plt.subplot(n_rows, n_cols, position)
    sns.boxplot(data=df, x="Y", y=feature)
    plt.title(f"Distribution of {feature} by Class")
plt.tight_layout()
plt.show()

In [52]:
# Hand-picked feature subset for the pairwise scatter plots.
# NOTE(review): presumably chosen from the correlation ranking -- confirm.
high_corr_features = ["X6", "X7", "X2", "X13"]
n_high_corr = len(high_corr_features)
# Number of unordered feature pairs: n choose 2.
n_scatter_plots = (n_high_corr * (n_high_corr - 1)) // 2
n_cols_scatter = 3
# Ceiling division: rows needed to fit every pair plot.
n_rows_scatter = -(-n_scatter_plots // n_cols_scatter)

In [None]:
# Scatter plot for every unordered pair of the selected features, coloured by "Y".
plt.figure(figsize=(15, n_rows_scatter * 4))
plot_index = 1
for i, feature1 in enumerate(high_corr_features):
    # Pair each feature only with the ones that come after it.
    for feature2 in high_corr_features[i + 1 :]:
        plt.subplot(n_rows_scatter, n_cols_scatter, plot_index)
        sns.scatterplot(data=df, x=feature1, y=feature2, hue="Y", palette="viridis")
        plt.title(f"Scatter plot of {feature1} vs {feature2}")
        plot_index += 1
plt.tight_layout()
plt.show()

In [54]:
# Project the feature matrix onto its first two principal components and
# attach the labels (index reset so they line up with the new frame's rows).
pca = PCA(n_components=2)
pca_result = pca.fit_transform(X)
df_pca = pd.DataFrame(pca_result, columns=["PC1", "PC2"]).assign(
    Y=y.reset_index(drop=True)
)

In [None]:
# Scatter of the two-component PCA projection, coloured by class.
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df_pca, x="PC1", y="PC2", hue="Y", palette="viridis")
plt.title("PCA 2 components")
# explained_variance_ratio_ holds fractions in [0, 1]; the "%" format spec
# multiplies by 100 so the axis labels show real percentages.
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]:.2%})")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]:.2%})")
plt.show()

In [56]:
# Three-component PCA projection with the class labels attached.
pca_3d = PCA(n_components=3)
pca_result_3d = pca_3d.fit_transform(X)
df_pca_3d = pd.DataFrame(pca_result_3d, columns=["PC1", "PC2", "PC3"]).assign(
    Y=y.reset_index(drop=True)
)

In [None]:
# 3-D scatter of the three-component PCA projection, coloured by class.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection="3d")
scatter = ax.scatter(
    df_pca_3d["PC1"],
    df_pca_3d["PC2"],
    df_pca_3d["PC3"],
    c=df_pca_3d["Y"],
    cmap="viridis",
)
ax.set_title("PCA of Classification Data (3 components)")
# explained_variance_ratio_ holds fractions in [0, 1]; the "%" format spec
# multiplies by 100 so the axis labels show real percentages.
ax.set_xlabel(f"Principal Component 1 ({pca_3d.explained_variance_ratio_[0]:.2%})")
ax.set_ylabel(f"Principal Component 2 ({pca_3d.explained_variance_ratio_[1]:.2%})")
ax.set_zlabel(f"Principal Component 3 ({pca_3d.explained_variance_ratio_[2]:.2%})")
legend = ax.legend(*scatter.legend_elements(), title="Classes")
plt.show()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of each predictor (rows with missing values removed).
# NOTE(review): no intercept column is added before computing the VIFs --
# confirm that this is the intended variant.
X_features = df_without_na.drop(columns=["Y"])

vif_data = pd.DataFrame(
    {
        "feature": X_features.columns,
        "VIF": [
            variance_inflation_factor(X_features.values, idx)
            for idx in range(len(X_features.columns))
        ],
    }
)

display(vif_data.sort_values(by="VIF", ascending=False))

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd

# Backward elimination on multicollinearity: repeatedly drop the predictor
# with the largest VIF until every remaining VIF is at or below `threshold`.
X_features = df_without_na.drop("Y", axis=1).copy()
threshold = 5

while True:
    # A VIF needs at least one other predictor to regress on; stop before the
    # feature set degenerates to a single column.
    if X_features.shape[1] < 2:
        print("Fewer than two features left; stopping VIF elimination.")
        break

    vif_data = pd.DataFrame(
        {
            "feature": X_features.columns,
            "VIF": [
                variance_inflation_factor(X_features.values, i)
                for i in range(X_features.shape[1])
            ],
        }
    ).sort_values("VIF", ascending=False)

    # After sorting, the first row is the worst offender.
    max_vif = vif_data["VIF"].iloc[0]
    drop_feature = vif_data["feature"].iloc[0]
    print(vif_data, "\n")

    if max_vif <= threshold:
        print("All VIF values are below the threshold!")
        break

    # `drop_feature` was just read from X_features.columns, so it is always
    # present; no membership check is required before dropping it.
    print(f"Dropping {drop_feature} with VIF = {max_vif:.2f}")
    X_features = X_features.drop(columns=[drop_feature])

In [None]:
# Summary of the feature set that survived the VIF elimination.
n_remaining = X_features.shape[1]
print(f"Final number of features: {n_remaining}")
print("Remaining features:", X_features.columns.tolist())

In [None]:
# Refit logistic regression using only the features kept by the VIF filter.
selected_features = X_features.columns
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

logistic_model = LogisticRegression().fit(X_train_selected, y_train)
y_pred = logistic_model.predict(X_test_selected)

accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression with selected features Accuracy: {accuracy}")

In [None]:
# Full PCA (all components) on the VIF-selected features; this rebinds `pca`.
pca = PCA()
X_pca = pca.fit_transform(X_features)

explained_variance = pca.explained_variance_ratio_
print("Explained variance ratio:", explained_variance)
print("Cumulative variance:", np.cumsum(explained_variance))

In [None]:
# First two principal components of the selected features, coloured by class.
plt.figure(figsize=(8, 6))
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]
sns.scatterplot(x=pc1, y=pc2, hue=df_without_na["Y"])
plt.title("PCA with Class Labels")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# classification_report is not imported by any earlier cell; import it here so
# this cell works on a fresh top-to-bottom run.
from sklearn.metrics import classification_report

# Confusion matrix of the most recent predictions held in `y_pred`.
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm)
plt.figure(figsize=(8, 6))
sns.heatmap(df_cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()

print(classification_report(y_test, y_pred))

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Unpruned decision tree on the VIF-selected features.
clf = DecisionTreeClassifier(random_state=42).fit(X_train_selected, y_train)
y_pred = clf.predict(X_test_selected)

# Class-weighted averages account for unequal class sizes.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred, average="weighted")
f1 = f1_score(y_test, y_pred, average="weighted")

print(f"Accuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")

print("\nClassification Report:")
report = classification_report(y_test, y_pred)
print(report)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.tree import plot_tree

from sklearn.tree import export_graphviz
import graphviz

# `clf` was fitted on X_train_selected, so both renderings must use exactly
# those column names; using X_train.columns (all features) would label the
# split nodes with the wrong feature names.
tree_feature_names = list(X_train_selected.columns)
tree_class_names = [str(c) for c in clf.classes_]

# Graphviz rendering: writes "tree" (DOT source) and "tree.png" to the working
# directory, then opens the rendered graph in the system viewer.
dot_data = export_graphviz(
    clf,
    out_file=None,
    feature_names=tree_feature_names,
    class_names=tree_class_names,
    filled=True,
    rounded=True,
    special_characters=True,
)

graph = graphviz.Source(dot_data)
graph.render("tree", format="png", cleanup=False)
graph.view()

# Inline matplotlib rendering of the same tree.
plt.figure(figsize=(14, 8))
plot_tree(
    clf,
    filled=True,
    rounded=True,
    feature_names=tree_feature_names,
    class_names=tree_class_names,
    fontsize=12,
)
plt.title("Decision Tree")
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
)

# Hyper-parameter grid explored with 5-fold cross-validation.
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
    "bootstrap": [True, False],
}

rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    rf, param_grid, scoring="accuracy", cv=5, n_jobs=-1, verbose=1
)
grid_search.fit(X_train_selected, y_train)

# Best configuration from the search, scored on the hold-out split.
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test_selected)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred, average="weighted")
f1 = f1_score(y_test, y_pred, average="weighted")

print("Best parameters found:", grid_search.best_params_)
print(f"\nAccuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
)

# Grid over the boosting settings and, via the "estimator__" prefix, over the
# parameters of the decision-tree base learner.
param_grid_ada = {
    "n_estimators": [50, 100, 200],
    "learning_rate": [0.01, 0.1, 1.0],
    "estimator__max_depth": [1, 3, 5],
    "estimator__min_samples_split": [2, 5],
}

base_estimator = DecisionTreeClassifier(random_state=42)
ada = AdaBoostClassifier(estimator=base_estimator, random_state=42)

grid_search_ada = GridSearchCV(
    ada,
    param_grid_ada,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_ada.fit(X_train_selected, y_train)

# Best configuration from the search, scored on the hold-out split.
best_ada = grid_search_ada.best_estimator_
y_pred = best_ada.predict(X_test_selected)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred, average="weighted")
f1 = f1_score(y_test, y_pred, average="weighted")

print("Best parameters found:", grid_search_ada.best_params_)
print(f"\nAccuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
)

# Hyper-parameter grid explored with 5-fold cross-validation.
param_grid_xgb = {
    "n_estimators": [50, 100, 200],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
}

xgb = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42)

grid_search_xgb = GridSearchCV(
    xgb,
    param_grid_xgb,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_xgb.fit(X_train_selected, y_train)

# Best configuration from the search, scored on the hold-out split.
best_xgb = grid_search_xgb.best_estimator_
y_pred_xgb = best_xgb.predict(X_test_selected)

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb, average="weighted")
recall_xgb = recall_score(y_test, y_pred_xgb, average="weighted")
f1_xgb = f1_score(y_test, y_pred_xgb, average="weighted")

print("Best parameters found for XGBoost:", grid_search_xgb.best_params_)
print(f"\nAccuracy: {accuracy_xgb:.4f}")
print(f"Recall: {recall_xgb:.4f}")
print(f"F1 Score: {f1_xgb:.4f}")
print(f"Precision: {precision_xgb:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_xgb))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb))

In [153]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import set_random_seed
from sklearn.preprocessing import StandardScaler

# Seed the Python, NumPy and TensorFlow generators so weight initialisation and
# batch shuffling -- and therefore the reported accuracy -- are reproducible.
set_random_seed(42)

# Re-fit the scaler on the VIF-selected features. This rebinds `scaler`,
# `X_train_scaled` and `X_test_scaled`, which earlier cells defined on the
# full feature set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# One hidden layer of 32 ReLU units and a softmax output with one unit per
# distinct label found in y_train.
model = Sequential()
model.add(Dense(units=32, activation="relu", input_shape=(X_train_scaled.shape[1],)))
model.add(Dense(units=len(set(y_train)), activation="softmax"))

# NOTE(review): sparse_categorical_crossentropy expects integer class ids in
# 0..K-1 -- confirm that the values of "Y" satisfy this.
model.compile(
    optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)

# 20% of the training split is held out for per-epoch validation.
model.fit(
    X_train_scaled, y_train, epochs=20, batch_size=16, validation_split=0.2, verbose=1
)

loss, accuracy = model.evaluate(X_test_scaled, y_test, verbose=0)
print(f"\nSingle-layer NN Accuracy: {accuracy:.4f}")

Epoch 1/20
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5584 - loss: 1.0258 - val_accuracy: 0.8403 - val_loss: 0.6798
Epoch 2/20
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9069 - loss: 0.6168 - val_accuracy: 0.9916 - val_loss: 0.3801
Epoch 3/20
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9989 - loss: 0.3238 - val_accuracy: 1.0000 - val_loss: 0.2004
Epoch 4/20
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 1.0000 - loss: 0.1730 - val_accuracy: 1.0000 - val_loss: 0.1122
Epoch 5/20
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 988us/step - accuracy: 1.0000 - loss: 0.0970 - val_accuracy: 1.0000 - val_loss: 0.0697
Epoch 6/20
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 911us/step - accuracy: 1.0000 - loss: 0.0628 - val_accuracy: 1.0000 - val_loss: 0.0470
Epoch 7/20
[1m60/60[0m [32m━━━━━━

In [154]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import set_random_seed

# Seed the Python, NumPy and TensorFlow generators so initialisation, dropout
# masks and batch shuffling are reproducible from run to run.
set_random_seed(42)

# Stop once validation loss has not improved for 5 epochs and roll back to the
# weights of the best epoch.
early_stop = EarlyStopping(
    monitor="val_loss", patience=5, restore_best_weights=True, verbose=1
)

# Two hidden layers (64 and 32 ReLU units) with 30% dropout after the first,
# and a softmax output with one unit per distinct label in y_train.
model_mlp = Sequential()
model_mlp.add(Dense(64, activation="relu", input_shape=(X_train_scaled.shape[1],)))
model_mlp.add(Dropout(0.3))
model_mlp.add(Dense(32, activation="relu"))
model_mlp.add(Dense(len(set(y_train)), activation="softmax"))

model_mlp.compile(
    optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)

# Up to 100 epochs; early stopping normally ends training sooner.
history = model_mlp.fit(
    X_train_scaled,
    y_train,
    epochs=100,
    batch_size=16,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=1,
)

loss_mlp, acc_mlp = model_mlp.evaluate(X_test_scaled, y_test, verbose=0)
print(f"\nMultilayer NN Accuracy: {acc_mlp:.4f}")

Epoch 1/100
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6471 - loss: 0.8760 - val_accuracy: 1.0000 - val_loss: 0.2365
Epoch 2/100
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 997us/step - accuracy: 1.0000 - loss: 0.1699 - val_accuracy: 1.0000 - val_loss: 0.0343
Epoch 3/100
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 999us/step - accuracy: 1.0000 - loss: 0.0390 - val_accuracy: 1.0000 - val_loss: 0.0115
Epoch 4/100
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 973us/step - accuracy: 1.0000 - loss: 0.0144 - val_accuracy: 1.0000 - val_loss: 0.0058
Epoch 5/100
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 939us/step - accuracy: 1.0000 - loss: 0.0074 - val_accuracy: 1.0000 - val_loss: 0.0035
Epoch 6/100
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 940us/step - accuracy: 1.0000 - loss: 0.0051 - val_accuracy: 1.0000 - val_loss: 0.0022
Epoch 7/100
[1m60/60[0