# Thesis Notebook
This notebook shows the pipeline of the thesis, using student performance datasets (UCI and others).
It includes:
- Data import from datasets
- Error handling
- Exploratory Data Analysis (EDA)
- Preprocessing and Feature Engineering
- Dimensionality Reduction
- Feature Selection (RFE, GA, PSO)
- Clustering (Popularity Architecture)
- Supervised Models (Parallel Architecture)
- Neural Networks (various architectures)
- Results benchmark and summary


In [2]:
!pip install tensorflow
!pip install keras
import os
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import davies_bouldin_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression

import tensorflow as tf
from tensorflow.keras import models, layers, callbacks

# Optional libraries
try:
    import prince
except:
    prince = None
try:
    from kmodes.kmodes import KModes
    from kmodes.kprototypes import KPrototypes
except:
    KModes = None
try:
    import xgboost as xgb
except:
    xgb = None

sns.set(style="whitegrid")
RANDOM_STATE = 42




In [3]:

# === DATA IMPORT ===

def load_student_performance_uci(base_url="https://archive.ics.uci.edu/ml/machine-learning-databases/00320/"):
    url_mat = base_url + "student-mat.csv"
    url_por = base_url + "student-por.csv"
    try:
        df_mat = pd.read_csv(url_mat, sep=';')
        df_por = pd.read_csv(url_por, sep=';')
        return {"math": df_mat, "por": df_por}
    except Exception as e:
        print("Error loading UCI Student Performance:", e)
        return {}

datasets = load_student_performance_uci()

# === EDA FUNCTIONS ===

def clean_and_eda(df, name="dataset"):
    print(f"\n=== EDA for {name} ===")
    print("Shape:", df.shape)
    print("Columns + types:")
    print(df.dtypes)
    print("Missing values per column:")
    print(df.isnull().sum())
    ndup = df.duplicated().sum()
    print("Duplicate rows:", ndup)
    if ndup > 0:
        df = df.drop_duplicates()
        print("Dropped duplicates. New shape:", df.shape)
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if num_cols:
        print("Numeric summary:")
        print(df[num_cols].describe().T)
    cat_cols = df.select_dtypes(include=['object','category']).columns.tolist()
    if cat_cols:
        for c in cat_cols[:5]:  # show first 5 categorical summaries
            print(f"--- {c} ---")
            print(df[c].value_counts().head(10))
    if num_cols:
        col = num_cols[0]
        plt.figure(figsize=(6,4))
        sns.histplot(df[col].dropna(), bins=30, kde=True)
        plt.title(f"{name}: distribution of {col}")
        plt.show()
    if len(num_cols) > 1:
        plt.figure(figsize=(8,6))
        corr = df[num_cols].corr()
        sns.heatmap(corr, annot=False, cmap="coolwarm")
        plt.title(f"{name}: numeric feature correlation")
        plt.show()
    return df

cleaned = {}
for key, df in datasets.items():
    if df is not None:
        cleaned[key] = clean_and_eda(df, name=key)


Error loading UCI Student Performance: HTTP Error 404: Not Found


## Thesis Pipeline Continuation
Below, you would integrate the full thesis pipeline including:
- Preprocessing (encoding, scaling)
- PCA / MCA / FAMD
- Feature Selection (RFE, GA, PSO)
- Parallel Models (RF, KNN, SVM, etc.)
- Neural Network architectures (NN18, NN38, etc.)
- Popularity Architecture (clustering + local models)
- Final Benchmark

*This part can be copied from the previously generated full pipeline code.*