## Download the Data

In [1]:
import os
import pandas as pd
import tarfile
import urllib.request
import matplotlib.pyplot as plt
import numpy as np

# URL of the gzipped tar archive that is passed to fetch_data below.
DOWNLOAD_URL = "https://github.com/fatimaezzahra-creator/Projet-ML/raw/refs/heads/main/datasets/adult.tgz"
# Directory (relative to the working directory) that receives the download and the extracted files.
DATASET_PATH = "datasets"

def fetch_data(data_url, data_path):
    """Download the gzipped tar archive at ``data_url`` and unpack it into ``data_path``.

    Args:
        data_url: URL of the ``.tgz`` archive to download.
        data_path: Directory that receives both the downloaded ``adult.tgz``
            file and the extracted archive members. It is created if missing.

    Raises:
        urllib.error.URLError: If the archive cannot be retrieved.
        tarfile.TarError: If the downloaded file cannot be read or extracted.
    """
    # exist_ok removes the check-then-create race of a separate isdir() test.
    os.makedirs(data_path, exist_ok=True)
    tgz_path = os.path.join(data_path, "adult.tgz")
    urllib.request.urlretrieve(data_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    with tarfile.open(tgz_path) as tgz_file:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: refuse members that would land
            # outside data_path (absolute paths, "..", unsafe links).
            tgz_file.extractall(path=data_path, filter="data")
        else:
            # Older interpreters have no extraction filters; keep the plain call.
            tgz_file.extractall(path=data_path)

# Download and unpack the archive; this runs again on every execution of the cell.
fetch_data(DOWNLOAD_URL, DATASET_PATH)

ModuleNotFoundError: No module named 'pandas'

## Dataset Exploratory Analysis

### Analysis of Form

In [None]:
#load the data
def load_data():
    """Read ``adult.data`` from ``DATASET_PATH`` into a DataFrame.

    Returns:
        pandas.DataFrame: The file parsed with the ``pd.read_csv`` defaults.
    """
    return pd.read_csv(os.path.join(DATASET_PATH, "adult.data"))

# Load the raw dataset, then print a summary of its columns (dtypes and non-null counts).
data = load_data()
data.info()

In [None]:
import seaborn as sns
# Missing-data visualization: heatmap of the boolean isna() mask, without a colour bar.
# NOTE(review): isna() only flags NaN/None; placeholder strings such as "?" would not
# show up here -- verify whether the file uses any.
sns.heatmap(data.isna(), cbar=False)

The heatmap is uniformly dark, which means that `isna()` found no missing values in the data.

### Analysis of Content

In [None]:
# Target distribution: relative frequency of each value of the " class" column
# (the raw column names keep a leading space).
data[" class"].value_counts(normalize=True)

In [None]:
import seaborn as sns
# Numerical attributes: one distribution plot per int64 column.
for col in data.select_dtypes("int64").columns:
    sns.displot(data=data, x=col)

In [None]:
#Categorical attribute
import matplotlib.pyplot as plt
# Categorical attributes: one plot per object-typed column, shown one figure at a time.
categorical_columns = data.select_dtypes("object").columns
for col in categorical_columns:
    sns.displot(x=col, data=data)
    plt.title(f" '{col}'", fontsize=16)
    plt.xlabel(col, fontsize=12)
    # The y-axis label is French for "number of observations".
    plt.ylabel("Nombre d'observations", fontsize=12)
    # Rotate the tick labels by 90 degrees.
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Work on a copy so the raw frame is left untouched.
df = data.copy()
# Strip surrounding whitespace from the column names and from every string value.
df.columns = df.columns.str.strip()
df = df.apply(lambda column: column.str.strip() if column.dtype == "object" else column)
# One subset per target value, then both stacked with the "<=50K" rows first.
class_0 = df.loc[df["class"] == "<=50K"]
class_1 = df.loc[df["class"] == ">50K"]
combined_df = pd.concat([class_0, class_1])

In [None]:
#relation target-age
import seaborn as sns
sns.histplot(
    data=combined_df,
    x="age",
    hue="class",
    palette="muted",
    stat="density",
    common_norm=False,  # normalise each class on its own
);

In [None]:
# Relationship between the target and education-num: per-class density histogram.
sns.histplot(
    data=combined_df,
    x="education-num",
    hue="class",
    palette="muted",
    stat="density",
    common_norm=False,  # normalise each class on its own
);

In [None]:
# Relationship between the target and education: per-class density histogram.
sns.histplot(
    data=combined_df,
    x="education",
    hue="class",
    palette="muted",
    stat="density",
    common_norm=False,  # normalise each class on its own
)
# Rotate the tick labels by 90 degrees before rendering.
plt.xticks(rotation=90)
plt.show()

In [None]:
# For every education label, collect the distinct education-num values found with it.
education_mapping = data.groupby(" education")[" education-num"].unique()
# Print the mapping to check how the two columns correspond.
for level, codes in education_mapping.items():
    print(f"Education: {level}, Education_Num: {codes}")

In [None]:
# Relationship between the target and workclass: per-class density histogram.
sns.histplot(
    data=combined_df,
    x="workclass",
    hue="class",
    palette="muted",
    stat="density",
    common_norm=False,  # normalise each class on its own
)
# Rotate the tick labels by 90 degrees before rendering.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Relationship between the target and sex: per-class density histogram.
sns.histplot(
    data=combined_df,
    x="sex",
    hue="class",
    palette="muted",
    stat="density",
    common_norm=False,  # normalise each class on its own
);

In [None]:
# Relationship between the target and marital-status: per-class density histogram.
sns.histplot(
    data=combined_df,
    x="marital-status",
    hue="class",
    palette="muted",
    stat="density",
    common_norm=False,  # normalise each class on its own
)
# Rotate the tick labels by 90 degrees before rendering.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Relationship between the target and relationship: per-class density histogram.
sns.histplot(
    data=combined_df,
    x="relationship",
    hue="class",
    palette="muted",
    stat="density",
    common_norm=False,  # normalise each class on its own
);

In [None]:
# For every marital status, collect the distinct relationship values found with it.
# The result gets its own name: it is not an education mapping, and reusing
# `education_mapping` would overwrite the education/education-num mapping built earlier.
marital_relationship_mapping = data.groupby(" marital-status")[" relationship"].unique()

# Print the mapping to check how the two columns correspond.
for mrs, rshp in marital_relationship_mapping.items():
    print(f"marital-status: {mrs}, relationship: {rshp}")

In [None]:
# Relationship between the target and fnlwgt: per-class density histogram.
sns.histplot(
    data=combined_df,
    x="fnlwgt",
    hue="class",
    palette="muted",
    stat="density",
    common_norm=False,  # normalise each class on its own
);

In [None]:
# Relationship between the target and race: per-class density histogram.
sns.histplot(
    data=combined_df,
    x="race",
    hue="class",
    palette="muted",
    stat="density",
    common_norm=False,  # normalise each class on its own
);


In [None]:
# Relationship between the target and native-country: per-class density histogram.
sns.histplot(
    data=combined_df,
    x="native-country",
    hue="class",
    palette="muted",
    stat="density",
    common_norm=False,  # normalise each class on its own
)
# Rotate the tick labels by 90 degrees before rendering.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Relationship between the target and occupation: per-class density histogram.
sns.histplot(
    data=combined_df,
    x="occupation",
    hue="class",
    palette="muted",
    stat="density",
    common_norm=False,  # normalise each class on its own
)
# Rotate the tick labels by 90 degrees before rendering.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Relationship between the target and capital-gain: per-class density histogram.
sns.histplot(
    data=combined_df,
    x="capital-gain",
    hue="class",
    palette="muted",
    stat="density",
    common_norm=False,  # normalise each class on its own
);

In [None]:
# Relationship between the target and capital-loss: per-class density histogram.
sns.histplot(
    data=combined_df,
    x="capital-loss",
    hue="class",
    palette="muted",
    stat="density",
    common_norm=False,  # normalise each class on its own
);

In [None]:
# Capital features combination: capital gain minus capital loss, row by row.
# Both operands are read from combined_df itself, so no extra copy of df is needed
# and the subtraction no longer depends on index alignment between two frames.
combined_df["capital_features"] = combined_df["capital-gain"] - combined_df["capital-loss"]


In [None]:
# Relationship between the target and hours-per-week: per-class density histogram.
sns.histplot(
    data=combined_df,
    x="hours-per-week",
    hue="class",
    palette="bright",
    stat="density",
    common_norm=False,  # normalise each class on its own
);

## Data Pre-Processing
### Train/Test Split

In [None]:
# Name of the target column. The raw frame keeps the leading space of the CSV header;
# only the exploratory copy `df` had its column names stripped.
target_name = " class"

# Drop the education label column. DataFrame.drop returns a new frame, so the result has
# to be assigned back, and the raw column name carries a leading space.
# errors="ignore" keeps the cell safe to re-run.
# NOTE(review): presumably dropped because it is redundant with education-num (see the
# mapping printed during the exploration) -- confirm this is the intent.
data = data.drop(columns=[" education"], errors="ignore")

# Absolute number of rows per target value.
data[target_name].value_counts()


We can see that the distribution of the target class is NOT balanced, so to create our train and test sets we can use a StratifiedShuffleSplit that will not only shuffle the instances but also preserve the proportions in the original dataset.

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified shuffle split, 20% held out for testing, with a fixed seed.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields a single (train, test) pair of positional indexes.
train_indexes, test_indexes = next(split.split(data, data[target_name]))
train_set = data.iloc[train_indexes]
test_set = data.iloc[test_indexes]

# Compare the class proportions of the full data with those of each subset.
for description, subset in (
    ("Proportions in the original dataset:", data),
    ("Proportions in the train set:", train_set),
    ("Proportions in the test set:", test_set),
):
    print(description, subset[target_name].value_counts(normalize=True))

### Separate target from the features

In [None]:
# Target: a standalone copy of the target column; features: every other training column.
target = train_set[target_name].copy()
data = train_set.drop(columns=target_name)

# Group the feature names by dtype so each group can get its own preprocessing.
categorical_features = list(data.select_dtypes(include=['object']).columns)
numerical_features = list(data.select_dtypes(include=np.number).columns)

### Transformation of Text and Categorical Data
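The feature `education` is the only one which implies some kind of order, so an `OrdinalEncoder` would be a natural fit for it. Note, however, that the preprocessor defined below applies a `OneHotEncoder` to every categorical column and does not use an `OrdinalEncoder`.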
The features `workclass`, `marital-status`, `relationship`, `race` and `sex` can all be handled by a `OneHotEncoder`.
The features `occupation` and `native-country` have very high cardinality. They will also be handled by a `OneHotEncoder` for now, but we will eventually find a better solution.

As for numerical features, we will only use a `StandardScaler`. 

In [None]:
# Transformation of Text and Categorical Data
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Column-wise preprocessing: one-hot encode the categorical columns (categories not seen
# during fit are ignored instead of raising) and standardise the numerical ones.
one_hot_step = ("categorical_not_ordinal", OneHotEncoder(handle_unknown="ignore"), categorical_features)
scaling_step = ("numerical", StandardScaler(), numerical_features)
preprocessor = ColumnTransformer(transformers=[one_hot_step, scaling_step])

## Model Selection
In this section, we will train 5 different models, evaluate them and compare to find the best one. They are:
* A LogisticRegression;
* An SGDClassifier;
* A RandomForestClassifier;
* A GradientBoostingClassifier;
* A KNeighborsClassifier.

First let's build each pipeline, with our previously defined preprocessor.

In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

def _make_pipeline(classifier):
    """Chain the shared ``preprocessor`` with ``classifier`` in a two-step Pipeline."""
    return Pipeline([
        ('preprocessor', preprocessor),
        # The step name 'classifier' is what the `classifier__` grid prefixes refer to.
        ('classifier', classifier),
    ])


# Base estimators; every one that accepts a seed gets the same random_state.
LR_clf = LogisticRegression(solver='liblinear', random_state=42)
SGD_clf = SGDClassifier(loss='hinge', random_state=42)
RF_clf = RandomForestClassifier(random_state=42)
GB_clf = GradientBoostingClassifier(random_state=42)
KNN_clf = KNeighborsClassifier()

# One pipeline per estimator, all built on the same preprocessing step.
LR_pipeline = _make_pipeline(LR_clf)
SGD_pipeline = _make_pipeline(SGD_clf)
RF_pipeline = _make_pipeline(RF_clf)
GB_pipeline = _make_pipeline(GB_clf)
KNN_pipeline = _make_pipeline(KNN_clf)

# Registry of candidate models. The keys are looked up in `param_grid` by the search
# loop, so they have to match the keys used there exactly.
models = dict(
    Logistic=LR_pipeline,
    SGD=SGD_pipeline,
    RandomForet=RF_pipeline,
    GradientBoosting=GB_pipeline,
    KNeighbours=KNN_pipeline,
)

Now we can define the parameter grid for each model.

In [None]:
# Every key targets the pipeline step named 'classifier' through the `classifier__` prefix.

# Logistic regression: 2 * 3 * 3 = 18 combinations.
LR_param_grid = dict(
    classifier__penalty=['l1', 'l2'],
    classifier__C=[0.01, 0.1, 1],
    classifier__max_iter=[100, 500, 1000],
)

# SGD classifier: 2 * 3 * 2 * 2 * 3 = 72 combinations.
SGD_param_grid = dict(
    classifier__learning_rate=['constant', 'invscaling'],
    classifier__eta0=[0.01, 0.1, 1],
    classifier__penalty=['l2', 'l1'],
    classifier__alpha=[0.001, 0.01],
    classifier__max_iter=[100, 500, 1000],
)

# Random forest: 3 * 3 = 9 combinations.
RF_param_grid = dict(
    classifier__n_estimators=[10, 50, 100],
    classifier__max_depth=[10, 50, 100],
)

# Gradient boosting: 3 * 4 * 4 * 4 = 192 combinations.
GB_param_grid = dict(
    classifier__n_estimators=[100, 200, 300],
    classifier__learning_rate=[0.01, 0.05, 0.1, 0.2],
    classifier__max_depth=[3, 4, 5, 6],
    classifier__subsample=[0.7, 0.8, 0.9, 1.0],
)

# K nearest neighbours: 6 * 2 * 4 * 5 * 2 = 480 combinations.
# NOTE(review): if the preprocessor emits a sparse matrix, scikit-learn falls back to
# brute force for KNN, which would make the algorithm/leaf_size axes redundant -- confirm.
KNN_param_grid = dict(
    classifier__n_neighbors=[3, 5, 7, 9, 11, 15],
    classifier__weights=['uniform', 'distance'],
    classifier__algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    classifier__leaf_size=[10, 20, 30, 40, 50],
    classifier__p=[1, 2],
)

# Grid per model, keyed by the same names as the model registry.
param_grid = dict(
    Logistic=LR_param_grid,
    SGD=SGD_param_grid,
    RandomForet=RF_param_grid,
    GradientBoosting=GB_param_grid,
    KNeighbours=KNN_param_grid,
)

Finally, we perform a GridSearchCV. Since the parameter grids are already pretty extensive, we will use only 5 folds.

In [None]:
from sklearn.model_selection import GridSearchCV

# Fitted searches keyed by model name. Keeping them lets the models be compared (and the
# best estimator reused) after the loop, instead of each result being printed once and
# then overwritten by the next iteration.
grid_searches = {}

for name, model in models.items():
    print(f"For the {name} model:")
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid[name],
        cv=5,                   # 5-fold cross-validation
        scoring='accuracy',     # evaluation metric
        n_jobs=-1,              # use every available core
    )
    grid_search.fit(data, target)
    grid_searches[name] = grid_search
    print("    Best Hyperparams:", grid_search.best_params_)
    print("    Best Accuracy:", grid_search.best_score_)
