## Download the Data

In [2]:
import os
import pandas as pd
import tarfile
import urllib.request
import matplotlib.pyplot as plt
import numpy as np

# URL of the gzipped tar archive that fetch_data downloads.
DOWNLOAD_URL = "https://github.com/fatimaezzahra-creator/Projet-ML/raw/refs/heads/main/datasets/adult.tgz"
# Local directory (relative to the working directory) that receives the archive
# and its extracted contents; load_data reads "adult.data" from here.
DATASET_PATH = "datasets"

def fetch_data(data_url, data_path):
    """Download the dataset archive and unpack it into ``data_path``.

    Parameters
    ----------
    data_url : str
        URL of the gzipped tar archive to download.
    data_path : str
        Directory that receives the downloaded ``adult.tgz`` and the files
        extracted from it. It is created (with parents) if missing.

    Raises
    ------
    urllib.error.URLError
        If the archive cannot be downloaded.
    tarfile.TarError
        If the downloaded file is not a readable tar archive.
    """
    # exist_ok replaces the isdir() check and avoids the check-then-create race.
    os.makedirs(data_path, exist_ok=True)
    tgz_path = os.path.join(data_path, "adult.tgz")
    urllib.request.urlretrieve(data_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    with tarfile.open(tgz_path) as tgz_file:
        # NOTE(review): extractall() writes whatever member paths the archive
        # holds; recent Python versions accept a `filter` argument to restrict
        # that - consider it if the archive source is not fully trusted.
        tgz_file.extractall(path=data_path)

# Download only when the extracted CSV is missing, so re-running the notebook
# does not fetch the archive again ("adult.data" is the file load_data reads).
if not os.path.isfile(os.path.join(DATASET_PATH, "adult.data")):
    fetch_data(DOWNLOAD_URL, DATASET_PATH)

KeyboardInterrupt: 

## Dataset Exploratory Analysis

### Analysis of Form

In [None]:
# Load the data
def load_data():
    """Return the extracted ``adult.data`` file under DATASET_PATH as a DataFrame."""
    return pd.read_csv(os.path.join(DATASET_PATH, "adult.data"))

# Read the dataset into `data`, then print its column / dtype / non-null summary.
data = load_data()
data.info()

In [None]:
import seaborn as sns
# Missing-data visualization: heatmap of the boolean NaN mask of `data`,
# drawn without a colour bar.
sns.heatmap(data.isna(), cbar=False)

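The heatmap is uniformly dark, which means there are no missing (NaN) values in the data. Note that `isna()` only detects real NaN values; placeholder strings used to mark unknown entries would not show up here.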

### Analysis of Content

In [None]:
# Target distribution analysis: relative frequency of each value of the
# " class" column (the column label carries a leading space in `data`).
data[" class"].value_counts(normalize=True)

In [None]:
import seaborn as sns
# Numerical attributes: one distribution plot per int64 column
# (iterating a DataFrame yields its column labels).
for col in data.select_dtypes("int64"):
    sns.displot(data[col])

In [None]:
# Categorical attributes: one distribution plot per object-dtype column.
import matplotlib.pyplot as plt
for col in data.select_dtypes("object"):
    sns.displot(data=data, x=col)
    # The title is the quoted column label.
    plt.title(f" '{col}'", fontsize=16)
    plt.xlabel(col, fontsize=12)
    # The y-axis label is French for "Number of observations".
    plt.ylabel("Nombre d'observations", fontsize=12)
    # Rotate the tick labels 90 degrees.
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Work on a copy so the raw `data` frame stays untouched.
df=data.copy()
# Strip surrounding whitespace from the column names and from the values of
# every column whose dtype compares equal to "object".
df.columns = df.columns.str.strip()
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
# Create one subset per target value, then stack them (<=50K rows first).
class_0=df[df["class"] == "<=50K"]
class_1=df[df["class"] ==">50K"]
combined_df = (pd.concat([class_0, class_1]))

In [None]:
# Relation target / age: per-class density histograms
# (common_norm=False asks seaborn to normalise each class separately).
import seaborn as sns
sns.histplot(data=combined_df, x="age", hue="class",
             stat="density", common_norm=False, palette="muted");

In [None]:
# Relation target / education-num: per-class density histograms.
sns.histplot(data=combined_df, x="education-num", hue="class",
             stat="density", common_norm=False, palette="muted");

In [None]:
# Relation target / education: per-class density histograms,
# with the category labels rotated so they stay readable.
sns.histplot(data=combined_df, x="education", hue="class",
             stat="density", common_norm=False, palette="muted");
plt.xticks(rotation=90)
plt.show()

**The features education (categorical) and education-num (numerical) may convey similar information, as they both represent the education level of an individual.**

In [None]:
# For every education label, collect the distinct education-num values seen
# with it (the column labels carry a leading space in `data`).
education_mapping = data.groupby(" education")[" education-num"].unique()
# Print the mapping to check the correspondence
for edu, edu_num in education_mapping.items():
    print(f"Education: {edu}, Education_Num: {edu_num}")

The output of this mapping shows that each education category corresponds to a single unique value of education-num. This confirms that the two features are effectively encoding the same information.

In [None]:
# Relation target / workclass: per-class density histograms.
sns.histplot(data=combined_df, x="workclass", hue="class",
             stat="density", common_norm=False, palette="muted");
plt.xticks(rotation=90)
plt.show()

In [None]:
# Relation target / sex: per-class density histograms.
sns.histplot(data=combined_df, x="sex", hue="class",
             stat="density", common_norm=False, palette="muted");

In [None]:
# Relation target / marital-status: per-class density histograms.
sns.histplot(data=combined_df, x="marital-status", hue="class",
             stat="density", common_norm=False, palette="muted");
plt.xticks(rotation=90)
plt.show()

In [None]:
# Relation target / relationship: per-class density histograms.
sns.histplot(data=combined_df, x="relationship", hue="class",
             stat="density", common_norm=False, palette="muted");

**The features relationship and marital-status might convey similar information, because a person's relationship type often depends on their marital status.**



In [None]:
# For every marital-status value, collect the distinct relationship values
# seen with it.
# NOTE(review): this rebinds `education_mapping`, which an earlier cell used for
# the education / education-num mapping - a dedicated name would be clearer.
education_mapping = data.groupby(" marital-status")[" relationship"].unique()

# Print the mapping to check the correspondence
for mrs, rshp in education_mapping.items():
    print(f"marital-status: {mrs}, relationship: {rshp}")

The output reveals that for each value of marital-status, there are multiple possible values for relationship.
This variability indicates that a person's relationship cannot be uniquely determined based on their marital-status.

In [None]:
# Relation target / fnlwgt: per-class density histograms.
sns.histplot(data=combined_df, x="fnlwgt", hue="class",
             stat="density", common_norm=False, palette="muted");

In [None]:
# Relation target / race: per-class density histograms.
sns.histplot(data=combined_df, x="race", hue="class",
             stat="density", common_norm=False, palette="muted");


In [None]:
# Relation target / native-country: per-class density histograms.
sns.histplot(data=combined_df, x="native-country", hue="class",
             stat="density", common_norm=False, palette="muted");
plt.xticks(rotation=90)
plt.show()

The native-country and race columns in the dataset contain many unique values, some of which have very low frequencies. Keeping all these rare categories can negatively impact the machine learning model due to:

- **Overfitting**: The model may place undue importance on rare categories, learning patterns that don't generalize well to new data.
- **Increased complexity**: High cardinality increases the dimensionality during encoding (e.g., in one-hot encoding), which can slow down training and complicate the model unnecessarily.


In [None]:
# Relation target / occupation: per-class density histograms.
sns.histplot(data=combined_df, x="occupation", hue="class",
             stat="density", common_norm=False, palette="muted");
plt.xticks(rotation=90)
plt.show()

In [None]:
# Relation target / capital-gain: per-class density histograms.
sns.histplot(data=combined_df, x="capital-gain", hue="class",
             stat="density", common_norm=False, palette="muted");

In [None]:
# Relation target / capital-loss: per-class density histograms.
sns.histplot(data=combined_df, x="capital-loss", hue="class",
             stat="density", common_norm=False, palette="muted");

**The combination of capital-gain and capital-loss into a single derived feature could have a stronger correlation with the target variable (class) than either capital-gain or capital-loss individually, potentially improving the predictive power of the model.**

In [None]:
# Capital features combination: derive candidate features from capital-gain and
# capital-loss, then compare their correlation with the target.
from sklearn.preprocessing import LabelEncoder

# Net capital on the plotting frame. Both operands come from the same frame, so
# the result no longer depends on index alignment between two different frames.
combined_df["capital_features"] = combined_df["capital-gain"] - combined_df["capital-loss"]

# Scratch copy: the derived and encoded columns below must not leak into `df`.
dff = df.copy()
dff["capital_net"] = dff["capital-gain"] - dff["capital-loss"]
# The tiny epsilon only prevents division by zero; rows whose capital-loss is 0
# therefore get a very large ratio.
dff["ratio"] = dff["capital-gain"] / (dff["capital-loss"] + 0.00000001)
# NOTE(review): the two weights look like the individual correlations of
# capital-gain and capital-loss with class (the narrative quotes 0.223 for
# capital-gain) - confirm where they come from.
dff["capital_weighted"] = dff["capital-gain"] * 0.223329 + dff["capital-loss"] * 0.150526

# Label-encode every object column (the target included) so that corr() can
# include them.
label_encoders = {}
for col in dff.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    dff[col] = le.fit_transform(dff[col])
    label_encoders[col] = le  # kept per column (optional; useful for inverse_transform)

# Correlation matrix; show every column's correlation with the target,
# strongest positive first.
corr_matrix = dff.corr()
corr_matrix["class"].sort_values(ascending=False)

**capital_net: the net difference between gains and losses**

Correlation: 0.214, lower than capital-gain. Relevance: While intuitive (netting gains and losses), this feature does not add much value compared to capital-gain alone. Consider dropping it unless it improves model performance.

**ratio: relative proportion of gains to losses**

Correlation: 0.223, identical to capital-gain. Relevance: This feature does not improve upon capital-gain’s correlation. Its usefulness might depend on the model's capacity to interpret non-linear relationships, but it seems redundant for linear models.

**capital_weighted: weighted sum of the two based on their importance**

Correlation: 0.229, slightly higher than capital-gain (0.223). This feature combines the effects of both gains and losses, weighted by their individual correlations with class. It shows a slight improvement, suggesting it may capture some additional nuanced information. This feature is pertinent to keep for modeling.

In [None]:
# Relation target / hours-per-week: per-class density histograms.
sns.histplot(data=combined_df, x="hours-per-week", hue="class",
             stat="density", common_norm=False, palette="bright");

**synthesis**

Retain education-num (numerical feature) and remove education to avoid redundancy and simplify the dataset.

Retain both features relationship and marital-status as they capture different aspects of an individual's social situation.

Focus on the most impactful features:
education-num, age, hours-per-week, capital_weighted, and categorical variables such as relationship and marital-status.

fnlwgt adds minimal value to the predictive power of the model.

To reduce noise, we group all rare categories (those with a frequency below a certain threshold, e.g., 50 occurrences) into a single category called "Other".

## Data Pre-Processing


The feature `education` is the only one which implies some kind of order, so we can use an `OrdinalEncoder`.
The features `workclass`, `marital-status`, `relationship`, `race` and `sex` can all be handled by a `OneHotEncoder`.
The features `occupation` and `native-country` have very high cardinality. They will also be handled by a `OneHotEncoder` for now, but we will eventually find a better solution.

As for numerical features, we will only use a `StandardScaler`. 

In [None]:
# Transformation of Text and Categorical Data
# NOTE(review): `categorical_features` and `numerical_features` are assigned in
# the "Separate target from the features" cell further down, so this cell only
# works after that one has run - confirm the intended execution order.
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# `education` is ordinal-encoded; every other categorical column is one-hot encoded.
ordinal_features = ["education"]
not_ordinal_features = categorical_features.copy()
not_ordinal_features.remove("education")

# Unknown education values are encoded as -1 at transform time; unknown one-hot
# categories are ignored.
# NOTE(review): no explicit `categories=` order is passed to OrdinalEncoder, so
# the integer codes follow whatever order the encoder infers from the data -
# confirm that this matches the real ranking of the education levels.
preprocessor = ColumnTransformer([
    ("categorical_ordinal", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ordinal_features),
    ("categorical_not_ordinal", OneHotEncoder(handle_unknown="ignore"), not_ordinal_features),
    ("numerical", StandardScaler(), numerical_features)
])

# Fit the transformers on `data` and transform it in one step.
data_prepared = preprocessor.fit_transform(data)

### Train/Test Split

In [84]:
# Absolute number of rows per target value.
# NOTE(review): `target_name` is not assigned in any cell shown before this one -
# confirm where it is defined.
data[target_name].value_counts()

class
<=50K    24720
>50K      7841
Name: count, dtype: int64

We can see that the distribution of the target class is NOT balanced, so to create our train and test sets we can use a StratifiedShuffleSplit that will not only shuffle the instances but also preserve the proportions in the original dataset.

In [85]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified shuffle split with 20% of the rows held out, seeded for repeatability.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair of index arrays.
train_indexes, test_indexes = next(split.split(data, data[target_name]))
train_set = data.iloc[train_indexes]
test_set = data.iloc[test_indexes]

# Print the class proportions of the full frame and of both subsets for comparison.
for caption, subset in (
    ("Proportions in the original dataset:", data),
    ("Proportions in the train set:", train_set),
    ("Proportions in the test set:", test_set),
):
    print(caption, subset[target_name].value_counts(normalize=True))

Proportions in the original dataset: class
<=50K    0.75919
>50K     0.24081
Name: proportion, dtype: float64
Proportions in the train set: class
<=50K    0.759175
>50K     0.240825
Name: proportion, dtype: float64
Proportions in the test set: class
<=50K    0.759251
>50K     0.240749
Name: proportion, dtype: float64


### Separate target from the features

In [86]:
# Features are every training column except the label; the label is copied out
# of the training set. Note that `data` is rebound to the training features here.
data = train_set.drop(columns=target_name)
target = train_set[target_name].copy()

# Column-name lists by dtype family.
numerical_features = list(data.select_dtypes(include=np.number).columns)
categorical_features = list(data.select_dtypes(include=["object"]).columns)

### Data Cleaning Process

**Removing Low-Impact Features** 

In [None]:
# Combine capital features in capital_weighted
# (the same weighted sum that the EDA cell stores in dff["capital_weighted"]).
df["capital_weighted"] = df["capital-gain"] * 0.223329 + df["capital-loss"] * 0.150526

# Delete the redundant columns
# NOTE(review): the drop is in place, so running this cell a second time fails on
# the now-missing capital-gain / capital-loss columns. It also edits `df`, not the
# `data` / `train_set` frames produced by the split - confirm which frame is
# meant to feed the models.
df.drop(["education", "fnlwgt", "capital-gain", "capital-loss"], axis=1, inplace=True)


**Grouping Rare Categories into 'Other' to Simplify Data**

In [None]:

# Categories seen in fewer rows than this are merged into "Other".
# NOTE(review): the synthesis text quotes 50 as an example threshold while this
# cell uses 500 - confirm which value is intended.
RARE_CATEGORY_MIN_COUNT = 500

# Count the frequency of each category
native_country_counts = df['native-country'].value_counts()
race_counts = df['race'].value_counts()

# Identify categories to keep
to_keep = native_country_counts[native_country_counts >= RARE_CATEGORY_MIN_COUNT].index
to_keep_race = race_counts[race_counts >= RARE_CATEGORY_MIN_COUNT].index

# Replace rare categories with "Other" (vectorised: keep a value where it is in
# the keep-list, substitute 'Other' everywhere else)
df['native-country'] = df['native-country'].where(df['native-country'].isin(to_keep), 'Other')
df['race'] = df['race'].where(df['race'].isin(to_keep_race), 'Other')

# Verify the updated frequencies
print(df['native-country'].value_counts())
print(df['race'].value_counts())

## Training Models

In [None]:
from sklearn.linear_model import LogisticRegression