### Import des libraires 

In [3]:
from snsynth import Synthesizer
import matplotlib.pyplot as plt
import seaborn as sns
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd
import plotly.graph_objects as go
from snsynth.mst import MSTSynthesizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from plotly.subplots import make_subplots
import statsmodels.api as sm
from scipy import stats




### Chargement du jeu de données Adult.data

In [4]:
#aldult.data
pums = pd.read_csv("adult.data", header=None, names=[
    "age", "workclass", "fnlwgt", "education", "education_num", "marital_status", "occupation", 
    "relationship", "race", "sex", "capital_gain", "capital_loss", "hours_per_week", "native_country", "income"])

print(pums.head())  



   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0          2174             0              40   United-States   <=50

### On applique l'algorithme MST (maximum spanning tree)

#### epsilon = 1 (beaucoup de bruitage)

In [5]:
# Pour epsilon = 1 avec allocation correcte
synth_1 = Synthesizer.create("mst", epsilon=0.99, verbose=True)
synth_1.fit(pums, preprocessor_eps=0.01)  # Allouer 0.1 au préprocesseur
pums_synth_1 = synth_1.sample(32561)  # Allouer le reste au synthétiseur

Spent 0.01 epsilon on preprocessor, leaving 0.98 for training
Fitting with 8366677401600000 dimensions
Getting cliques
Estimating marginals


#### epsilon = 100 (un peu de bruitage)

In [6]:
epsilon_total = 100

# Pour epsilon = 100 avec allocation correcte
synth = Synthesizer.create("mst", epsilon=epsilon_total, verbose=True)
synth.fit(pums, preprocessor_eps=1.0)  # Allouer 1.0 au préprocesseur
pums_synth = synth.sample(32561)  # Allouer le reste au synthétiseur


Spent 1.0 epsilon on preprocessor, leaving 99.0 for training
Fitting with 8366677401600000 dimensions
Getting cliques
Estimating marginals


In [None]:
#print(pums_synth.head())

print(pums_synth_1.head())


   age   workclass    fnlwgt      education  education_num  \
0   62     Private  134348.8     Assoc-acdm             12   
1   53     Private  173670.4        HS-grad              9   
2   60     Private  252313.6   Some-college             10   
3   36   Local-gov  232652.8           10th              6   
4   32     Private  193331.2   Some-college             10   

        marital_status          occupation     relationship    race      sex  \
0             Divorced        Adm-clerical    Not-in-family   White     Male   
1   Married-civ-spouse     Farming-fishing          Husband   White     Male   
2              Widowed   Handlers-cleaners   Other-relative   White   Female   
3   Married-civ-spouse     Protective-serv          Husband   White     Male   
4             Divorced               Sales    Not-in-family   White     Male   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0          0.05          0.05              45   United-States   <=50K  
1 

In [None]:
print(pums.head())


   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0          2174             0              40   United-States   <=50

In [None]:
# Suppression des espaces et mise en minuscules 
pums['sex'] = pums['sex'].str.strip().str.lower()
pums_synth['sex'] = pums_synth['sex'].str.strip().str.lower()
pums_synth_1['sex'] = pums_synth_1['sex'].str.strip().str.lower()


female_data = pums[pums['sex'] == 'female']
female_data_synth = pums_synth[pums_synth['sex'] == 'female']
female_data_synth_1 = pums_synth_1[pums_synth_1['sex'] == 'female']

male_data = pums[pums['sex'] == 'male']
male_data_synth = pums_synth[pums_synth['sex'] == 'male']
male_data_synth_1 = pums_synth_1[pums_synth_1['sex'] == 'male']

#print(female_data.head())
print(female_data_synth_1.head())


    age          workclass    fnlwgt      education  education_num  \
2    60            Private  252313.6   Some-college             10   
6    22            Private  252313.6        HS-grad              9   
8    26   Self-emp-not-inc  173670.4        HS-grad              9   
18   51            Private   95027.2           11th              7   
21   42            Private  114688.0        HS-grad              9   

    marital_status          occupation     relationship    race     sex  \
2          Widowed   Handlers-cleaners   Other-relative   White  female   
6    Never-married       Other-service    Not-in-family   Black  female   
8    Never-married               Sales        Own-child   White  female   
18        Divorced        Craft-repair        Unmarried   White  female   
21        Divorced        Craft-repair        Unmarried   Black  female   

    capital_gain  capital_loss  hours_per_week  native_country  income  
2           0.95          0.05              45         

In [None]:
# La retraite à 60 ans
female_data = female_data[female_data['age'] <= 60]
female_data_synth = female_data_synth[female_data_synth['age'] <= 60]
female_data_synth_1 = female_data_synth_1[female_data_synth_1['age'] <= 60]


male_data = male_data[male_data['age'] <= 60]
male_data_synth = male_data_synth[male_data_synth['age'] <= 60]
male_data_synth_1 = male_data_synth_1[male_data_synth_1['age'] <= 60]



NameError: name 'female_data' is not defined

In [None]:


# Calculer la moyenne des heures par semaine pour les femmes par tranche d'âge
women_avg_hours = female_data.groupby('age')['hours_per_week'].mean().reset_index()
women_avg_hours_synth = female_data_synth.groupby('age')['hours_per_week'].mean().reset_index()
women_avg_hours_synth_1 = female_data_synth_1.groupby('age')['hours_per_week'].mean().reset_index()

# Calculer la moyenne des revenus pour les hommes par tranche d'âge
men_avg_income = male_data.groupby('age')['hours_per_week'].mean().reset_index()
men_avg_income_synth = male_data_synth.groupby('age')['hours_per_week'].mean().reset_index()
men_avg_income_synth_1 = male_data_synth_1.groupby('age')['hours_per_week'].mean().reset_index()

#print(female_data.head())
#print(female_data_synth.head())

print(women_avg_hours.mean())
print('-------------------')
print(women_avg_hours_synth.mean())
print('----------------')
print(women_avg_hours_synth_1.mean())


age               38.500000
hours_per_week    37.302894
dtype: float64
-------------------
age               38.500000
hours_per_week    40.580856
dtype: float64
----------------
age               38.500000
hours_per_week    40.591669
dtype: float64


In [None]:

# Créer un subplot avec 2 lignes
fig = make_subplots(rows=2, cols=1, 
                    subplot_titles=("Heures par semaine des Femmes", "Revenu des Hommes"))

# Ajout des heures par semaine pour les femmes (rose)
fig.add_trace(go.Scatter(
    x=women_avg_hours['age'], 
    y=women_avg_hours['hours_per_week'], 
    mode='lines+markers',
    name='Heures par semaine (Femmes)',
    line=dict(color='lightpink'),  # Rose clair
    marker=dict(color='pink'),       # Rose
), row=1, col=1)

# Ajout des heures par semaine pour les femmes dans synth (rose)
fig.add_trace(go.Scatter(
    x=women_avg_hours_synth['age'], 
    y=women_avg_hours_synth['hours_per_week'], 
    mode='lines+markers',
    name='Heures par semaine (Femmes Synth)',
    line=dict(color='red'),          # Rose
    marker=dict(color='lightpink')   # Rose clair
), row=1, col=1)

# Ajout des heures par semaine pour les femmes dans synth 1000 (rose)

fig.add_trace(go.Scatter(
    x = women_avg_hours_synth_1['age'],
    y = women_avg_hours_synth_1['hours_per_week'],
    mode = 'lines+markers',
    name = 'Heures par semaine (Femmes Synth 100)',
    line = dict(color = 'yellow'),
    marker = dict(color = 'yellow')
), row = 1, col = 1)

# Ajout des revenus pour les hommes (bleu)
fig.add_trace(go.Scatter(
    x=men_avg_income['age'], 
    y=men_avg_income['hours_per_week'], 
    mode='lines+markers',
    name='Heures par semaines (Hommes)',
    line=dict(color='blue'),         # Bleu
    marker=dict(color='blue')   # Bleu clair
), row=2, col=1)

# Ajout des revenus pour les hommes dans synth (bleu clair)
fig.add_trace(go.Scatter(
    x=men_avg_income_synth['age'], 
    y=men_avg_income_synth['hours_per_week'], 
    mode='lines+markers',
    name='Heures par semaines (Hommes Synth)',
    line=dict(color='lightblue'),    # Bleu clair
    marker=dict(color='lightblue')         # Bleu
), row=2, col=1)

# Ajout des revenus pour les hommes dans synth 1000 (bleu clair)
fig.add_trace(go.Scatter(
    x=men_avg_income_synth_1['age'],
    y=men_avg_income_synth_1['hours_per_week'],
    mode='lines+markers',
    name='Heures par semaines (Hommes Synth 100)',
    line=dict(color='green'),
    marker=dict(color='green')
), row=2, col=1)


# Mise en forme du graphique
fig.update_layout(
    title="Comparaison des heures par semaine des Femmes et des Hommes selon l'Âge",
    xaxis_title="Âge",
    yaxis_title="Moyenne des heures par semaine",
    legend_title="Variable",
    template="plotly_white"
)

# Affichage du graphique
fig.show()


In [None]:

# Créer un subplot avec 2 lignes et 1 colonne pour les histogrammes
fig = make_subplots(rows=2, cols=1, 
                    subplot_titles=("Distribution des Heures par semaine des Femmes", 
                                    "Distribution des Heures par semaine des Hommes"))

# Ajout de l'histogramme des heures par semaine pour les femmes (rose)
fig.add_trace(go.Histogram(
    x=women_avg_hours['hours_per_week'],  # Données des femmes
    name='Heures par semaine (Femmes)',
    marker_color='lightpink',  # Couleur rose
    opacity=0.7
), row=1, col=1)

# Ajout de l'histogramme des heures par semaine pour les femmes synthétiques (rouge)
fig.add_trace(go.Histogram(
    x=women_avg_hours_synth['hours_per_week'],  # Données synthétiques des femmes
    name='Heures par semaine (Femmes Synth)',
    marker_color='red',  # Couleur rouge
    opacity=0.5
), row=1, col=1)

# Ajout de l'histogramme des heures par semaine pour les femmes synthétiques 1000 (orange)
fig.add_trace(go.Histogram(
    x=women_avg_hours_synth_1['hours_per_week'],  # Données synthétiques 1000 des femmes
    name='Heures par semaine (Femmes Synth 100)',
    marker_color='orange',  # Couleur orange
    opacity=0.5
), row=1, col=1)

# Ajout de l'histogramme des heures par semaine pour les hommes (bleu)
fig.add_trace(go.Histogram(
    x=men_avg_income['hours_per_week'],  # Données des hommes
    name='Heures par semaine (Hommes)',
    marker_color='blue',  # Couleur bleue
    opacity=0.7
), row=2, col=1)

# Ajout de l'histogramme des heures par semaine pour les hommes synthétiques (bleu clair)
fig.add_trace(go.Histogram(
    x=men_avg_income_synth['hours_per_week'],  # Données synthétiques des hommes
    name='Heures par semaine (Hommes Synth)',
    marker_color='lightblue',  # Couleur bleu clair
    opacity=0.5
), row=2, col=1)

# Ajout de l'histogramme des heures par semaine pour les hommes synthétiques 1000 (vert)
fig.add_trace(go.Histogram(
    x=men_avg_income_synth_1['hours_per_week'],  # Données synthétiques 1000 des hommes
    name='Heures par semaine (Hommes Synth 100)',
    marker_color='green',  # Couleur verte
    opacity=0.5
), row=2, col=1)

# Mise en forme du graphique
fig.update_layout(
    title="Distribution des Heures par semaine selon le Sexe",
    xaxis_title="Ages des individus",
    yaxis_title="heures de travail moyennes\npar semaine",
    barmode='overlay',  # Superposer les histogrammes
    template="plotly_white"
)

# Affichage du graphique
fig.show()


## How does the correlation between education level (education_num), hours worked per week (hours_per_week), and the likelihood of earning >50K vary across different work classes (workclass)?

In [None]:
pums = pd.read_csv("adult.data", header=None, names=[
    "age", "workclass", "fnlwgt", "education", "education_num", "marital_status", "occupation", 
    "relationship", "race", "sex", "capital_gain", "capital_loss", "hours_per_week", "native_country", "income"])


In [None]:
# Strip leading/trailing whitespaces from 'income' column and map to binary values
pums['income'] = pums['income'].str.strip().replace({'<=50K': 0, '>50K': 1})
pums_synth['income'] = pums_synth['income'].str.strip().replace({'<=50K': 0, '>50K': 1})
pums_synth_1['income'] = pums_synth_1['income'].str.strip().replace({'<=50K': 0, '>50K': 1})

# Group by workclass
grouped = pums.groupby('workclass')



Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [None]:
print(pums['income'].value_counts())

income
0    24720
1     7841
Name: count, dtype: int64


In [None]:
# Non private data

In [None]:
#print(pums.head())

import statsmodels.formula.api as smf

# Fit a logistic regression model
model = smf.logit('income ~ age + education_num + hours_per_week + sex', data=pums).fit()
model_1 = smf.logit('income ~ age + education_num + hours_per_week + sex', data=pums_synth_1).fit()
model_100 = smf.logit('income ~ age + education_num + hours_per_week +sex', data=pums_synth).fit()

# Print the summary
print(model.summary())
#print('-------------------')
print(model_1.summary())
print('-------------------')
print(model_100.summary())






Optimization terminated successfully.
         Current function value: 0.428692
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.521954
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.524477
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                 income   No. Observations:                32561
Model:                          Logit   Df Residuals:                    32556
Method:                           MLE   Df Model:                            4
Date:                Thu, 03 Oct 2024   Pseudo R-squ.:                  0.2234
Time:                        13:42:10   Log-Likelihood:                -13959.
converged:                       True   LL-Null:                       -17974.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                     coef    std err          z      P>|z|    

In [None]:
import statsmodels.api as sm
from scipy import stats

df_model = model.df_model

null_deviance = -2 * model.llnull
print("Null deviance:", null_deviance)

# residual deviance = -2 * log likelihood du modèle complet
residual_deviance = -2 * model.llf
print("Residual deviance:", residual_deviance)

chi2_stat = null_deviance - residual_deviance

# Calcul du p-value
p_value = 1 - stats.chi2.cdf(chi2_stat, df_model)

print(f"Chi-deux Statistic: {chi2_stat:.2f}")
print(f"Degrees of Freedom: {df_model}")
print(f"P-value: {p_value:.3e}")

# Interprétation
if p_value < 0.05:
    print("Le modèle est significatif au niveau de 5%.")
else:
    print("Le modèle n'est pas significatif au niveau de 5%.")



Null deviance: 35948.079435306325
Residual deviance: 27917.30608744315
Chi-deux Statistic: 8030.77
Degrees of Freedom: 4.0
P-value: 0.000e+00
Le modèle est significatif au niveau de 5%.


In [None]:


df_model_1 = model_1.df_model

null_deviance = -2 * model_1.llnull
print("Null deviance:", null_deviance)

# residual deviance = -2 * log likelihood du modèle complet
residual_deviance = -2 * model_1.llf
print("Residual deviance:", residual_deviance)

chi2_stat = null_deviance - residual_deviance

# Calcul du p-value
p_value = 1 - stats.chi2.cdf(chi2_stat, df_model_1)

print(f"Chi-deux Statistic: {chi2_stat:.2f}")
print(f"Degrees of Freedom: {df_model_1}")
print(f"P-value: {p_value:.3e}")

# Interprétation
if p_value < 0.05:
    print("Le modèle est significatif au niveau de 5%.")
else:
    print("Le modèle n'est pas significatif au niveau de 5%.")



Null deviance: 35835.14738780934
Residual deviance: 33990.65914360038
Chi-deux Statistic: 1844.49
Degrees of Freedom: 4.0
P-value: 0.000e+00
Le modèle est significatif au niveau de 5%.


In [None]:

df_model_100 = model_100.df_model

null_deviance = -2 * model_100.llnull
print("Null deviance:", null_deviance)

# residual deviance = -2 * log likelihood du modèle complet
residual_deviance = -2 * model_100.llf
print("Residual deviance:", residual_deviance)

chi2_stat = null_deviance - residual_deviance

# Calcul du p-value
p_value = 1 - stats.chi2.cdf(chi2_stat, df_model_100)

print(f"Chi-deux Statistic: {chi2_stat:.2f}")
print(f"Degrees of Freedom: {df_model_100}")
print(f"P-value: {p_value:.3e}")

# Interprétation
if p_value < 0.05:
    print("Le modèle est significatif au niveau de 5%.")
else:
    print("Le modèle n'est pas significatif au niveau de 5%.")



Null deviance: 35950.375761232055
Residual deviance: 34154.96226198315
Chi-deux Statistic: 1795.41
Degrees of Freedom: 4.0
P-value: 0.000e+00
Le modèle est significatif au niveau de 5%.
