## UCI Adult Dataset - Exploratory and Descriptive Analysis  
This notebook focuses on the exploratory and descriptive analysis of the UCI Adult Income Dataset.

In [1]:
# Import libraries 
import os
import pandas as pd
import numpy as np
import plotly.express as px

## Define and Create Paths

In [66]:
# Resolve the project folder layout relative to the notebook's working directory.
# Get working directory
current_dir = os.getcwd()

# Go one directory up to the root directory
project_root_dir = os.path.dirname(current_dir)

# Data folders: <root>/data/raw and <root>/data/processed
data_dir = os.path.join(project_root_dir, 'data')
raw_dir = os.path.join(data_dir, 'raw')
processed_dir = os.path.join(data_dir, 'processed')
# Define path to results folder
results_dir = os.path.join(project_root_dir, 'results')
# Define path to docs folder
docs_dir = os.path.join(project_root_dir, 'docs')

# Create every directory exactly once if it does not exist yet.
# (Creating raw_dir / processed_dir also creates their parent data_dir.)
for directory in (raw_dir, processed_dir, results_dir, docs_dir):
    os.makedirs(directory, exist_ok=True)

## Read in the data

In [3]:
# Load the cleaned dataset from the processed-data folder and preview the first 10 rows
adult_data_filename= os.path.join(processed_dir, "adult_cleaned.csv")
adult_df = pd.read_csv(adult_data_filename)
adult_df.head(10)

Unnamed: 0,age,workclass,fnlwgt,education_num,marital_status,relationship,race,sex,capital_gain,capital_loss,hours_per_week,income,education_level,occupation_grouped,native_region,age_group
0,39,government,77516,13,single,single,white,male,2174,0,40,<=50k,tertiary,white collar,north america,36-45
1,50,self-emp-not-inc,83311,13,married,male spouse,white,male,0,0,13,<=50k,tertiary,white collar,north america,46-60
2,38,private,215646,9,divorced or separated,single,white,male,0,0,40,<=50k,secondary-high school graduate,blue collar,north america,36-45
3,53,private,234721,7,married,male spouse,black,male,0,0,40,<=50k,secondary,blue collar,north america,46-60
4,28,private,338409,13,married,female spouse,black,female,0,0,40,<=50k,tertiary,white collar,central america,26-35
5,37,private,284582,14,married,female spouse,white,female,0,0,40,<=50k,tertiary,white collar,north america,36-45
6,49,private,160187,5,divorced or separated,single,black,female,0,0,16,<=50k,secondary,service,central america,46-60
7,52,self-emp-not-inc,209642,9,married,male spouse,white,male,0,0,45,>50k,secondary-high school graduate,white collar,north america,46-60
8,31,private,45781,14,single,single,white,female,14084,0,50,>50k,tertiary,white collar,north america,26-35
9,42,private,159449,13,married,male spouse,white,male,5178,0,40,>50k,tertiary,white collar,north america,36-45


## Check the shape of the dataset and data types

In [4]:
# (number of rows, number of columns)
adult_df.shape

(32513, 16)

In [5]:
# Column names, non-null counts and dtypes
adult_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32513 entries, 0 to 32512
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   age                 32513 non-null  int64 
 1   workclass           32513 non-null  object
 2   fnlwgt              32513 non-null  int64 
 3   education_num       32513 non-null  int64 
 4   marital_status      32513 non-null  object
 5   relationship        32513 non-null  object
 6   race                32513 non-null  object
 7   sex                 32513 non-null  object
 8   capital_gain        32513 non-null  int64 
 9   capital_loss        32513 non-null  int64 
 10  hours_per_week      32513 non-null  int64 
 11  income              32513 non-null  object
 12  education_level     32513 non-null  object
 13  occupation_grouped  32513 non-null  object
 14  native_region       32513 non-null  object
 15  age_group           32513 non-null  object
dtypes: int64(6), object(10)

## Summary statistics
### Numerical variables

In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
adult_df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32513.0,32513.0,32513.0,32513.0,32513.0,32513.0
mean,38.590256,189794.2,10.081629,1079.239812,87.432719,40.440962
std,13.638932,105578.8,2.572015,7390.62565,403.243596,12.350184
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117833.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


**Categorical Variables**

In [7]:
# Summary (count, unique values, most frequent value and its frequency) for the object columns
adult_df.describe(include='object')

Unnamed: 0,workclass,marital_status,relationship,race,sex,income,education_level,occupation_grouped,native_region,age_group
count,32513,32513,32513,32513,32513,32513,32513,32513,32513,32513
unique,8,4,5,5,2,2,7,5,6,7
top,private,married,male spouse,white,male,<=50k,secondary-high school graduate,white collar,north america,26-35
freq,22650,14984,13178,27771,21758,24677,10484,16532,30018,8501


In [8]:
# Proportion of records in each workclass category
adult_df['workclass'].value_counts(normalize=True)

workclass
private             0.696644
self-emp-not-inc    0.078123
government          0.069418
local-gov           0.064374
unknown             0.056470
self-employed       0.034325
voluntary           0.000431
unemployed          0.000215
Name: proportion, dtype: float64

In [9]:
# Proportion of records in each marital_status category
adult_df['marital_status'].value_counts(normalize=True)

marital_status
married                  0.460862
single                   0.327684
divorced or separated    0.180912
widowed                  0.030542
Name: proportion, dtype: float64

In [10]:
# Proportion of records in each relationship category
adult_df['relationship'].value_counts(normalize=True)

relationship
male spouse          0.405315
single               0.360686
child                0.155599
female spouse        0.048227
extended relative    0.030173
Name: proportion, dtype: float64

In [11]:
# Proportion of records in each race category
adult_df['race'].value_counts(normalize=True)

race
white                        0.854151
black                        0.096023
asian or pacific islander    0.031926
american indian or eskimo    0.009565
other                        0.008335
Name: proportion, dtype: float64

**Income Distribution**

In [12]:
# Number of records in each income class, as a two-column frame (income, total)
adult_df_income = (
    adult_df
    .groupby('income')
    .size()
    .reset_index(name='total')
)
adult_df_income

Unnamed: 0,income,total
0,<=50k,24677
1,>50k,7836


In [78]:
pip install --upgrade plotly





In [82]:
# Pie chart of the overall split between the income classes
fig = px.pie(
    adult_df_income,
    names='income',
    values='total',
    title='Overall Income Distribution',
    color_discrete_sequence=px.colors.sequential.RdBu,
)

# Presentation theme on a fully transparent background
fig.update_layout(
    template="presentation",
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0,0,0,0)",
)

fig.show()

# Export the figure as static images (JPG and PNG) into the results folder
for image_name in ('income_distribution_pie_chart.jpg',
                   'income_distribution_pie_chart.png'):
    fig.write_image(os.path.join(results_dir, image_name))

# Export the HTML version of the figure
html_str = fig.to_html()
with open(os.path.join(results_dir, 'income_distribution_pie_chart.html'), 'w', encoding='utf-8') as f:
    f.write(html_str)



**Income by Age Group**

In [14]:
# Number of records per (age_group, income) pair, ordered by age group then income
adult_df_income_age = (
    adult_df
    .groupby(['age_group', 'income'])
    .size()
    .reset_index(name='total_by_age')
    .sort_values(['age_group', 'income'])
)
adult_df_income_age

Unnamed: 0,age_group,income,total_by_age
0,18-25,<=50k,5333
1,18-25,>50k,114
2,26-35,<=50k,6910
3,26-35,>50k,1591
4,36-45,<=50k,5230
5,36-45,>50k,2771
6,46-60,<=50k,4479
7,46-60,>50k,2809
8,61-75,<=50k,1580
9,61-75,>50k,511


In [15]:
# Number of rows per age group in the aggregated frame (one row per income class present),
# not the number of individuals. total_per_group is reassigned with summed counts later on.
total_per_group = adult_df_income_age.groupby('age_group').size()
total_per_group

age_group
18-25    2
26-35    2
36-45    2
46-60    2
61-75    2
76+      2
<18      1
dtype: int64

In [16]:
# Sum the counts within each age group and broadcast that total back onto every row
total_per_group = adult_df_income_age.groupby('age_group')['total_by_age'].transform('sum')
total_per_group

0     5447
1     5447
2     8501
3     8501
4     8001
5     8001
6     7288
7     7288
8     2091
9     2091
10     240
11     240
12     945
Name: total_by_age, dtype: int64

In [17]:
# Group total repeated on every row, so it lines up with the per-class counts
total_per_group = (
    adult_df_income_age
    .groupby('age_group')['total_by_age']
    .transform('sum')
)
# Share (in percent) of each income class within its age group
adult_df_income_age['percentage'] = 100 * (adult_df_income_age['total_by_age'] / total_per_group)
adult_df_income_age

Unnamed: 0,age_group,income,total_by_age,percentage
0,18-25,<=50k,5333,97.907105
1,18-25,>50k,114,2.092895
2,26-35,<=50k,6910,81.284555
3,26-35,>50k,1591,18.715445
4,36-45,<=50k,5230,65.366829
5,36-45,>50k,2771,34.633171
6,46-60,<=50k,4479,61.45719
7,46-60,>50k,2809,38.54281
8,61-75,<=50k,1580,75.561932
9,61-75,>50k,511,24.438068


In [81]:
# Grouped bar chart: share of each income class within every age group.
fig = px.bar(
    adult_df_income_age,
    x='age_group',
    # Plot the within-group share so the bars agree with the "(%)" title,
    # the y-axis label and the percentage text shown on each bar.
    y='percentage',
    color='income',
    title='Income Distribution by Age Group (%)',
    barmode='group',
    color_discrete_sequence=px.colors.sequential.RdBu,
    text='percentage',
    # The labels sort lexically ('<18' would come last); force chronological order.
    category_orders={'age_group': ['<18', '18-25', '26-35', '36-45', '46-60', '61-75', '76+']}
)
# Bar labels: two decimals followed by a literal percent sign
fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig.update_layout(template="presentation", xaxis_title='Age Group',
                  yaxis_title='Percentage of population', legend_title=dict(text='Income Level'),
                  paper_bgcolor = "rgba(0, 0, 0, 0)", plot_bgcolor = "rgba(0, 0, 0, 0)")
fig.show()

# Export the figure as static images and as HTML into the results folder
fig.write_image(os.path.join(results_dir, 'income_distribution_by_agegroup_bar_plot.jpg'))
fig.write_image(os.path.join(results_dir, 'income_distribution_by_agegroup_bar_plot.png'))
html_str = fig.to_html()

with open(os.path.join(results_dir, 'income_distribution_by_agegroup_bar_plot.html'), 'w', encoding='utf-8') as f:
    f.write(html_str)

In [19]:
# Preview the current figure under each built-in Plotly template.
# Note: this mutates `fig` in place, so after the loop its template is "none".
themes = ["plotly", "plotly_white", "plotly_dark", "ggplot2", "seaborn", "simple_white", "presentation", "xgridoff", "ygridoff", "gridon", "none"]

for theme in themes:
    fig.update_layout(template=theme)
    fig.show()


In [22]:
#pip install -U kaleido

In [23]:
#pip install -U plotly

**Income by Native Region** 

In [25]:
# Number of records per (native_region, income) pair
adult_df_income_native_region = (
    adult_df
    .groupby(['native_region', 'income'])
    .size()
    .reset_index(name='total_income_distribution')
)
adult_df_income_native_region

Unnamed: 0,native_region,income,total_income_distribution
0,asia,<=50k,465
1,asia,>50k,206
2,central america,<=50k,466
3,central america,>50k,58
4,europe,<=50k,369
5,europe,>50k,152
6,north america,<=50k,22768
7,north america,>50k,7250
8,other,<=50k,435
9,other,>50k,146


In [30]:
# Number of rows per native region in the aggregated frame (one row per income class present),
# not the number of individuals. total_per_group is reassigned with summed counts later on.
total_per_group = adult_df_income_native_region.groupby('native_region').size()
total_per_group


native_region
asia               2
central america    2
europe             2
north america      2
other              2
south america      2
dtype: int64

In [32]:
# Sum the counts within each native region and broadcast that total back onto every row
total_per_group = adult_df_income_native_region.groupby('native_region')['total_income_distribution'].transform('sum')
total_per_group

0       671
1       671
2       524
3       524
4       521
5       521
6     30018
7     30018
8       581
9       581
10      198
11      198
Name: total_income_distribution, dtype: int64

In [34]:
# Group total repeated on every row, so it lines up with the per-class counts
total_per_group = (
    adult_df_income_native_region
    .groupby('native_region')['total_income_distribution']
    .transform('sum')
)
# Share (in percent) of each income class within its native region
adult_df_income_native_region['percentage'] = 100 * (
    adult_df_income_native_region['total_income_distribution'] / total_per_group
)
adult_df_income_native_region

Unnamed: 0,native_region,income,total_income_distribution,percentage
0,asia,<=50k,465,69.299553
1,asia,>50k,206,30.700447
2,central america,<=50k,466,88.931298
3,central america,>50k,58,11.068702
4,europe,<=50k,369,70.825336
5,europe,>50k,152,29.174664
6,north america,<=50k,22768,75.847825
7,north america,>50k,7250,24.152175
8,other,<=50k,435,74.870912
9,other,>50k,146,25.129088


In [83]:
# Grouped bar chart: share of each income class within every native region
fig = px.bar(adult_df_income_native_region,
             x='native_region',
             y='percentage',
             color='income',
             title='Income Distribution by Native Region (%)',
             barmode='group',
             height=500,
             color_discrete_sequence=px.colors.sequential.RdBu,
             text='percentage')

# Bar labels: two decimals plus a percent sign, drawn above the bars
fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside')

# Theme, axis/legend titles and margins
fig.update_layout(template="presentation",
                  xaxis_title='Native Region',
                  yaxis_title='Percentage of Population',
                  legend_title_text='Income Level',
                  xaxis_title_standoff=30,
                  margin=dict(l=50, r=50, t=50, b=50))

fig.show()

# Export the figure as static images (JPG and PNG) into the results folder
for image_name in ('income_distribution_by_Native Region_bar_plot.jpg',
                   'income_distribution_by_Native Region_bar_plot.png'):
    fig.write_image(os.path.join(results_dir, image_name))

# Export the HTML version of the figure
html_str = fig.to_html()
with open(os.path.join(results_dir, 'income_distribution_by_Native Region_bar_plot.html'), 'w', encoding='utf-8') as f:
    f.write(html_str)


**Income by Race** 

In [44]:
# Number of records per (race, income) pair
adult_df_income_race = (
    adult_df
    .groupby(['race', 'income'])
    .size()
    .reset_index(name='total_by_race')
)
adult_df_income_race

Unnamed: 0,race,income,total_by_race
0,american indian or eskimo,<=50k,275
1,american indian or eskimo,>50k,36
2,asian or pacific islander,<=50k,762
3,asian or pacific islander,>50k,276
4,black,<=50k,2735
5,black,>50k,387
6,other,<=50k,246
7,other,>50k,25
8,white,<=50k,20659
9,white,>50k,7112


In [45]:
# Number of rows per race in the aggregated frame (one row per income class present),
# not the number of individuals. total_per_group is reassigned with summed counts later on.
total_per_group = adult_df_income_race.groupby('race').size()
total_per_group

race
american indian or eskimo    2
asian or pacific islander    2
black                        2
other                        2
white                        2
dtype: int64

In [46]:
# Sum the counts within each race and broadcast that total back onto every row
total_per_group = adult_df_income_race.groupby('race')['total_by_race'].transform('sum')
total_per_group

0      311
1      311
2     1038
3     1038
4     3122
5     3122
6      271
7      271
8    27771
9    27771
Name: total_by_race, dtype: int64

In [47]:
# Group total repeated on every row, so it lines up with the per-class counts
total_per_group = (
    adult_df_income_race
    .groupby('race')['total_by_race']
    .transform('sum')
)
# Share (in percent) of each income class within its race category
adult_df_income_race['percentage'] = 100 * (adult_df_income_race['total_by_race'] / total_per_group)
adult_df_income_race

Unnamed: 0,race,income,total_by_race,percentage
0,american indian or eskimo,<=50k,275,88.424437
1,american indian or eskimo,>50k,36,11.575563
2,asian or pacific islander,<=50k,762,73.410405
3,asian or pacific islander,>50k,276,26.589595
4,black,<=50k,2735,87.6041
5,black,>50k,387,12.3959
6,other,<=50k,246,90.774908
7,other,>50k,25,9.225092
8,white,<=50k,20659,74.390551
9,white,>50k,7112,25.609449


In [84]:
# Grouped bar chart: share of each income class within every race category.
fig = px.bar(
    adult_df_income_race,
    x = 'race',
    y = 'percentage',
    color = 'income',
    title='Income Distribution Per Race ',
    barmode='group',
    color_discrete_sequence=px.colors.sequential.RdBu,
    text='percentage'
)
# Bar labels: two decimals plus a percent sign, drawn above the bars
fig.update_traces(texttemplate = '%{text:.2f}%', textposition='outside')
# Same theme and human-readable axis/legend titles as the other income charts,
# instead of the raw column names plotly falls back to.
fig.update_layout(template="presentation",
                  xaxis_title='Race',
                  yaxis_title='Percentage of Population',
                  legend_title=dict(text='Income Level'))
fig.show()

# Export the figure as static images and as HTML into the results folder
fig.write_image(os.path.join(results_dir, 'income_distribution_by_Race_bar_plot.jpg'))
fig.write_image(os.path.join(results_dir, 'income_distribution_by_Race_bar_plot.png'))
html_str = fig.to_html()

with open(os.path.join(results_dir, 'income_distribution_by_Race_bar_plot.html'), 'w', encoding='utf-8') as f:
    f.write(html_str)

**Education and occupation**

In [53]:
# Number of records per (education_level, occupation_grouped, income) triple, largest first
adult_df_income_edu_occ = (
    adult_df
    .groupby(['education_level', 'occupation_grouped', 'income'])
    .size()
    .reset_index(name='total')
    .sort_values('total', ascending = False)
)
adult_df_income_edu_occ

Unnamed: 0,education_level,occupation_grouped,income,total
44,secondary-high school graduate,blue collar,<=50k,3976
63,tertiary,white collar,>50k,3545
62,tertiary,white collar,<=50k,3369
32,same college,white collar,<=50k,3003
52,secondary-high school graduate,white collar,<=50k,2900
...,...,...,...,...
21,primary,unknow,>50k,4
14,preschool,white collar,<=50k,3
37,secondary,military,>50k,2
11,preschool,military,<=50k,2


In [54]:
# Combined "education | occupation" label used as the category axis of the chart
adult_df_income_edu_occ['edu_occ'] = (
    adult_df_income_edu_occ['education_level']
    + " | "
    + adult_df_income_edu_occ['occupation_grouped']
)
adult_df_income_edu_occ

Unnamed: 0,education_level,occupation_grouped,income,total,edu_occ
44,secondary-high school graduate,blue collar,<=50k,3976,secondary-high school graduate | blue collar
63,tertiary,white collar,>50k,3545,tertiary | white collar
62,tertiary,white collar,<=50k,3369,tertiary | white collar
32,same college,white collar,<=50k,3003,same college | white collar
52,secondary-high school graduate,white collar,<=50k,2900,secondary-high school graduate | white collar
...,...,...,...,...,...
21,primary,unknow,>50k,4,primary | unknow
14,preschool,white collar,<=50k,3,preschool | white collar
37,secondary,military,>50k,2,secondary | military
11,preschool,military,<=50k,2,preschool | military


In [85]:
# Number of top rows (largest counts first) to show; drives both the data slice and the title
num = 15
# Horizontal bar chart of the largest education/occupation/income groups.
# No barmode is set, so plotly's default applies to bars sharing an 'edu_occ' label.
fig = px.bar(
    adult_df_income_edu_occ.head(num),
    x = 'total',
    y = 'edu_occ',
    color = 'income',
    orientation = 'h',
    title = f'Top {num} Education and Occupation Groups Combinations by Income Group',
    height = 700,
    width=1100,
    color_discrete_sequence=px.colors.sequential.RdBu,
    text = 'total'
)

# Wide left margin leaves room for the long "education | occupation" labels
fig.update_layout(template="presentation", xaxis_title='Number of Individuals',
                  yaxis_title='Education | Occupation Group',
                  legend_title=dict(text='Income Level'),
                  margin=dict(l=450, r=50, t= 50, b=50))
fig.update_traces(textposition='inside')

fig.show()

# Export the figure as static images and as HTML into the results folder
fig.write_image(os.path.join(results_dir, 'income_distribution_by_Education and Occupation_bar_plot.jpg'))
fig.write_image(os.path.join(results_dir, 'income_distribution_by_Education and Occupation_bar_plot.png'))
html_str = fig.to_html()

with open(os.path.join(results_dir, 'income_distribution_by_Education and Occupation_bar_plot.html'), 'w', encoding='utf-8') as f:
    f.write(html_str)