In [None]:
# packages 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.preprocessing import MinMaxScaler
from statsmodels.graphics.mosaicplot import mosaic
%matplotlib inline


In [None]:
# Load the adult income data set and split it by income class.
income_data_file = 'adult/adult.data'
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]

# Fields are separated by ", " (comma plus space); a multi-character separator
# needs the python parser engine. '?' marks a missing value, and every row that
# contains one is discarded.
# Notes from the original author: 32561 rows before the drop, 30162 after.
raw_data = pd.read_csv(
    income_data_file,
    sep=', ',
    engine='python',
    names=columns,
    na_values=['?', ' ?'],
).dropna()

# One frame per income class, reused by the figures below.
# Notes from the original author: 7508 rows above 50K, 22654 rows at or below 50K.
income_labels = raw_data['income']
above_50K = raw_data.loc[income_labels == '>50K']
below_equal_50K = raw_data.loc[income_labels == '<=50K']



In [None]:
# User Story 1: relationship between income and age/education

# Figure 1(a): income versus age group.
# Bin ages into labelled ranges in one step. pd.cut uses right-closed intervals,
# so these edges give (17, 30] -> '18-30', (30, 40] -> '31-40', ...,
# (70, inf] -> 'Above 70'.
# Ages below 18 fall outside every bin, stay NaN, and are therefore left out of
# the count plot (same as the previous hand-written range checks).
age_bin_edges = [17, 30, 40, 50, 60, 70, np.inf]
age_group_labels = ['18-30', '31-40', '41-50', '51-60', '61-70', 'Above 70']
raw_data['age_group'] = pd.cut(
    raw_data['age'], bins=age_bin_edges, labels=age_group_labels
).astype(object)  # plain string column rather than a categorical

fig_1a, ax_1a = plt.subplots(figsize=(20, 15))
# Pass the frame by keyword and the column by name, like the other count plots.
sns.countplot(
    data=raw_data,
    x='age_group',
    hue='income',
    order=age_group_labels,
    palette='pastel',
    ax=ax_1a,
)
ax_1a.set_title("Age Group versus Individual Incomes", fontsize=18, fontweight='bold')
ax_1a.set_ylabel("Number of Individuals")
ax_1a.set_xlabel("Age Groups")
ax_1a.legend(fontsize=20)

# Figure 1(b): income versus education level.
# The x axis is the categorical 'education' column (not 'education-num'), so the
# title names the education level to agree with the axis label.
fig_1b, ax_1b = plt.subplots(figsize=(20, 15))
sns.countplot(data=raw_data, x='education', hue='income', ax=ax_1b)
ax_1b.set_title("Education Level versus Individual Income")
ax_1b.set_ylabel("Number of Individuals")
ax_1b.set_xlabel("Education Level")
ax_1b.legend(fontsize=20)

# Figure 2(a): share of each education level among people with income > 50K.

# Counter keeps the levels in order of first appearance, which fixes the wedge order.
above_counter = Counter(above_50K['education'])
above_levels = list(above_counter.keys())
above_level_counts = list(above_counter.values())

fig_2a, ax_2a = plt.subplots(figsize=(20, 15))
ax_2a.pie(
    above_level_counts,
    labels=above_levels,
    autopct='%1.0f%%',  # whole-number percentage on each wedge
    wedgeprops={'linewidth': 3, 'edgecolor': 'white'},  # white outline around wedges
)
ax_2a.set_title("Pie chart for different education levels of people with income > 50K")
ax_2a.legend()
plt.show()

# Figure 2(b): share of each education level among people with income <= 50K.
# Counter keeps the levels in order of first appearance, which fixes the wedge order.
below_queal_counter = Counter(below_equal_50K['education'])
below_levels = list(below_queal_counter.keys())
below_level_counts = list(below_queal_counter.values())

fig_2b, ax_2b = plt.subplots(figsize=(20, 15))
ax_2b.pie(
    below_level_counts,
    labels=below_levels,
    autopct='%1.0f%%',  # whole-number percentage on each wedge
    wedgeprops={'linewidth': 3, 'edgecolor': 'white'},  # white outline around wedges
)
ax_2b.set_title("Pie chart for different education levels of people with income <= 50K")
ax_2b.legend()
plt.show()

In [None]:
# User Story 2: income versus marital status / occupation
# Figure 3(a): number of individuals per marital status, split by income class.
fig_3a, ax_3a = plt.subplots(figsize=(20, 16))
sns.countplot(data=raw_data, x='marital-status', hue='income', ax=ax_3a)
ax_3a.set_title(
    "Income of Individuals versus  Marital-Status",
    fontsize=18,
    fontweight='bold',
)
ax_3a.set_ylabel("Number of Individuals")
ax_3a.legend(fontsize=20)

# Figure 3(b): number of individuals per occupation, split by income class.
fig_3b, ax_3b = plt.subplots(figsize=(20, 16))
sns.countplot(data=raw_data, x='occupation', hue='income', ax=ax_3b)
ax_3b.set_title(
    "Income of Individuals versus  Occupation",
    fontsize=18,
    fontweight='bold',
)
ax_3b.set_ylabel("Number of Individuals")
ax_3b.legend(fontsize=20)



In [None]:
# User Story 3: income versus (capital-gain, education-num, age)
# Figure 4: Parallel Coordinate Plot

SAMPLES_PER_CLASS = 50  # number of lines drawn per income class
SAMPLE_SEED = 42        # fixed seed so the sampled rows are the same on every run

# Re-read the file so this cell works from the raw columns only; rows with a
# missing value ('?') are dropped.
df = pd.read_csv(
    income_data_file,
    sep=", ",
    na_values=['?', ' ?'],
    engine='python',
    names=["age", "workclass", "fnlwgt", "education", "education-num",
           "marital-status", "occupation", "relationship", "race", "sex",
           "capital-gain", "capital-loss", "hours-per-week", "native-country",
           "income"],
).dropna()
below_50K_df = df[df["income"] == "<=50K"]
above_50K_df = df[df["income"] == ">50K"]

# Stack the two classes and give the result a fresh 0..n-1 index so the scaled
# frame built below lines up with it row for row.
df = pd.concat([above_50K_df, below_50K_df]).reset_index(drop=True)
df['class'] = (df["income"] == ">50K").astype(int)

# Min-max scale the plotted features (and the 0/1 class flag) to [0, 1].
# The scaled frame reuses the SAME column list that was used for selection, so
# every column keeps the name of the feature it actually holds.
scaled_columns = ['education-num', 'age', 'capital-gain', 'fnlwgt', 'class']
np_array = MinMaxScaler().fit_transform(df[scaled_columns].values)
data_df = pd.DataFrame(np_array, columns=scaled_columns, index=df.index)
data_df['income'] = df['income']

# Draw the same number of rows from each class (capped at the class size) so
# neither class dominates the plot.
below_rows = data_df[data_df["class"] == 0.0]
above_rows = data_df[data_df["class"] == 1.0]
data_df_below_50K = below_rows.sample(
    n=min(SAMPLES_PER_CLASS, len(below_rows)), random_state=SAMPLE_SEED)
data_df_above_50K = above_rows.sample(
    n=min(SAMPLES_PER_CLASS, len(above_rows)), random_state=SAMPLE_SEED)
data_df = pd.concat([data_df_below_50K, data_df_above_50K])

fig_4, ax_4 = plt.subplots()
# Colours follow the order in which classes appear in data_df:
# '<=50K' rows come first (red), '>50K' rows second (gold).
pd.plotting.parallel_coordinates(
    data_df,
    'income',
    cols=['education-num', 'age', 'capital-gain', 'fnlwgt'],
    color=('#FF0000', '#FFD700'),
    ax=ax_4,
)
ax_4.set_title("Min-max scaled features by income class")
plt.show()

# Figure 3(c): mosaic plot of marital status against income class.
fig_handle, coordinate = plt.subplots(figsize=(20, 15))
fig_handle.subplots_adjust(hspace=0.5)
mosaic(
    data=df,
    index=['marital-status', 'income'],
    ax=coordinate,
    axes_label=False,
)
plt.show()


# Figure 3(d): mosaic plot of occupation against income class.
fig_handle, coordinate = plt.subplots(figsize=(20, 15))
fig_handle.subplots_adjust(hspace=0.5)
mosaic(
    data=df,
    index=['occupation', 'income'],
    ax=coordinate,
    axes_label=False,
)
plt.show()


In [None]:
# User Story 4: scatter matrix of capital-gain, age and education-num
# for individuals with income > 50K.
df_above = above_50K_df[['capital-gain', 'age', 'education-num']]

# scatter_matrix needs one Axes per pair of columns (a 3x3 grid here), so let it
# create its own figure via figsize. Handing it a single pre-made Axes only makes
# pandas warn and clear that figure, and scatter_matrix resets the subplot
# spacing itself, so a prior subplots_adjust call has no effect.
scatter_axes = pd.plotting.scatter_matrix(df_above, figsize=(15, 11))
fig_handle = scatter_axes[0, 0].get_figure()
fig_handle.suptitle("Income > 50K: capital-gain, age and education-num")
plt.show()


In [None]:
# User Story 5: scatter matrix of capital-gain, age and hours-per-week
# for individuals with income > 50K.
df_above = above_50K_df[['capital-gain', 'age', 'hours-per-week']]

# scatter_matrix needs one Axes per pair of columns (a 3x3 grid here), so let it
# create its own figure via figsize. Handing it a single pre-made Axes only makes
# pandas warn and clear that figure, and scatter_matrix resets the subplot
# spacing itself, so a prior subplots_adjust call has no effect.
scatter_axes = pd.plotting.scatter_matrix(df_above, figsize=(15, 11))
fig_handle = scatter_axes[0, 0].get_figure()
fig_handle.suptitle("Income > 50K: capital-gain, age and hours-per-week")
plt.show()
