In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Data Ingest: Read from BigQuery

In [None]:
# Full-table pull of the census income data from BigQuery into a DataFrame.
QUERY = 'SELECT * FROM `data-describe.census_income.adult_data`'

df = pd.read_gbq(query=QUERY)

In [None]:
# Peek at the first five rows of the frame loaded from BigQuery.
df.head(n=5)

## Data Ingest: Read From Cloud Storage

In [None]:
# Alternative ingest path: this rebinds `df`, replacing the frame that was
# loaded from BigQuery earlier in the notebook.
df = pd.read_csv(
    filepath_or_buffer='gs://amazing-public-data/census_income/census_income_data_adult.data'
)

In [None]:
# Peek at the first five rows of the frame loaded from Cloud Storage.
df.head(n=5)

Check no. of rows and columns

In [None]:
# Displays the frame's dimensions as a (rows, columns) tuple.
df.shape

Recode the target column as a binary flag for whether income is **more** than 50K, i.e. 1 for Yes (`>50K`), 0 for No (`<=50K`)

In [None]:
# Encode the target as a binary flag: 0 for ' <=50K', 1 for ' >50K'
# (the raw labels are matched with their leading space).
# The 0 -> 0 and 1 -> 1 entries make the cell safe to re-run: without them a
# second execution would map the already-encoded values to NaN and silently
# wipe the target column. Any other value still becomes NaN, as before.
income_codes = {' <=50K': 0, ' >50K': 1, 0: 0, 1: 1}
df['income'] = df['income'].map(income_codes)

Check cardinality of different columns, i.e. how many distinct values are there?

In [None]:
# Class balance of the target: normalize=True returns each value's share of
# the rows instead of a raw count.
df['income'].value_counts(normalize=True)

In [None]:
# Row count per distinct workclass value.
# NOTE(review): presumably still includes the ' ?' placeholder at this point,
# since it is only replaced with NaN in a later cell.
df['workclass'].value_counts()

In [None]:
# Row count per distinct education value.
df['education'].value_counts()

In [None]:
# Row count per distinct marital-status value.
df['marital-status'].value_counts()

In [None]:
# Row count per distinct occupation value.
# NOTE(review): presumably still includes the ' ?' placeholder at this point,
# since it is only replaced with NaN in a later cell.
df['occupation'].value_counts()

In [None]:
# Row count per distinct relationship value.
df['relationship'].value_counts()

In [None]:
# Row count per distinct race value.
df['race'].value_counts()

In [None]:
# Row count per distinct sex value.
df['sex'].value_counts()

In [None]:
# Row count per distinct native-country value.
# NOTE(review): presumably still includes the ' ?' placeholder at this point,
# since it is only replaced with NaN in a later cell.
df['native-country'].value_counts()

Check **correlations** (between numerical columns)

In [None]:
# Correlation heatmap of the numeric columns.
# The frame is narrowed to numeric dtypes before calling corr(): it still
# holds string columns (workclass, education, ...), and pandas >= 2.0 raises
# on them instead of silently dropping them as older versions did.
numeric_corr = df.select_dtypes(include='number').corr()

fig, ax = plt.subplots(figsize=(8, 8))
ax = sns.heatmap(numeric_corr,
                 annot=True,       # print the coefficient inside each cell
                 ax=ax,
                 cmap="RdBu_r",
                 vmin=-1,          # pin the colour scale to the full
                 vmax=1)           # correlation range [-1, 1]

We see there are many corrupt values recorded as '?'; we will replace them with null values

In [None]:
# ' ?' (with the leading space) is the placeholder being treated as missing;
# swap it for NaN in place so pandas' null handling applies to it.
df.replace(to_replace=' ?', value=np.nan, inplace=True)

Check basic stats of numerical columns e.g. average, standard deviation, quartile values etc.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the frame.
df.describe()

Check percentage of null values

In [None]:
# Percentage of missing entries, listed only for columns that have any.
null_counts = df.isnull().sum()
columns_with_nulls = null_counts[null_counts > 0]
100 * columns_with_nulls / df.shape[0]

Impute missing values with mode i.e. the most frequent value in that column

In [None]:
# Impute each column's missing entries with that column's most frequent value.
# The result is assigned back to df[column]: calling fillna(inplace=True) on
# df[column] is chained assignment, which warns on pandas 2.x and leaves df
# unchanged once Copy-on-Write is enabled (the default from pandas 3.0).
for column in ('workclass', 'occupation', 'native-country'):
    most_frequent = df[column].mode().values[0]  # first entry if modes tie
    df[column] = df[column].fillna(most_frequent)

Generate pair-wise scatter plots i.e. of any two columns at a time

In [None]:
# Grid of pair-wise plots over the frame's columns.
sns.pairplot(data=df)

Visualize and see roughly how many people have income below (and more than) 50K

In [None]:
# Number of records in each income class.
ax = sns.countplot(x='income', hue='income', data=df)

# The first return value holds the legend handles; the auto-generated labels
# are discarded and replaced positionally.
# NOTE(review): assumes the hue levels come out ordered as 0 then 1 - confirm.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles,
          labels=['<=50K', '>50K'],
          bbox_to_anchor=(1, 1))

Check impact of work category on income bracket

In [None]:
# Records per work class, split by income class.
ax = sns.countplot(x='workclass', hue='income', data=df)

# Relabel the legend positionally (handles kept, auto labels discarded).
# NOTE(review): assumes the hue levels come out ordered as 0 then 1 - confirm.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles,
          labels=['<=50K', '>50K'],
          bbox_to_anchor=(1, 1))

plt.xticks(rotation=90)  # vertical tick labels
plt.show()

Check impact of education on income bracket

In [None]:
# Records per education level, split by income class.
ax = sns.countplot(x='education', hue='income', data=df)

# Relabel the legend positionally (handles kept, auto labels discarded).
# NOTE(review): assumes the hue levels come out ordered as 0 then 1 - confirm.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles,
          labels=['<=50K', '>50K'],
          bbox_to_anchor=(1, 1))

plt.xticks(rotation=90)  # vertical tick labels
plt.show()

Check impact of marital status on income bracket

In [None]:
# Records per marital status, split by income class.
ax = sns.countplot(x='marital-status', hue='income', data=df)

# Relabel the legend positionally (handles kept, auto labels discarded).
# NOTE(review): assumes the hue levels come out ordered as 0 then 1 - confirm.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles,
          labels=['<=50K', '>50K'],
          bbox_to_anchor=(1, 1))

plt.xticks(rotation=90)  # vertical tick labels
plt.show()

Check impact of occupation on income bracket

In [None]:
# Records per occupation, split by income class.
ax = sns.countplot(x='occupation', hue='income', data=df)

# Relabel the legend positionally (handles kept, auto labels discarded).
# NOTE(review): assumes the hue levels come out ordered as 0 then 1 - confirm.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles,
          labels=['<=50K', '>50K'],
          bbox_to_anchor=(1, 1))

plt.xticks(rotation=90)  # vertical tick labels
plt.show()

Check impact of race on income bracket

In [None]:
# Records per race, split by income class.
ax = sns.countplot(x='race', hue='income', data=df)

# Relabel the legend positionally (handles kept, auto labels discarded).
# NOTE(review): assumes the hue levels come out ordered as 0 then 1 - confirm.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles,
          labels=['<=50K', '>50K'],
          bbox_to_anchor=(1, 1))

plt.xticks(rotation=90)  # vertical tick labels
plt.show()

Check impact of gender on income bracket

In [None]:
# Records per sex, split by income class.
ax = sns.countplot(x='sex', hue='income', data=df)

# Relabel the legend positionally (handles kept, auto labels discarded).
# NOTE(review): assumes the hue levels come out ordered as 0 then 1 - confirm.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles,
          labels=['<=50K', '>50K'],
          bbox_to_anchor=(1, 1))

plt.xticks(rotation=90)  # vertical tick labels
plt.show()

Which country do most people (in the dataset) belong to? And which income bracket do they belong to?

In [None]:
# Records per native country, split by income class, drawn on a wider figure
# than the other count plots.
plt.figure(figsize=(12, 5))
ax = sns.countplot(x='native-country', hue='income', data=df)

# Relabel the legend positionally (handles kept, auto labels discarded).
# NOTE(review): assumes the hue levels come out ordered as 0 then 1 - confirm.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles,
          labels=['<=50K', '>50K'],
          bbox_to_anchor=(1, 1))

plt.xticks(rotation=90)  # vertical tick labels

plt.show()

Generate distribution plots for the following columns: "age", "capital-loss" and "capital-gain"

In [None]:
# Histogram of age across all records; kde=False leaves out the density curve.
sns.displot(data=df, x='age', kde=False)

In [None]:
# Histogram of capital-loss across all records; kde=False leaves out the
# density curve.
sns.displot(data=df, x='capital-loss', kde=False)

In [None]:
# Histogram of capital-gain across all records; kde=False leaves out the
# density curve.
sns.displot(data=df, x='capital-gain', kde=False)

Separate the data points into cases where the income is greater than 50K and cases where it is at most 50K, and then compare distribution plots for the "age" column.

In [None]:
# Split the records by the encoded income flag (1 and 0 respectively).
# Rows whose flag is neither value end up in neither subset.
income_flag = df['income']
gt_50 = df.loc[income_flag.eq(1)]
ls_50 = df.loc[income_flag.eq(0)]

In [None]:
# Age histogram restricted to the income == 1 subset.
sns.displot(data=gt_50, x='age', kde=False)

In [None]:
# Age histogram restricted to the income == 0 subset.
sns.displot(data=ls_50, x='age', kde=False)

In [None]:
# Overlay the age histograms of both income groups on one shared axes.
# sns.displot is figure-level: every call opens a figure of its own, so
# calling it twice after plt.figure() leaves the labelled figure empty and
# draws two unrelated plots. The axes-level sns.histplot draws onto the `ax`
# it is handed, which gives the intended overlay.
fig, ax = plt.subplots(figsize=(9, 6))

sns.histplot(gt_50['age'], kde=False, label='>50K', ax=ax)
sns.histplot(ls_50['age'], kde=False, label='<=50K', ax=ax)

# Set the axis labels after plotting so they are not replaced by the labels
# histplot assigns on its own.
ax.set_xlabel('Age (in Years)', fontsize=18)
ax.set_ylabel('Count', fontsize=18)
ax.legend()

Is there any apparent relationship between the "capital-gain" and "capital-loss" columns?

In [None]:
# Scatter of capital-gain (y) against capital-loss (x), with the points
# coloured by the income flag.
income_grid = sns.FacetGrid(data=df, hue="income", height=5)
income_grid.map(plt.scatter, "capital-loss", "capital-gain")