# Imports and Constants

In [None]:
import sys

# Put the parent of the working directory on the import path exactly once,
# so re-running this cell does not keep appending duplicate entries.
# (Presumably this is what lets the `helpers` package be imported below.)
if sys.path.count('..') == 0:
    sys.path.append('..')

In [None]:
from pathlib import Path
import pickle

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from helpers.paths import DATA_DIR

# Location of the pickled dataset that the "Collect Data" cell loads.
# NOTE(review): presumably produced by an earlier cleaning step - confirm.
CLEAN_DATA_PATH = DATA_DIR / 'clean_data.pkl'

# Collect Data

In [None]:
# Deserialize the dataset from disk.
# NOTE: unpickling can execute arbitrary code, so only load files that come
# from a trusted source.
with CLEAN_DATA_PATH.open('rb') as source:
    data = pickle.load(source)

# Quick sanity check of what was loaded: dimensions first, then column labels.
print(data.shape)
print('columns', data.columns)

# Analysis

## Constrict Data
Due to time constraints, the analysis in this notebook focuses on a specific subset of the data: **white, male, and from the United-States**.

In [None]:
# Build one boolean mask per criterion, then keep the rows matching all three.
is_white = data['race'] == 'White'
is_from_us = data['country'] == 'United-States'
is_male = data['sex'] == 'Male'
focus_data = data[is_white & is_from_us & is_male]

# These columns are constant within the subgroup, so they carry no information.
focus_data = focus_data.drop(columns=['race', 'sex', 'country'])

# Share of the full dataset that the subgroup accounts for, in percent.
subgroup_share = focus_data.shape[0] / data.shape[0] * 100
print("Percentage of data this subgroup represents: {}%".format(subgroup_share))
focus_data.head()

## Pairplot 
- This output shows many feature comparisons all at once for initial learnings
- Here, we can visualize the distribution of all features
- The capital feature appears to have some extreme outliers that warrant further examination.
- The plot also makes it easy to see the number of hours worked per week drop off as individuals pass retirement age.

In [None]:
# Grid of pairwise plots for the focus subset: one panel per pair of
# variables, with each variable's own distribution on the diagonal.
sns.pairplot(data=focus_data)

## Correlations
- Plot from: https://seaborn.pydata.org/examples/many_pairwise_correlations.html
- No negative correlations stood out
- A strong positive correlation was found between the number of years of schooling completed and whether an individual earned over 50k a year
- Age did not seem to be tied to number of years of school completed, likely because all participants were >= 17 years old
- The strong correlations between `over_50k` and the other attributes make it a strong candidate for an AutoML model

In [None]:
# Compute the pairwise correlation matrix of the focus subset
corr = focus_data.corr()

# The matrix is symmetric, so hide the upper triangle (diagonal included)
# and show each pair of features only once.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap on the axes created above, with the mask and a square
# aspect ratio. The color scale is centered on 0 and saturates at 0.3, so
# every correlation above 0.3 is drawn at full intensity.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
ax.set_title("Correlation of white male Americans features")

# Render the figure. The original called plt.plot(), which draws nothing and
# only returns an empty list.
plt.show()