In [None]:
import pandas as pd
import numpy  as np
import re
import matplotlib.pyplot as plt

%matplotlib inline

### Description
We will be using the Adult dataset, which has the following attributes (columns):

* age: continuous.
* workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
* fnlwgt: continuous.
* education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
* education-num: continuous.
* marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
* occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, 
Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
* relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
* race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
* sex: Female, Male.
* capital-gain: continuous.
* capital-loss: continuous.
* hours-per-week: continuous.
* native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, 
Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.

### Load the data:

In [None]:
# The UCI "adult.data" file has no header row and separates fields with a comma
# followed by a space. A multi-character `sep` is interpreted as a regular
# expression, which forces pandas off the fast C parser (with a ParserWarning);
# splitting on ',' and skipping the leading blank yields the same values.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    header=None,
    sep=',',
    skipinitialspace=True,
    names=['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation',
           'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week',
           'native_country', 'income'],
)
# Number of rows loaded.
len(df)

Let's have a look at the data:

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Summary statistics of the frame's columns.
df.describe()

### Are there any missing data?

In [None]:
# Per-column count of missing entries. Besides real nulls, this data uses the
# literal string '?' as an "unknown" placeholder (the native_country analysis
# further down filters it out), and isnull() alone would not see those.
(df.isnull() | df.isin(['?'])).sum()

### What is the age demographic in this data? Plot it.

In [None]:
# Histogram of age with as many bins as there are distinct age values.
df['age'].hist(bins=df['age'].unique().size, grid=False)

### How many types of education are in this data set?

In [None]:
# Distinct values found in the education column.
df['education'].unique()

### What is the average age of women?

In [None]:
# Mean age over the rows where sex == 'Female'. One boolean filter plus the
# vectorised Series.mean() replaces a Python-level sum()/len() over two
# separately filtered copies of the frame.
df.loc[df.sex == 'Female', 'age'].mean()

### What countries are these employees from?

In [None]:
# Ten most frequent native_country values with their row counts.
df['native_country'].value_counts().iloc[:10]

### What is the percentage of United-States citizens?

In [None]:
# Rows whose native_country is 'United-States', as a percentage of all rows.
100.0 * df['native_country'].value_counts()['United-States'] / len(df)

### Which are the bottom 10 represented countries in the dataset?

In [None]:
# Ignore rows whose country is the '?' placeholder.
df_cntry_filtered = df[df['native_country'] != '?']

# Each country's share of the remaining rows, in percent, largest first.
cntry_prop = (
    100.0 * df_cntry_filtered['native_country'].value_counts() / len(df_cntry_filtered)
).sort_values(ascending=False)

# The last ten entries of the descending series are the least represented countries.
cntry_prop.iloc[-10:].plot(kind='bar')

### Map and Apply

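Difference between `map` and `apply`: `Series.map` transforms a single column one element at a time, while `DataFrame.apply` passes each whole column (or each row, with `axis=1`) to the function.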

In [None]:
def married_only(x):
    """Return True when the marital-status label `x` begins with 'Married'."""
    prefix = 'Married'
    return x.startswith(prefix)
    
# Series.map calls married_only on every marital_status value, producing the
# boolean mask used to keep only the matching rows.
married = df.loc[df['marital_status'].map(married_only)]
married.head()

In [None]:
# Keep only the columns whose name matches the 'capital_' prefix pattern.
capital = df.loc[:, [name for name in df.columns if re.search('^capital_.*$', name)]]
capital.head()

In [None]:
# DataFrame.apply hands each column (axis=0) to the function: one total per column.
capital.apply(sum, axis=0)

In [None]:
# np.sqrt is applied to each column (axis=0); show the first rows of the result.
capital.apply(np.sqrt, axis=0).head()

In [None]:
# Subtract 1 from every value, one column (axis=0) at a time.
capital.apply(lambda column: column - 1, axis=0).head()

### Top 10 countries where more than 5% of the women work more than 40 hours per week

In [None]:
# Women only; rows whose country is the '?' placeholder are left out because
# '?' is not a country (the bottom-10 countries cell applies the same filter).
females_in_df = df[(df.sex == 'Female') & (df.native_country != '?')]
# Subset of those women who work more than 40 hours per week.
females_in_df_bad = females_in_df[females_in_df['hours_per_week'] > 40]

# Row counts per country: all women, and women working more than 40 hours.
females_per_cnt     = females_in_df['native_country'].value_counts()
females_per_cnt_bad = females_in_df_bad['native_country'].value_counts()

# Percentage of women per country working more than 40 hours. The division
# aligns the two series on the country label; countries with no such women
# come out as NaN and are dropped. (The previous loop referenced
# `females_in_df_bad_cnt` / `females_in_df_cnt`, names that are never defined,
# so the cell failed with NameError on a fresh kernel.)
temp_ser = (100.0 * females_per_cnt_bad / females_per_cnt).dropna()

# Keep countries above 5% and plot the ten highest.
temp_ser[temp_ser > 5].sort_values(ascending = False).head(10).plot(kind = 'bar')


### Do married Females make less than unmarried Females?

In [None]:
# Income-bracket counts for women, broken down by marital status.
bsc = df[df.sex == 'Female']

unq_marital_status = bsc.marital_status.unique()

# crosstab counts every (marital_status, income) pair in one pass. Reindexing
# with fill_value=0 keeps the rows in first-appearance order and guarantees both
# income columns exist as integer counts, even for a status with nobody in one
# bracket. The earlier per-status `value_counts()[[...]]` lookup raises KeyError
# in current pandas when a bracket label is absent, and needed a NaN -> 0
# clean-up pass through the deprecated `applymap`.
status_inc_df = pd.crosstab(bsc.marital_status, bsc.income).reindex(
    index=unq_marital_status, columns=['<=50K', '>50K'], fill_value=0
)

status_inc_df.plot(kind = 'barh'); plt.ylabel('marital_status')

### Plot the proportion of people making more than $50k a year by age 

In [None]:
# Sorted distinct ages present in the data.
unq_age = np.unique(df.age)

# Count of people per (age, income bracket). Reindexing with fill_value=0
# guarantees both bracket columns exist and that an age with nobody in a bracket
# gets a 0 instead of failing: looking up a missing label with
# `value_counts()[['<=50K', '>50K']]` raises KeyError in current pandas.
age_inc_df = pd.crosstab(df.age, df.income).reindex(
    index=unq_age, columns=['<=50K', '>50K'], fill_value=0
)

# Turn each row of counts into proportions that sum to 1 for that age.
age_inc_df_prop = age_inc_df.div(age_inc_df.sum(axis=1), axis=0)

# Share of people at each age earning more than 50K.
age_inc_df_prop['>50K'].plot(); plt.xlabel('age')