In [2]:
import pandas as pd

Let's load a dataset of demographic data extracted from the 1994 Census database.

In [3]:
# Parse the census extract (path is relative to the notebook) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='adult.data.csv')

Let's inspect the data.

In [32]:
# Peek at the last five rows to confirm the file was read through to the end.
df.tail(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


Does any column contain missing (NaN) values?

In [6]:
# One boolean per column: True if that column holds at least one missing value.
df.isna().any(axis=0)

age               False
workclass         False
fnlwgt            False
education         False
education-num     False
marital-status    False
occupation        False
relationship      False
race              False
sex               False
capital-gain      False
capital-loss      False
hours-per-week    False
native-country    False
salary            False
dtype: bool

What data types are we working with?

In [24]:
# Show the dtype pandas inferred for each column when the CSV was parsed.
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
salary            object
dtype: object

### How many of each race are represented in this dataset?

In [12]:
# Number of rows per distinct `race` value, ordered from most to least frequent.
race_count = df.loc[:, 'race'].value_counts(sort=True, ascending=False)
race_count

White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: race, dtype: int64

### What is the average age of men?

In [96]:
# Mean of `age` over the rows where `sex` is 'Male', rounded to one decimal place.
is_male = df['sex'].eq('Male')
average_age_men = df.loc[is_male, 'age'].mean().round(1)
average_age_men

39.4

### What is the percentage of people who have a Bachelor's degree?

In [28]:
# Share of rows whose `education` is exactly 'Bachelors'.
# The value is a fraction in [0, 1]; it is not scaled to 100.
percentage_bachelors = df['education'].eq('Bachelors').sum() / len(df)
percentage_bachelors

0.16446055096587942

### What is the percentage of people with advanced education (`Bachelors`, `Masters`, or `Doctorate`)?

In [62]:
# Rows whose `education` is one of the three advanced-degree labels, plus their
# count; the final expression is that count as a fraction of all rows.
higher_education = df[df['education'].isin(['Bachelors', 'Masters', 'Doctorate'])]
num_higher_education = len(higher_education)
num_higher_education / len(df)

0.23006050182733945

### What is the percentage of people **without** advanced education?

In [63]:
# Complement of the advanced-education selection: every row whose `education`
# is NOT one of the three advanced-degree labels, and its share of all rows.
lower_education = df[~df['education'].isin(['Bachelors', 'Masters', 'Doctorate'])]
num_lower_education = len(lower_education)
num_lower_education / len(df)

0.7699394981726605

### What percentage of people with advanced education (`Bachelors`, `Masters`, or `Doctorate`) make more than 50K?

In [127]:
# Fraction (in [0, 1]) of the advanced-education rows whose salary is '>50K'.
# Multiply by 100 if a percentage figure is wanted.
higher_education_rich = (
    len(higher_education[higher_education['salary'] == '>50K']) / num_higher_education
)
higher_education_rich

0.46535843011613937

### What percentage of people without advanced education make more than 50K?

In [67]:
# Fraction (in [0, 1]) of the rows without advanced education whose salary is '>50K'.
# The mask is built from `lower_education` itself, so its index matches the frame
# being filtered. A mask taken from the full `df` only works through implicit
# index alignment, and pandas raises an IndexingError when the mask cannot be
# aligned to the subset's index (for example after the subset is re-indexed).
lower_education_rich = (
    lower_education.loc[lower_education['salary'] == '>50K'].shape[0] / num_lower_education
)
lower_education_rich

0.173713601914639

### What is the minimum number of hours a person works per week?

In [48]:
# Smallest value recorded in the `hours-per-week` column.
min_work_hours = df.loc[:, 'hours-per-week'].min()
min_work_hours

1

### What percentage of the people who work the minimum number of hours per week have a salary of >50K?

In [55]:
# Rows whose weekly hours equal the minimum found above, and how many there are.
min_workers = df[df['hours-per-week'].eq(min_work_hours)]
num_min_workers = len(min_workers)
num_min_workers

20

In [125]:
# Fraction (in [0, 1]) of the minimum-hours workers whose salary is '>50K'.
# The mask comes from `min_workers` itself rather than the full `df`, so the
# selection does not depend on pandas aligning a longer mask to the subset's index.
rich_percentage = min_workers.loc[min_workers['salary'] == '>50K'].shape[0] / num_min_workers
rich_percentage

0.1

### What country has the highest percentage of people that earn >50K?

In [120]:
# Fraction of >50K earners per native country: the count of '>50K' rows divided
# by the total row count for that country. The division aligns on the country
# label; a country with no '>50K' rows has no numerator entry and comes out NaN.
earners_by_country = df.loc[df['salary'] == '>50K', 'native-country'].value_counts()
people_by_country = df['native-country'].value_counts()

# sort_values returns a new Series (NaN entries are placed last), so no in-place
# mutation is needed and the top country ends up at position 0.
highest_earning_countries = (earners_by_country / people_by_country).sort_values(ascending=False)
highest_earning_country = highest_earning_countries.index[0]
highest_earning_country

'Iran'

In [126]:
# Value for the top-ranked country, as a fraction in [0, 1].
# The Series is indexed by country name, so the first element must be taken by
# position with .iloc; `series[0]` on a non-integer index is a deprecated
# positional fallback in pandas 2.x.
highest_earning_country_percentage = highest_earning_countries.iloc[0]
highest_earning_country_percentage

0.4186046511627907

### Identify the most popular occupation for those who earn >50K in India.

In [95]:
# Occupation counts among rows with native-country 'India' and salary '>50K';
# value_counts orders them most frequent first, so index[0] is the top one.
top_IN_occupations = (
    df.loc[df['native-country'].eq('India') & df['salary'].eq('>50K'), 'occupation']
    .value_counts()
)
top_IN_occupation = top_IN_occupations.index[0]
top_IN_occupation

'Prof-specialty'