1. How many people of each race are represented in this dataset? This should be a Pandas series with race names as the index labels. (race column)
2. What is the average age of men?
3. What is the percentage of people who have a Bachelor's degree?
4. What percentage of people with advanced education (Bachelors, Masters, or Doctorate) make more than 50K?
5. What percentage of people without advanced education make more than 50K?
6. What is the minimum number of hours a person works per week?
7. What percentage of the people who work the minimum number of hours per week have a salary of more than 50K?
8. What country has the highest percentage of people that earn >50K and what is that percentage?
9. Identify the most popular occupation for those who earn >50K in India.

Data from: https://archive.ics.uci.edu/dataset/20/census+income

In [16]:
import numpy as np
import pandas as pd

adultData = '/census+income/adult.data'
colHeader = ['age', 'workclass', 'fnlwgt', "education", "education_num", "marital_status", "occupation", "relationship", "race", "sex", "captial_gain", "capital_loss", "hours_per_week", "native_country", "label"]

In [35]:
# The following gives the rows with None, there are no such rows.
df[df.isna().any(axis=1)]

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,captial_gain,capital_loss,hours_per_week,native_country,label


In [111]:
def calculate_demographic_data(filePath, headers,print_data=True):
    # Read data from file
    df = pd.read_csv(filePath, sep=',', names=headers)

    # How many of each race are represented in this dataset? This should be a Pandas series with race names as the index labels.
    race_count = pd.Series(df['race'].unique())

    # What is the average age of men?
    average_age_men = np.average(df[df['sex']==' Male']['age'])

    # What is the percentage of people who have a Bachelor's degree?
    percentage_bachelors = len(df[df['education']==' Bachelors'].index) / len(df.index) * 100

    # What percentage of people with advanced education (`Bachelors`, `Masters`, or `Doctorate`) make more than 50K?
    advancedEd = df[(df['education']==' Bachelors') | (df['education']==' Masters') | (df['education']==' Doctorate')]
    advPerc = len(advancedEd[advancedEd['label']==' >50K']) /  len(df) * 100
    
    # What percentage of people without advanced education make more than 50K?
    nonAdv = df[(df['education']!=' Bachelors') & (df['education']!=' Masters') & (df['education']!=' Doctorate')] 
    nonAdvPerc = len(nonAdv[nonAdv['label']==' >50K']) / len(df) *  100
    
    # with and without `Bachelors`, `Masters`, or `Doctorate`
    higher_education = advancedEd
    lower_education = nonAdv

    # percentage with salary >50K
    higher_education_rich = advPerc
    lower_education_rich = nonAdvPerc

    # What is the minimum number of hours a person works per week (hours-per-week feature)?
    min_work_hours = df['hours_per_week'].min()

    # What percentage of the people who work the minimum number of hours per week have a salary of >50K?
    num_min_workers = df[df['hours_per_week']==min_work_hours]
    
    rich_percentage = len(workersMinHours[workersMinHours['label']==' >50K']) / len(df) * 100
    
    # What country has the highest percentage of people that earn >50K?
    salary50 = df[df['label']==' >50K']
    country50 = salary50['native_country'].value_counts()
    highest_earning_country = country50.idxmax()
    highest_earning_country_percentage = country50.iloc[np.argmax(country50)] / len(df) *  100

    # Identify the most popular occupation for those who earn >50K in India
    india50 = df[(df['native_country']==' India') & (df['label']==' >50K')]
    occupations = india50['occupation'].value_counts()
    top_IN_occupation = occupations.idxmax()

    # DO NOT MODIFY BELOW THIS LINE

    if print_data:
        print("Number of each race:\n", race_count) 
        print("Average age of men:", average_age_men)
        print(f"Percentage with Bachelors degrees: {percentage_bachelors}%")
        print(f"Percentage with higher education that earn >50K: {higher_education_rich}%")
        print(f"Percentage without higher education that earn >50K: {lower_education_rich}%")
        print(f"Min work time: {min_work_hours} hours/week")
        print(f"Percentage of rich among those who work fewest hours: {rich_percentage}%")
        print("Country with highest percentage of rich:", highest_earning_country)
        print(f"Highest percentage of rich people in country: {highest_earning_country_percentage}%")
        print("Top occupations in India:", top_IN_occupation)

    return {
        'race_count': race_count,
        'average_age_men': average_age_men,
        'percentage_bachelors': percentage_bachelors,
        'higher_education_rich': higher_education_rich,
        'lower_education_rich': lower_education_rich,
        'min_work_hours': min_work_hours,
        'rich_percentage': rich_percentage,
        'highest_earning_country': highest_earning_country,
        'highest_earning_country_percentage':
        highest_earning_country_percentage,
        'top_IN_occupation': top_IN_occupation
    }


In [112]:
calculate_demographic_data(adultData, colHeader)

Number of each race:
 0                  White
1                  Black
2     Asian-Pac-Islander
3     Amer-Indian-Eskimo
4                  Other
dtype: object
Average age of men: 39.43354749885268
Percentage with Bachelors degrees: 16.44605509658794%
Percentage with higher education that earn >50K: 10.706059396210192%
Percentage without higher education that earn >50K: 13.374896348392248%
Min work time: 1 hours/week
Percentage of rich among those who work fewest hours: 0.006142317496391388%
Country with highest percentage of rich:  United-States
Highest percentage of rich people in country: 22.023279383311323%
Top occupations in India:  Prof-specialty


{'race_count': 0                  White
 1                  Black
 2     Asian-Pac-Islander
 3     Amer-Indian-Eskimo
 4                  Other
 dtype: object,
 'average_age_men': 39.43354749885268,
 'percentage_bachelors': 16.44605509658794,
 'higher_education_rich': 10.706059396210192,
 'lower_education_rich': 13.374896348392248,
 'min_work_hours': 1,
 'rich_percentage': 0.006142317496391388,
 'highest_earning_country': ' United-States',
 'highest_earning_country_percentage': 22.023279383311323,
 'top_IN_occupation': ' Prof-specialty'}

In [None]:
# Used for testing
# df = pd.read_csv(adultData, sep=',', names=colHeader)
# df
# type(df['race'].unique())
# pd.Series(df['race'].unique())
# np.average?
# np.average(df[df['sex']==' Male']['age'])
# len(df[df['education']==' Bachelors'].index) / len(df.index) * 100
# advancedEd = df[(df['education']==' Bachelors') | (df['education']==' Masters') | (df['education']==' Doctorate')]
# len(advancedEd[advancedEd['label']==' >50K'].index)
# len(df)
# df['education'].unique()
# df['label'].unique()

# nonAdv = df[(df['education']!=' Bachelors') & (df['education']!=' Masters') & (df['education']!=' Doctorate')] 
# nonAdv[nonAdv['label']==' >50K']

# minHours = df['hours_per_week'].min()
# workersMinHours = df[df['hours_per_week']==minHours]
# workersMinHours[workersMinHours['label']==' >50K']

# salary50 = df[df['label']==' >50K']
# country50 = salary50['native_country'].value_counts()
# country50
# country50.idxmax()
# country50[np.argmax(country50)]

# india50 = df[(df['native_country']==' India') & (df['label']==' >50K')]
# india50['occupation'].value_counts()

# min_work_hours = df['hours_per_week'].min()
# num_min_workers = df[df['hours_per_week']==min_work_hours]
# rich_percentage = len(workersMinHours[workersMinHours['label']==' >50K']) / len(df) * 100
# rich_percentage