In [71]:
import pandas as pd

In [72]:
# Location of the dataset CSV; the path is relative, so it is resolved
# against the directory the notebook kernel is running in.
DATA_PATH = "../datasets/adult.data.csv"
df = pd.read_csv(DATA_PATH)

In [73]:
# Show the column labels of the loaded DataFrame.
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [74]:
# Preview the dataset: `head()` with no argument shows the first 5 rows.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [75]:
# Count how many people of each race are represented in the dataset.
# `value_counts()` returns one count per distinct value, most frequent first.
race_count = df['race'].value_counts()
race_count

White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: race, dtype: int64

In [76]:
# Arithmetic mean of the ages of all men in the dataset.
is_male = df['sex'] == 'Male'
average_age_men = df.loc[is_male, 'age'].mean()
average_age_men

39.43354749885268

In [77]:
# Percentage of people whose education level is 'Bachelors'.
# The denominator is the total number of rows; per the original note this
# dataset has no missing values, so that equals the number of counted
# 'education' entries.
education_counts = df['education'].value_counts()
total_people = len(df)
bachelors_count = education_counts['Bachelors']
percentage_bachelors = (bachelors_count / total_people) * 100
percentage_bachelors

16.44605509658794

In [79]:
# Build a mask selecting people with advanced education
# (Bachelors, Masters, or Doctorate).
ADVANCED_DEGREES = ['Bachelors', 'Masters', 'Doctorate']
mask = df['education'].isin(ADVANCED_DEGREES)

# People with advanced education.
higher_education = df[mask]

# People without advanced education.
lower_education = df[~mask]

# The mean of a boolean Series is the fraction of True values, so
# `(salary == '>50K').mean() * 100` is the percentage earning more than 50K.
# Unlike dividing two row counts, it yields NaN (rather than raising
# ZeroDivisionError) if a group happens to be empty.

# Percentage of people with advanced education who make more than 50K,
# rounded to one decimal place.
higher_education_rich = round((higher_education['salary'] == '>50K').mean() * 100, 1)

# Percentage of people without advanced education who make more than 50K,
# rounded to one decimal place.
lower_education_rich = round((lower_education['salary'] == '>50K').mean() * 100, 1)

print(f"Percentage of people with advanced education make more than 50K : {higher_education_rich:.2f}")
print(f"Percentage of people without advanced education make more than 50K : {lower_education_rich:.2f}")

Percentage of people with advanced education make more than 50K : 46.50
Percentage of people without advanced education make more than 50K : 17.40


In [18]:
# Smallest value found in the 'hours-per-week' column.
hours_per_week = df['hours-per-week']
min_work_hours = hours_per_week.min()
print(f"Minimum number of hours a person works per week is : {min_work_hours} hour(s)")

Minimum number of hours a person works per week is : 1 hour(s)


In [81]:
# Percentage of the people working the minimum number of hours per week
# who have a salary of more than 50K.
weekly_hours = df['hours-per-week']
works_min_hours = weekly_hours == weekly_hours.min()

# Despite its name, this holds the rows of those workers, not a count.
num_min_workers = df[works_min_hours]
rich_min_workers = num_min_workers[num_min_workers['salary'] == '>50K']

rich_percentage = (len(rich_min_workers) / len(num_min_workers)) * 100
message = (
    "Percentage of the people who work the minimum number of hours per week "
    "have a salary of more than 50K : "
    f"{rich_percentage:.2f}\n"
)
print(message)

Percentage of the people who work the minimum number of hours per week have a salary of more than 50K : 10.00



In [68]:
# Which country has the highest percentage of people that earn >50K?
# The per-country share is computed once so that the reported country and
# its percentage are guaranteed to come from the same Series.
rich_by_country = df.loc[df['salary'] == '>50K', 'native-country'].value_counts()
people_by_country = df['native-country'].value_counts()

# The division aligns on country name; a country with no >50K earners is
# absent from the numerator and becomes NaN, which idxmax()/max() skip.
rich_share_by_country = rich_by_country / people_by_country

highest_earning_country = rich_share_by_country.idxmax()
highest_earning_country_percentage = rich_share_by_country.max() * 100
print(
    f"The country has the highest percentage of people that earn >50K is "
    f"'{highest_earning_country}' with {highest_earning_country_percentage:.2f} percent\n"
)

The country has the highest percentage of people that earn >50K is 'Iran' with 41.86 percent


In [70]:
# Most common occupation among people from India who earn >50K.
earns_over_50k = df['salary'] == '>50K'
from_india = df['native-country'] == 'India'
india_rich_occupations = df.loc[earns_over_50k & from_india, 'occupation']
top_IN_occupation = india_rich_occupations.value_counts().idxmax()
top_IN_occupation

'Prof-specialty'