## Set Up

In [104]:
import pandas as pd

# Location of the census CSV, relative to the notebook's working directory.
path = "./adult.data.csv"
# Parse the file with pandas' default options; every later cell reads from `df`.
df = pd.read_csv(filepath_or_buffer=path)

In [105]:
# List the column labels of the loaded frame (for a DataFrame, keys() is its columns Index).
df.keys()

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

#### Q1: How many people of each race are represented in this dataset? This should be a Pandas series with race names as the index labels. (race column)

In [106]:
# Tally the rows for each race, then order the tally from most to least common.
people_per_race = df.groupby("race")["race"].count()
people_per_race.sort_values(ascending=False)

race
White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: race, dtype: int64

#### Q2: What is the average age of men?

In [11]:
# Mean age within each sex; the "Male" entry of the result answers the question.
age_by_sex = df.groupby("sex")["age"]
age_by_sex.mean()

sex
Female    36.858230
Male      39.433547
Name: age, dtype: float64

#### Q3: What is the percentage of people who have a Bachelor's degree?

In [14]:
# How many rows carry each education label, most frequent first.
df["education"].value_counts()

HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: education, dtype: int64

In [84]:
# Rows whose education is exactly "Bachelors", as a percentage of all rows.
bachelors_rows = df.loc[df["education"] == "Bachelors"]
len(bachelors_rows) / len(df) * 100

16.292451060103822

#### Q4: What percentage of people with advanced education (Bachelors, Masters, or Doctorate) make more than 50K?

In [107]:
# Distinct salary labels present in the data (needed to spell later filters exactly).
df["salary"].unique()

array(['<=50K', '>50K'], dtype=object)

In [130]:
# Share of all rows whose education is Bachelors, Masters or Doctorate.
advanced_degrees = ["Bachelors", "Masters", "Doctorate"]
higher_edu = df.loc[df["education"].isin(advanced_degrees)]
# Kept as its own variable because the following cell divides by it.
higher_edu_count = len(higher_edu)
higher_edu_count / len(df) * 100

23.006050182733944

In [150]:
# Among rows with Bachelors/Masters/Doctorate, the percentage labelled ">50K".
has_advanced_degree = df["education"].isin(["Bachelors", "Masters", "Doctorate"])
earns_over_50k = df["salary"] == ">50K"
higher_edu_rich = df.loc[has_advanced_degree & earns_over_50k]
# The denominator `higher_edu_count` comes from the previous cell.
len(higher_edu_rich) / higher_edu_count * 100

46.535843011613935

#### Q5: What percentage of people without advanced education make more than 50K?

In [152]:
# Rows whose education is NOT Bachelors, Masters or Doctorate, and how many there are.
without_advanced_degree = ~df["education"].isin(["Bachelors", "Masters", "Doctorate"])
lower_edu = df.loc[without_advanced_degree]
# Kept as its own variable because the following cell divides by it.
lower_edu_count = len(lower_edu)
lower_edu_count

25070

In [153]:
# Among rows without Bachelors/Masters/Doctorate, the percentage labelled ">50K".
lacks_advanced_degree = ~df["education"].isin(["Bachelors", "Masters", "Doctorate"])
lower_edu_rich = df.loc[lacks_advanced_degree & (df["salary"] == ">50K")]
# The denominator `lower_edu_count` comes from the previous cell.
len(lower_edu_rich) / lower_edu_count * 100

17.3713601914639

#### Q6: What is the minimum number of hours a person works per week?

In [43]:
# Smallest value in the hours-per-week column; later cells filter on this value.
weekly_hours = df["hours-per-week"]
min_hour_per_week = weekly_hours.min()
print(min_hour_per_week)

1


#### Q7: What percentage of the people who work the minimum number of hours per week have a salary of more than 50K?

In [155]:
# Rows whose hours-per-week equals the minimum found earlier, and how many there are.
works_min_hours = df["hours-per-week"].eq(min_hour_per_week)
work_min_hr = df.loc[works_min_hours]
# Kept as its own variable because the following cell divides by it.
work_min_hr_count = len(work_min_hr)
work_min_hr_count

20

In [156]:
# Of the minimum-hours rows, the percentage whose salary label is ">50K".
df_min_hour_above_50k = work_min_hr.loc[work_min_hr["salary"] == ">50K"]
len(df_min_hour_above_50k) / work_min_hr_count * 100

10.0

#### Q8: What country has the highest percentage of people that earn >50K and what is that percentage?

In [185]:
# Number of people per country (every row has a salary label to count),
# ordered from the most to the least represented country.
ppl_country = (
    df.groupby("native-country")["salary"]
    .count()
    .sort_values(ascending=False)
)

# Number of people earning >50K per country.
# The salary label is spelled ">50K" with a capital K (see df.salary.unique());
# comparing against ">50k" matches no rows at all.
more_than_50k = df[df.salary == ">50K"]
more_than_50k_per_country = (
    more_than_50k.groupby("native-country")["salary"]
    .count()
    .sort_values(ascending=False)
)

# Percentage of >50K earners per country, highest first.
# The division aligns both Series on the country label. A country with no >50K
# earner is absent from the numerator, so it is reindexed in with a count of 0
# to yield 0% instead of NaN.
p = (
    more_than_50k_per_country.reindex(ppl_country.index, fill_value=0)
    / ppl_country
    * 100
).sort_values(ascending=False)

In [186]:
# First five entries of the descending percentage Series, i.e. the top five countries.
p.iloc[:5]

native-country
Iran      41.860465
France    41.379310
India     40.000000
Taiwan    39.215686
Japan     38.709677
Name: salary, dtype: float64

In [178]:
# Show the whole `more_than_50k_per_country` Series (every country, not only the first rows).
more_than_50k_per_country

native-country
United-States         7171
?                      146
Philippines             61
Germany                 44
India                   40
Canada                  39
Mexico                  33
England                 30
Italy                   25
Cuba                    25
Japan                   24
China                   20
Taiwan                  20
Iran                    18
South                   16
Poland                  12
France                  12
Puerto-Rico             12
Jamaica                 10
El-Salvador              9
Greece                   8
Cambodia                 7
Yugoslavia               6
Hong                     6
Ireland                  5
Vietnam                  5
Ecuador                  4
Haiti                    4
Portugal                 4
Scotland                 3
Thailand                 3
Hungary                  3
Guatemala                3
Laos                     2
Dominican-Republic       2
Peru                     2
Trinadad&Toba

#### Q9: Identify the most popular occupation for those who earn >50K in India.

In [79]:
# Occupation tally for rows whose native-country is "India" and whose salary is ">50K",
# ordered so the most common occupation comes first.
from_india = df["native-country"] == "India"
earns_over_50k = df["salary"] == ">50K"
filtered = (
    df.loc[from_india & earns_over_50k]
    .groupby("occupation")["occupation"]
    .count()
    .sort_values(ascending=False)
)
filtered

occupation
Prof-specialty      25
Exec-managerial      8
Other-service        2
Tech-support         2
Adm-clerical         1
Sales                1
Transport-moving     1
Name: occupation, dtype: int64