In [1]:
import pandas as pd

In [2]:
# Load the dataset from a path relative to the notebook's working directory.
# read_csv's defaults are used, so the first row of the file supplies the column names.
df = pd.read_csv('adult.data.csv')

In [3]:
# Inspect the column names and the dtype pandas inferred for each column.
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
salary            object
dtype: object

### How many people of each race are represented in this dataset? The answer should be a pandas Series with race names as the index labels (use the `race` column).

In [4]:
# solution 01 || groupby
# Count the rows in each race group; the group keys become the index labels.
# NOTE(review): count() skips nulls, so counting `age` assumes that column has
# no missing values (df.dtypes reports it as int64).
race_data = df.groupby('race')['age'].count()
# Order explicitly from most to least represented instead of relying on the
# order in which each race first appears in the file.
grouped_race = race_data.sort_values(ascending=False)
grouped_race

race
White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: age, dtype: int64

In [5]:
# solution 02 || simplified
# value_counts() tallies each distinct value of the column, uses those values
# as the index labels, and sorts the counts in descending order by default.
df['race'].value_counts()

race
White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: count, dtype: int64

### What is the average age of men?

In [6]:
# solution 01
# Boolean mask for male rows, then one .loc call selects rows and column together.
is_male = df['sex'] == 'Male'
round(df.loc[is_male, 'age'].mean(), 1)

39.4

In [7]:
# solution 02
# Same result using the scalar's own .round() method rather than the builtin.
male_ages = df.loc[df['sex'].eq('Male'), 'age']
male_ages.mean().round(1)

39.4

### What is the percentage of people who have a Bachelor's degree?

In [8]:
# solution 01
# Total number of rows; `total` is also read by later cells, so it stays defined here.
total = df['education'].size
# Summing the boolean mask counts the matching rows directly, independent of
# missing values in any other column.
bachelors = (df['education'] == 'Bachelors').sum()
print(f'{round((bachelors / total) * 100, 1)}%')

16.4%


In [9]:
# solution 02
# The mean of a boolean mask is the fraction of True values.
has_bachelors = df['education'].eq('Bachelors')
round(has_bachelors.mean() * 100, 1)

16.4

### What percentage of people with advanced education (Bachelors, Masters, or Doctorate) make more than 50K?

In [10]:
# solution 01 || not the right answer
# Why it is wrong: the numerator counts rows that are both advanced-degree
# holders and >50K earners, but the denominator `total` is the size of the
# whole `education` column (assigned in the Bachelors cell). The result is the
# share of the entire dataset, not of the advanced-education group.
# `edu_list` and `top_salary` defined here are also read by the next cell.
edu_list = ['Bachelors' ,'Masters' ,'Doctorate']
advanced = df['education'].isin(edu_list)
top_salary = df['salary'] == '>50K'

# Number of rows satisfying both masks.
per = df[advanced & top_salary].shape[0]
print(f'{round((per / total) * 100, 2)}%')

10.71%


In [11]:
# solution 02 || not the right answer
# Still divides by the row count of the whole frame (`df.shape[0]`) instead of
# the number of advanced-education rows, so it is a share of the full dataset.
# NOTE(review): DataFrame.value_counts() groups identical rows, so `.shape[0]`
# here is the number of *distinct* matching rows — confirm that is intended.
df[df['education'].isin(edu_list) & top_salary].value_counts().shape[0] / df.shape[0]

0.10706059396210191

In [12]:
# solution 03 || the right one
edu_list = ['Bachelors', 'Masters', 'Doctorate']
top_salary = df['salary'] == '>50K'

# Row masks: holders of a Bachelors/Masters/Doctorate, and everyone else.
higher_education = df['education'].isin(edu_list)
lower_education = ~higher_education

# Group sizes and the number of >50K earners inside each group.
n_higher = int(higher_education.sum())
n_lower = int(lower_education.sum())
n_higher_rich = int((higher_education & top_salary).sum())
n_lower_rich = int((lower_education & top_salary).sum())

# Percentage of each group earning >50K; the denominator is the group itself.
higher_education_rich = round((n_higher_rich / n_higher) * 100, 1)
lower_education_rich = round((n_lower_rich / n_lower) * 100, 1)

In [13]:
# Percentage of advanced-education rows with salary >50K, rounded to one decimal.
higher_education_rich

46.5

### What percentage of people without advanced education make more than 50K?

In [14]:
# Percentage of rows without an advanced degree with salary >50K, rounded to one decimal.
lower_education_rich

17.4

### What is the minimum number of hours a person works per week?

In [15]:
# Smallest value found in the hours-per-week column.
df['hours-per-week'].min()

1

### What percentage of the people who work the minimum number of hours per week have a salary of more than 50K?

In [21]:
# solution 01 || wrong answer
# Why it is wrong: the count of minimum-hours workers earning >50K is divided
# by `total` (all rows in the dataset) rather than by the number of
# minimum-hours workers. Relies on `total` and `top_salary` from earlier cells.
min_hours = df['hours-per-week'] == df['hours-per-week'].min()
min_hours_sal = df[min_hours & top_salary].shape[0]
print(f'{round((min_hours_sal / total) * 100, 2)}%')

0.01%


In [27]:
# solution 02 || the right one
# Despite its name, `num_min_workers` is a boolean row mask, not a count.
fewest_hours = df['hours-per-week'].min()
num_min_workers = df['hours-per-week'] == fewest_hours

# >50K earners among minimum-hours workers, divided by all minimum-hours workers.
rich_min_workers = df[num_min_workers & top_salary]
all_min_workers = df[num_min_workers]
rich_percentage = (len(rich_min_workers) / len(all_min_workers)) * 100
rich_percentage

10.0

### What country has the highest percentage of people that earn >50K and what is that percentage?

In [28]:
# solution 01 || wrong answer
# Why it is wrong: this ranks countries by the absolute number of >50K rows,
# not by the percentage of each country's rows that are >50K, so countries
# with many rows dominate the ranking.
df[top_salary].groupby('native-country')['salary'].count().sort_values(ascending=False)

native-country
United-States         7171
?                      146
Philippines             61
Germany                 44
India                   40
Canada                  39
Mexico                  33
England                 30
Italy                   25
Cuba                    25
Japan                   24
China                   20
Taiwan                  20
Iran                    18
South                   16
Poland                  12
France                  12
Puerto-Rico             12
Jamaica                 10
El-Salvador              9
Greece                   8
Cambodia                 7
Yugoslavia               6
Hong                     6
Ireland                  5
Vietnam                  5
Ecuador                  4
Haiti                    4
Portugal                 4
Scotland                 3
Thailand                 3
Hungary                  3
Guatemala                3
Laos                     2
Dominican-Republic       2
Peru                     2
Trinadad&Toba

In [30]:
# solution 02 || right answer
# Percentage of >50K earners per country, sorted from highest to lowest.
# The mean of the boolean `top_salary` mask within each country is that
# country's share of >50K rows; it yields 0.0 rather than NaN for a country
# with no such rows. `top_salary` comes from an earlier cell.
earning_country = (
    top_salary.groupby(df['native-country']).mean() * 100
).sort_values(ascending=False)
# First entry of the descending series is the maximum.
highest_earning_country_percentage = earning_country.iloc[0].round(1)

In [None]:
# earning_country is sorted in descending order, so its first label is the
# country with the highest percentage of >50K earners.
earning_country.index[0]

In [None]:
# The top percentage from earning_country, rounded to one decimal place.
highest_earning_country_percentage

### Identify the most popular occupation for those who earn >50K in India.

In [31]:
# solution 01
# Restrict to >50K earners whose native country is India.
india = df['native-country'] == 'India'
filtered = df.loc[top_salary & india]

# Count rows per occupation (via the salary column) and take the largest group.
occupation_counts = filtered.groupby('occupation')['salary'].count()
occupation_counts.sort_values(ascending=False).index[0]

'Prof-specialty'

In [None]:
# solution 02
# Self-contained variant: build the mask inline, then let value_counts()
# order occupations by frequency so the first label is the most common one.
india_top_earners = (df['native-country'] == 'India') & (df['salary'] == '>50K')
df.loc[india_top_earners, 'occupation'].value_counts().index[0]