**You must use Pandas to answer the following questions:**

1. [x] How many people of each race are represented in this dataset? This should be a Pandas series with race names as the index labels. (race column)
2. [x] What is the average age of men?
3. [x] What is the percentage of people who have a Bachelor's degree?
4. [x] What percentage of people with advanced education (Bachelors, Masters, or Doctorate) make more than 50K?
5. [x] What percentage of people without advanced education make more than 50K?
6. [x] What is the minimum number of hours a person works per week?
7. [x] What percentage of the people who work the minimum number of hours per week have a salary of more than 50K?
8. [x] What country has the highest percentage of people that earn >50K and what is that percentage?
9. [x] Identify the most popular occupation for those who earn >50K in India.

In [51]:
import pandas as pd

# Load adult.data.csv (path is relative to the working directory) into the
# DataFrame that every later cell reads from.
adult_pop_df = pd.read_csv('adult.data.csv')


In [2]:
# Preview the first rows to see the columns and how the values are formatted.
adult_pop_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# List the column labels. Several contain hyphens, so they have to be accessed
# with bracket indexing rather than attribute access.
adult_pop_df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

How many people of each race are represented in this dataset? This should be a Pandas series with race names as the index labels. (race column)

In [4]:
# Number of rows per race, returned as a Series indexed by race name.
adult_pop_df.loc[:, 'race'].value_counts()

race
White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: count, dtype: int64

What is the average age of men?

In [5]:
# Inspect the two columns needed for the average-age question.
adult_pop_df[['age', 'sex']]

Unnamed: 0,age,sex
0,39,Male
1,50,Male
2,38,Male
3,53,Male
4,28,Female
...,...,...
32556,27,Female
32557,40,Male
32558,58,Female
32559,22,Male


In [6]:
# Keep the 'age' column for the rows where sex is 'Male'.
# Note: because the column is selected with a list, this is a one-column
# DataFrame rather than a Series, despite the variable name.
male_age_series = adult_pop_df[adult_pop_df['sex'].eq('Male')][['age']]

In [7]:
# Mean age of the male rows, rounded to one decimal place.
round(male_age_series['age'].mean(), 1)

np.float64(39.4)

What is the percentage of people who have a Bachelor's degree?

In [8]:
# Inspect the raw education labels before filtering on them.
adult_pop_df['education']

0         Bachelors
1         Bachelors
2           HS-grad
3              11th
4         Bachelors
            ...    
32556    Assoc-acdm
32557       HS-grad
32558       HS-grad
32559       HS-grad
32560       HS-grad
Name: education, Length: 32561, dtype: object

In [9]:
# Number of rows whose education label is exactly 'Bachelors'
# (summing the boolean mask counts the True entries).
bach_degree_count = adult_pop_df['education'].eq('Bachelors').sum()

In [10]:
# Number of rows with a non-null education value; this is the denominator
# for the Bachelors percentage.
edu_count = adult_pop_df['education'].notna().sum()

In [11]:
# Share of 'Bachelors' rows among all non-null education entries, expressed
# as a percentage (left unrounded).
percent_degree = (bach_degree_count/edu_count)*100

In [12]:
# Display the Bachelors percentage.
percent_degree

np.float64(16.44605509658794)

What percentage of people with advanced education (Bachelors, Masters, or Doctorate) make more than 50K?

In [13]:
# Education labels that this notebook treats as "advanced education".
advanced_edu = ['Bachelors', 'Masters', 'Doctorate']

In [14]:
# Rows whose education label is one of the advanced levels.
advanced_edu_df = adult_pop_df.loc[adult_pop_df['education'].isin(advanced_edu)]

In [15]:
# Advanced-education rows whose salary label is '>50K'.
high_earners = advanced_edu_df.loc[advanced_edu_df['salary'].eq('>50K')]

In [16]:
# Percentage of the advanced-education rows that earn >50K (left unrounded).
percentage = (len(high_earners) / len(advanced_edu_df)) * 100

In [17]:
# Display the >50K percentage for the advanced-education group.
percentage

46.535843011613935

What percentage of people without advanced education make more than 50K?

In [18]:
# Show each distinct education label once, to see which labels fall outside
# the advanced group.
adult_pop_df['education'].drop_duplicates()


0         Bachelors
2           HS-grad
3              11th
5           Masters
6               9th
10     Some-college
13       Assoc-acdm
14        Assoc-voc
15          7th-8th
20        Doctorate
52      Prof-school
56          5th-6th
77             10th
160         1st-4th
224       Preschool
415            12th
Name: education, dtype: object

In [19]:
# Start from the distinct education labels; the next cell removes the
# advanced ones from this Series.
non_advanced_edu = adult_pop_df['education'].drop_duplicates()

In [20]:
# Drop the advanced labels from the distinct-label Series, leaving only the
# non-advanced ones. This reuses advanced_edu (defined for the previous
# question) instead of repeating the literal list, so the two groups cannot
# drift apart if the definition of "advanced" is ever edited.
non_advanced_edu = non_advanced_edu[~non_advanced_edu.isin(advanced_edu)]


In [21]:
# Rows whose education label is one of the non-advanced levels.
non_adv_edu_df = adult_pop_df.loc[adult_pop_df['education'].isin(non_advanced_edu)]

In [22]:
# Non-advanced-education rows whose salary label is '>50K'.
another_high_earners = non_adv_edu_df.loc[non_adv_edu_df['salary'].eq('>50K')]

In [23]:
# Percentage of the non-advanced rows that earn >50K, rounded to one decimal.
# NOTE(review): the advanced-education percentage is left unrounded; confirm
# which precision is intended so the two figures are comparable.
percentage_non_edu = round((len(another_high_earners) / len(non_adv_edu_df)) * 100, 1)

In [24]:
# Display the >50K percentage for the non-advanced group.
percentage_non_edu

17.4

What is the minimum number of hours a person works per week?

In [25]:
# Smallest hours-per-week value in the dataset. Series.min() is vectorized
# and skips missing values, whereas the builtin min() walks the Series one
# element at a time in Python and gives order-dependent results with NaN.
min_hrs = adult_pop_df['hours-per-week'].min()

In [26]:
# Rows for the people who work exactly the minimum number of hours per week.
min_hrs_work_df = adult_pop_df.loc[adult_pop_df['hours-per-week'].eq(min_hrs)]

In [27]:
# Display the rows whose hours-per-week equals the dataset minimum.
min_hrs_work_df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
189,58,State-gov,109567,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,1,United-States,>50K
1036,66,Self-emp-inc,150726,9th,5,Married-civ-spouse,Exec-managerial,Husband,White,Male,1409,0,1,?,<=50K
1262,69,?,195779,Assoc-voc,11,Widowed,?,Not-in-family,White,Female,0,0,1,United-States,<=50K
5590,78,?,363134,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,0,1,United-States,<=50K
5632,45,?,189564,Masters,14,Married-civ-spouse,?,Wife,White,Female,0,0,1,United-States,<=50K
5766,62,?,97231,Some-college,10,Married-civ-spouse,?,Wife,White,Female,0,0,1,United-States,<=50K
5808,76,?,211574,10th,6,Married-civ-spouse,?,Husband,White,Male,0,0,1,United-States,<=50K
8447,67,?,244122,Assoc-voc,11,Widowed,?,Not-in-family,White,Female,0,0,1,United-States,<=50K
9147,75,?,260543,10th,6,Widowed,?,Other-relative,Asian-Pac-Islander,Female,0,0,1,China,<=50K
11451,27,Private,147951,HS-grad,9,Never-married,Machine-op-inspct,Other-relative,White,Male,0,0,1,United-States,<=50K


In [28]:
# Answer to the question: the minimum hours-per-week value. Uses the
# vectorized Series.min() rather than the builtin min(), which iterates the
# Series element by element and mishandles missing values.
adult_pop_df['hours-per-week'].min()

1

In [29]:
# NOTE(review): this repeats the filter already assigned to min_hrs_work_df
# in an earlier cell; the result is the same, so this cell looks redundant.
min_hrs_work_df = adult_pop_df[adult_pop_df['hours-per-week'] == min_hrs]


What percentage of the people who work the minimum number of hours per week have a salary of more than 50K?

In [30]:
# Minimum-hours workers whose salary label is '>50K'.
min_for_50k = min_hrs_work_df.loc[min_hrs_work_df['salary'].eq('>50K')]

In [31]:
# Display the minimum-hours workers who earn >50K.
min_for_50k

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
189,58,State-gov,109567,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,1,United-States,>50K
20072,65,?,76043,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,0,1,United-States,>50K


In [32]:
# Percentage of the minimum-hours workers that earn >50K.
percentage_50k_min_hrs = (len(min_for_50k) / len(min_hrs_work_df)) * 100

In [33]:
# Display the >50K percentage among minimum-hours workers.
percentage_50k_min_hrs

10.0

What country has the highest percentage of people that earn >50K and what is that percentage?

In [34]:
# Every row with salary '>50K', reduced to the country and salary columns.
all_high_earners = adult_pop_df[adult_pop_df['salary'].eq('>50K')][['native-country', 'salary']]

In [35]:
# Number of >50K rows that have a non-null native-country.
# NOTE(review): this value is not referenced by any later cell visible here.
all_high_earners_count = all_high_earners['native-country'].count()

In [36]:
# Country and salary columns of the >50K rows (the same two columns that
# all_high_earners already holds).
countries_high_earning_df = all_high_earners.loc[:, ['native-country', 'salary']]

In [37]:
# Row count per salary label among the >50K rows.
# NOTE(review): this value is not referenced by any later cell visible here.
high_earn_count = all_high_earners['salary'].value_counts()

In [38]:
# Number of >50K earners per country. value_counts() aggregates the whole
# column in a single call, so no loop is needed: looping over the rows would
# recompute the identical result once per row, and would leave the name
# undefined if there were no rows at all.
country_high_earners_count = countries_high_earning_df['native-country'].value_counts()


In [39]:
# Total number of rows per country; used as the denominator when computing
# each country's >50K percentage.
country_all_people_count = adult_pop_df['native-country'].value_counts()

In [40]:
# Show the >50K counts per country, largest first.
country_high_earners_count.sort_values(ascending=False)

native-country
United-States         7171
?                      146
Philippines             61
Germany                 44
India                   40
Canada                  39
Mexico                  33
England                 30
Italy                   25
Cuba                    25
Japan                   24
Taiwan                  20
China                   20
Iran                    18
South                   16
Puerto-Rico             12
Poland                  12
France                  12
Jamaica                 10
El-Salvador              9
Greece                   8
Cambodia                 7
Yugoslavia               6
Hong                     6
Ireland                  5
Vietnam                  5
Portugal                 4
Haiti                    4
Ecuador                  4
Guatemala                3
Scotland                 3
Hungary                  3
Thailand                 3
Nicaragua                2
Trinadad&Tobago          2
Laos                     2
Columbia     

In [41]:
# Percentage of each country's rows that earn >50K, rounded to one decimal.
# Countries with no >50K earners are missing from country_high_earners_count,
# and a plain index-aligned division would give them NaN. Reindexing onto the
# full country index with fill_value=0 reports them as 0.0 instead.
percentage_50K_top_country = round(
    (
        country_high_earners_count.reindex(country_all_people_count.index, fill_value=0)
        / country_all_people_count
    ) * 100,
    1,
)

In [42]:
# Rank the countries by their >50K percentage, highest first.
percentage_50K_top_country.sort_values(ascending=False)

native-country
Iran                          41.9
France                        41.4
India                         40.0
Taiwan                        39.2
Japan                         38.7
Yugoslavia                    37.5
Cambodia                      36.8
Italy                         34.2
England                       33.3
Canada                        32.2
Germany                       32.1
Philippines                   30.8
Hong                          30.0
Greece                        27.6
China                         26.7
Cuba                          26.3
Scotland                      25.0
?                             25.0
United-States                 24.6
Hungary                       23.1
Ireland                       20.8
Poland                        20.0
South                         20.0
Thailand                      16.7
Ecuador                       14.3
Jamaica                       12.3
Laos                          11.1
Portugal                      10.8
Puert

In [43]:
# Top entry of the ranking (country label plus percentage), to one decimal.
percentage_50K_top_country.sort_values(ascending=False).iloc[:1].round(1)

native-country
Iran    41.9
Name: count, dtype: float64

In [44]:
# Highest >50K percentage across countries. max() reads the value directly
# (skipping NaN) instead of sorting the whole Series just to take its first
# element.
percentage_50K_top_country.max()


np.float64(41.9)

In [45]:
# Country with the highest >50K percentage. idxmax() returns the index label
# of the maximum (skipping NaN) without sorting the whole Series first.
percentage_50K_top_country.idxmax()

'Iran'

Identify the most popular occupation for those who earn >50K in India.

In [46]:
# All rows whose native-country is 'India', reduced to salary and occupation.
# Note: despite the "50k" in the name, no salary filter is applied yet; that
# happens in the next step.
India_50k_occupation = adult_pop_df[adult_pop_df['native-country'].eq('India')][['salary', 'occupation']]

In [47]:
# India rows whose salary label is '>50K'.
high_earners_India = India_50k_occupation[India_50k_occupation['salary'].eq('>50K')]

In [48]:
# Number of >50K earners in India per occupation. value_counts() aggregates
# the whole column in one call; iterating over the DataFrame (which yields
# its column labels) only repeated the same computation.
high_earners_occupations_count = high_earners_India['occupation'].value_counts()

In [49]:
# One-entry Series holding the occupation with the largest count.
top_occupation = high_earners_occupations_count.sort_values(ascending=False).iloc[:1]

In [50]:
# Index label of the single remaining entry, i.e. the most common occupation
# among India's >50K earners.
top_occupation.index[0]

'Prof-specialty'