In [1]:
import pandas as pd
import numpy as np

In [152]:
df = pd.read_csv('adult.data.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


#### 1. How many people of each race are represented in this dataset? This should be a Pandas series with race names as the index labels. (race column)

In [29]:
race = pd.Series(df['race']) #creates the series
print(race.unique())
print('-----------------------------------------------------------------')

w_count = 0
b_count = 0
a_count = 0
ie_count = 0
o_count = 0

#iterating through the series - LEAST EFFICIENT WAY TO SET THIS UP - Note this does not turn it into a Pandas series
for i in race:
    if i == 'White':
        w_count += 1
    if i == 'Black':
        b_count += 1
    if i == 'Asian-Pac-Islander':
        a_count += 1
    if i == 'Amer-Indian-Eskimo':
        ie_count += 1
    if i == 'Other':
        o_count += 1

print(f'White: {w_count}\nBlack: {b_count}\nAPI: {a_count}\nAIE: {ie_count}\nOther: {o_count}')
print('-----------------------------------------------------------------')

#Using booleans/True-False mask == BETTER WAY - Note this does not turn it into a Pandas series
for i in race.unique():    
    print(f'{i}: {race[race == i].count()}')
    
print('-----------------------------------------------------------------')

#EVEN EASIER USING BUILT-IN FUNCTION - CREATES THE NEEDED PANDAS SERIES
race_series = pd.Series(race.value_counts())
race_series.to_frame()
#OR
race_series = pd.Series(df['race'].value_counts()) #pulls from the entire database, not just the pre-made race-only series
race_series.to_frame()

['White' 'Black' 'Asian-Pac-Islander' 'Amer-Indian-Eskimo' 'Other']
-----------------------------------------------------------------
White: 27816
Black: 3124
API: 1039
AIE: 311
Other: 271
-----------------------------------------------------------------
White: 27816
Black: 3124
Asian-Pac-Islander: 1039
Amer-Indian-Eskimo: 311
Other: 271
-----------------------------------------------------------------


Unnamed: 0,race
White,27816
Black,3124
Asian-Pac-Islander,1039
Amer-Indian-Eskimo,311
Other,271


#### 2. What is the average age of men?

In [42]:
male_only_df = df[df['sex'] == 'Male'] #Copies the full data frame into one that only includes values where 'Male' is true
male_ages = pd.Series(male_only_df['age']) #Creates a series of the ages in the dataset, now that only men are in it
print(f'Average age of men in the dataset: {male_ages.mean()}')

male_only_df['sex'].value_counts() #CHECKS TO MAKE SURE THAT THE MALE ONLY DATAFRAME TRULY IS MALE-ONLY, 'Male' is returned as the only index == GOOD

Average age of men in the dataset: 39.43354749885268


Male    21790
Name: sex, dtype: int64

#### 3. What is the percentage of people who have a Bachelor's degree?

In [51]:
education = pd.Series(df['education'].value_counts()) #makes a count of people with all the different degrees in the dataset

bach_degs = round(education['Bachelors']/education.sum() * 100, 2)
print(f"Percentage of people with Bachelor's degrees: {bach_degs}%")

education

Percentage of people with Bachelor's degrees: 16.45%


HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: education, dtype: int64

#### 4. What percentage of people with advanced education (Bachelors, Masters, or Doctorate) make more than 50K?

In [113]:
adv_ed_df = df[((df['education'] == 'Bachelors') | (df['education'] == 'Masters') | (df['education'] == 'Doctorate'))] #creates dataframe of people with advanced degrees
adv_over_50K = adv_ed_df[adv_ed_df['salary'] == '>50K'] #creates dataframe of people with advanced degrees and salaries over 50K

advanced_education_over_50K_percent = round( (len(adv_over_50K.index))/(len(adv_ed_df.index)) * 100, 2)
print(f"Percentage of people with advanced education making >$50K: {advanced_education_over_50K_percent}%")

Percentage of people with advanced education making >$50K: 46.54%


#### 5. What percentage of people without advanced education make more than 50K?

In [114]:
non_adv_ed_df = df[((df['education'] != 'Bachelors') & (df['education'] != 'Masters') & (df['education'] != 'Doctorate'))] #dataframe of people without advanced degrees
non_adv_over_50K = df[((df['education'] != 'Bachelors') & (df['education'] != 'Masters') & (df['education'] != 'Doctorate')) & (df['salary'] == '>50K')] #df of people also making over 50K

non_advanced_education_over_50K_percent = round( (len(non_adv_over_50K.index))/(len(non_adv_ed_df.index)) * 100, 2)

print(f"Percentage of people without advanced education making >$50K: {non_advanced_education_over_50K_percent}%")

Percentage of people without advanced education making >$50K: 17.37%


#### 6. What is the minimum number of hours a person works per week?

In [123]:
hrs_worked = df['hours-per-week'] #Creates a series of hours worked per week
min_hrs_worked = hrs_worked.min() #Pulls the mean from the series

min_hrs_worked = df['hours-per-week'].min() #This can be done in one line of code this way

print(f'Minimum hours worked by a person per week: {min_hrs_worked} hour(s).')

Minimum hours worked by a person per week: 1 hour(s).


#### 7. What percentage of the people who work the minimum number of hours per week have a salary of more than 50K?

In [128]:
percent_min_hours_and_over_50K = round( ((len((df[(df['hours-per-week'] == 1) & (df['salary'] == '>50K')].index))) / (len((df[df['hours-per-week'] == 1].index)))) * 100, 2)
#This stuff was all done on one line. See above to see how to do it in a more piece-by-piece fashion

print(f'Percentage of people working the minimum hours per week and having a salary >50K: {percent_min_hours_and_over_50K}%')

Percentage of people working the minimum hours per week and having a salary >50K: 10.0%


#### 8. What country has the highest percentage of people that earn >50K and what is that percentage?

In [145]:
over_50K = df[df['salary'] == '>50K'] #creates dataframe of people making more than 50K
countries_over_50K = over_50K['native-country'] #creates series of those people based on their country of origin
countries_over_50K_series = countries.value_counts() #lists the unique values in the series and their counts
#OR 
countries_over_50K_series = (df[df['salary'] == '>50K'])['native-country'].value_counts() #This is it all in one line

countries_total_series = df['native-country'].value_counts()
#print(countries_over_50K_series)
#print(countries_total_series)

percentages_over_50K = countries_over_50K_series / countries_total_series
percentages_over_50K[percentages_over_50K == percentages_over_50K.max()]

Iran    0.418605
Name: native-country, dtype: float64

#### 9. Identify the most popular occupation for those who earn >50K in India.

In [151]:
occupation_series = (df[(df['salary'] == '>50K') & (df['native-country'] == 'India')])['occupation'].value_counts()

print(occupation_series[0:1]) #This is the most popular one.

occupation_series

Prof-specialty    25
Name: occupation, dtype: int64


Prof-specialty      25
Exec-managerial      8
Other-service        2
Tech-support         2
Transport-moving     1
Sales                1
Adm-clerical         1
Name: occupation, dtype: int64