In [1]:
# Import the necessary libraries
import numpy as np
import pandas as pd

In [2]:
# Import the dataset
df = pd.read_csv('adultdata.csv')

In [3]:
# Get a glimpse of the dataset
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Pleriminary information on columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [5]:
# Descriptive statistics
df.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [6]:
# Count of people by race
race = df['race'].value_counts()
race

White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: race, dtype: int64

In [7]:
# Average age of men
men = df[df['sex'] == 'Male'] # this filters all the men in the dataframe
average_age = men['age'].mean()
print('The average age of men is {}'.format(average_age))

The average age of men is 39.43354749885268


In [8]:
# The percentage of people with a bachelor's degree
bachelor_degree = df[df['education'] == 'Bachelors']
percentage_1 = (len(bachelor_degree) / len(df)) * 100
print("The percentage of people with a bachelor degree is {} %".format(percentage_1))

The percentage of people with a bachelor degree is 16.44605509658794 %


In [9]:
# The percentage of people with advanced education who make more than 50k
advanced_education = ['Bachelors', 'Masters', 'Doctorate']
people = df[(df['education'].isin(advanced_education)) & (df['salary'] == '>50K')]
percentage_2 = (len(people) / len(df)) * 100
print('The percentage of people with advanced education who who make more than 50K is {} % '.format(percentage_2))

The percentage of people with advanced education who who make more than 50K is 10.706059396210192 % 


In [10]:
# The percentage of people without advanced education who make more than 50k
people_1 = df[~(df['education'].isin(advanced_education)) & (df['salary'] == '>50K')]
percentage_3 = (len(people_1) / len(df)) * 100
print('The percentage of people without advanced education and who make more than 50K is {} %'.format(percentage_3))

The percentage of people without advanced education and who make more than 50K is 13.374896348392248 %


In [11]:
# Minimum working hours per week
minimum = df['hours-per-week'].min()
print('The minimum working hours per week is {} hour/s per week'.format(minimum))

The minimum working hours per week is 1 hour/s per week


In [12]:
# Percentage of people who work minimum hours per week and make more than 50K
people_2 = df[(df['hours-per-week'] == minimum) & (df['salary'] == '>50K')]
percentage_4 = (len(people_2) / len(df)) * 100
print("The percentage of people who work work minimum hours per week and earn more than 50K is {}".format(percentage_4))

The percentage of people who work work minimum hours per week and earn more than 50K is 0.006142317496391388


In [13]:
# The country with the highest percentage of people who earn more than 50K
high_salaries = df[df['salary'] == '>50K']
groups = high_salaries.groupby('native-country')
percentage_5 = (groups.size() / df['native-country'].value_counts()) * 100
results = percentage_5.sort_values(ascending=False) # This sorts our results from highest to lowest
print(results)

Iran                          41.860465
France                        41.379310
India                         40.000000
Taiwan                        39.215686
Japan                         38.709677
Yugoslavia                    37.500000
Cambodia                      36.842105
Italy                         34.246575
England                       33.333333
Canada                        32.231405
Germany                       32.116788
Philippines                   30.808081
Hong                          30.000000
Greece                        27.586207
China                         26.666667
Cuba                          26.315789
?                             25.042882
Scotland                      25.000000
United-States                 24.583476
Hungary                       23.076923
Ireland                       20.833333
South                         20.000000
Poland                        20.000000
Thailand                      16.666667
Ecuador                       14.285714


In [14]:
# Occupations of people who earn more than 50K in India
high_salary_indians = df[(df['native-country'] == 'India') & (df['salary'] == '>50K')]
occupations = high_salary_indians['occupation'].value_counts()
occupations

Prof-specialty      25
Exec-managerial      8
Other-service        2
Tech-support         2
Transport-moving     1
Sales                1
Adm-clerical         1
Name: occupation, dtype: int64