In [5]:
#import libraries
import pandas as pd


In [2]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                'marital-status', 'occupation', 'relationship', 'race',
                'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
                'native-country', 'salary']
# Read the CSV file
df = pd.read_csv(url, header=None, names=column_names, na_values=' ?', skipinitialspace=True)


In [3]:
# 1. How many people of each race are represented in this dataset?
race_count = df['race'].value_counts()

# 2. What is the average age of men?
average_age_men = df[df['sex'] == 'Male']['age'].mean()

# 3. What is the percentage of people who have a Bachelor's degree?
bachelors_percentage = (df['education'] == 'Bachelors').mean() * 100

# 4. What percentage of people with advanced education (Bachelors, Masters, or Doctorate) make more than 50K?
advanced_education = df['education'].isin(['Bachelors', 'Masters', 'Doctorate'])
percentage_advanced_salary = (df[advanced_education]['salary'] == '>50K').mean() * 100

# 5. What percentage of people without advanced education make more than 50K?
non_advanced_education = ~advanced_education
percentage_non_advanced_salary = (df[non_advanced_education]['salary'] == '>50K').mean() * 100

# 6. What is the minimum number of hours a person works per week?
min_hours_per_week = df['hours-per-week'].min()

# 7. What percentage of the people who work the minimum number of hours per week have a salary of more than 50K?
min_hours_salary_percentage = (df[df['hours-per-week'] == min_hours_per_week]['salary'] == '>50K').mean() * 100

# 8. What country has the highest percentage of people that earn >50K and what is that percentage?
country_salary_count = df[df['salary'] == '>50K']['native-country'].value_counts(normalize=True) * 100
highest_earning_country = country_salary_count.idxmax()
highest_percentage = country_salary_count.max()

# 9. Identify the most popular occupation for those who earn >50K in India.
most_popular_occupation_india = df[(df['salary'] == '>50K') & (df['native-country'] == 'India')]['occupation'].value_counts().idxmax()

# Output results
print("Race count:\n", race_count)
print("Average age of men:", average_age_men)
print("Percentage of people with Bachelor's degree:", round(bachelors_percentage, 1))
print("Percentage of people with advanced education earning >50K:", round(percentage_advanced_salary, 1))
print("Percentage of people without advanced education earning >50K:", round(percentage_non_advanced_salary, 1))
print("Minimum hours per week:", min_hours_per_week)
print("Percentage of minimum hours worked earning >50K:", round(min_hours_salary_percentage, 1))
print("Country with highest percentage earning >50K:", highest_earning_country, "with percentage:", round(highest_percentage, 1))
print("Most popular occupation for those earning >50K in India:", most_popular_occupation_india)


Race count:
 race
White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: count, dtype: int64
Average age of men: 39.43354749885268
Percentage of people with Bachelor's degree: 16.4
Percentage of people with advanced education earning >50K: 46.5
Percentage of people without advanced education earning >50K: 17.4
Minimum hours per week: 1
Percentage of minimum hours worked earning >50K: 10.0
Country with highest percentage earning >50K: United-States with percentage: 91.5
Most popular occupation for those earning >50K in India: Prof-specialty
