# Challenge 2

In [None]:
import pandas as pd


def _share_percent(values, label):
  """Return the percentage of entries in `values` equal to `label`.

  The share is scaled to percent *before* rounding to one decimal, so the
  result carries no float noise (e.g. 16.4 rather than 16.400000000000002).
  A label that never occurs (including an empty `values`) yields 0.0
  instead of raising KeyError.
  """
  share = values.value_counts(normalize=True).get(label, 0.0)
  return round(share * 100, 1)


def calculate_demographic_data(print_data=True):
  """Compute summary statistics from the census file 'adult.data.csv'.

  The CSV is read from the current working directory and must provide the
  columns 'age', 'sex', 'race', 'education', 'salary', 'hours-per-week',
  'native-country' and 'occupation'.

  Args:
    print_data: When true, print a human-readable report of every statistic.

  Returns:
    A dict with the keys 'race_count', 'average_age_men',
    'percentage_bachelors', 'higher_education_rich', 'lower_education_rich',
    'min_work_hours', 'rich_percentage', 'highest_earning_country',
    'highest_earning_country_percentage' and 'top_IN_occupation'.
    Percentages are rounded to one decimal place.
  """
  # Read data from file
  df = pd.read_csv('adult.data.csv')

  # Masks shared by several of the questions below.
  is_rich = df['salary'] == '>50K'
  has_advanced_education = df['education'].isin(
      ['Bachelors', 'Masters', 'Doctorate'])

  # How many of each race are represented in this dataset? This should be a Pandas series with race names as the index labels.
  race_count = df.value_counts('race')

  # What is the average age of men?
  # Only the numeric 'age' column is averaged; taking the mean of a frame that
  # also holds the string column 'sex' raises TypeError on pandas >= 2.0.
  average_age_men = round(df.loc[df['sex'] == 'Male', 'age'].mean(), 1)

  # What is the percentage of people who have a Bachelor's degree?
  percentage_bachelors = _share_percent(df['education'], 'Bachelors')

  # What percentage of people with advanced education (`Bachelors`, `Masters`, or `Doctorate`) make more than 50K?
  # What percentage of people without advanced education make more than 50K?
  higher_education_rich = _share_percent(
      df.loc[has_advanced_education, 'salary'], '>50K')
  lower_education_rich = _share_percent(
      df.loc[~has_advanced_education, 'salary'], '>50K')

  # What is the minimum number of hours a person works per week (hours-per-week feature)?
  min_work_hours = df['hours-per-week'].min()

  # What percentage of the people who work the minimum number of hours per week have a salary of >50K?
  # Filter on the computed minimum rather than a hard-coded value so the
  # answer stays correct for any dataset.
  rich_percentage = _share_percent(
      df.loc[df['hours-per-week'] == min_work_hours, 'salary'], '>50K')

  # What country has the highest percentage of people that earn >50K?
  # The mean of a boolean mask per country is the fraction of its rows that
  # earn >50K.
  rich_share_by_country = is_rich.groupby(df['native-country']).mean()
  highest_earning_country = rich_share_by_country.idxmax()
  highest_earning_country_percentage = round(
      rich_share_by_country.max() * 100, 1)

  # Identify the most popular occupation for those who earn >50K in India.
  rich_in_india = df[is_rich & (df['native-country'] == 'India')]
  occupation_counts = rich_in_india.value_counts('occupation')
  # value_counts sorts by descending frequency, so the first label is the top
  # occupation; None signals that no matching rows exist.
  top_IN_occupation = (
      occupation_counts.index[0] if len(occupation_counts) else None)

  # DO NOT MODIFY BELOW THIS LINE

  if print_data:
    print("Number of each race:\n", race_count)
    print("Average age of men:", average_age_men)
    print(f"Percentage with Bachelors degrees: {percentage_bachelors}%")
    print(
        f"Percentage with higher education that earn >50K: {higher_education_rich}%"
    )
    print(
        f"Percentage without higher education that earn >50K: {lower_education_rich}%"
    )
    print(f"Min work time: {min_work_hours} hours/week")
    print(
        f"Percentage of rich among those who work fewest hours: {rich_percentage}%"
    )
    print("Country with highest percentage of rich:", highest_earning_country)
    print(
        f"Highest percentage of rich people in country: {highest_earning_country_percentage}%"
    )
    print("Top occupations in India:", top_IN_occupation)

  return {
      'race_count': race_count,
      'average_age_men': average_age_men,
      'percentage_bachelors': percentage_bachelors,
      'higher_education_rich': higher_education_rich,
      'lower_education_rich': lower_education_rich,
      'min_work_hours': min_work_hours,
      'rich_percentage': rich_percentage,
      'highest_earning_country': highest_earning_country,
      'highest_earning_country_percentage': highest_earning_country_percentage,
      'top_IN_occupation': top_IN_occupation
  }
