## Imported Libraries

In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from scipy.stats import ttest_ind
pd.set_option("display.max_rows", 200)

## Read CSV File

In [55]:
df = pd.read_csv("FIES PUF 2012 Vol.1.CSV")
df

Unnamed: 0,W_REGN,W_OID,W_SHSN,W_HCN,URB,RSTR,PSU,BWEIGHT,RFACT,FSIZE,...,PC_QTY,OVEN_QTY,MOTOR_BANCA_QTY,MOTORCYCLE_QTY,POP_ADJ,PCINC,NATPC,NATDC,REGDC,REGPC
0,14,101001000,2,25,2,21100,415052,138.25,200.6576,3.0,...,01,01,,,0.946172,108417.00,9,8,8,9
1,14,101001000,3,43,2,21100,415052,138.25,200.6576,12.5,...,,01,,01,0.946172,30631.60,5,9,9,4
2,14,101001000,4,62,2,21100,415052,138.25,200.6576,2.0,...,,01,,,0.946172,86992.50,9,6,6,8
3,14,101001000,5,79,2,21100,415052,138.25,200.6576,4.0,...,,01,,,0.946172,43325.75,6,6,6,6
4,14,101001000,10,165,2,21100,415052,138.25,200.6576,5.0,...,,,,01,0.946172,37481.80,6,6,6,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40166,12,9804029001,18,568,1,22000,114062,271.25,963.2264,5.0,...,00,00,00,00,0.902863,30101.60,5,5,7,7
40167,12,9804035000,1,25,2,22000,414067,271.25,588.6253,9.0,...,00,01,00,00,0.902863,14368.89,1,5,7,3
40168,12,9804035000,2,51,2,22000,414067,271.25,588.6253,6.0,...,00,00,00,00,0.902863,19137.33,3,4,6,4
40169,12,9804035000,3,75,2,22000,414067,271.25,588.6253,5.0,...,00,01,00,00,0.902863,30985.00,5,6,7,7


## Data Cleaning
* W_REGN
* NONAGRI_SAL
* EMPLOYED_PAY
* OCCUP
* JOB




In [78]:
# Get only people who have jobs
selected_df = df[["W_REGN","W_OID","W_SHSN","W_HCN","NONAGRI_SAL","EMPLOYED_PAY","OCCUP","JOB","SEX"]]

In [79]:
selected_df = selected_df.drop_duplicates()

In [80]:
selected_df = selected_df.dropna(subset=["OCCUP"])

In [81]:
selected_df = selected_df[selected_df["JOB"] != 2]

In [82]:
selected_df = selected_df[selected_df["NONAGRI_SAL"] > 0]


In [83]:
selected_df.loc[:, "EMPLOYED_PAY"] = pd.to_numeric(selected_df["EMPLOYED_PAY"], errors='coerce')
selected_df.loc[:, "EMPLOYED_PAY"] = selected_df["EMPLOYED_PAY"].fillna(0).astype(int)
employed_df = selected_df[["EMPLOYED_PAY"]]

  selected_df.loc[:, "EMPLOYED_PAY"] = selected_df["EMPLOYED_PAY"].fillna(0).astype(int)


In [84]:
selected_df = selected_df[selected_df["EMPLOYED_PAY"] == 1]

In [85]:
clean_df = selected_df
clean_df

Unnamed: 0,W_REGN,W_OID,W_SHSN,W_HCN,NONAGRI_SAL,EMPLOYED_PAY,OCCUP,JOB,SEX
4,14,101001000,10,165,32000,1,5220,1,1
11,14,101001000,24,392,82204,1,6111,1,1
17,14,101002000,7,162,98100,1,6111,1,1
27,14,101030001,1,1,63246,1,1130,1,1
33,14,101030001,12,191,293556,1,2331,1,2
...,...,...,...,...,...,...,...,...,...
40160,12,9804029001,7,213,72000,1,9132,1,1
40164,12,9804029001,15,8005,54300,1,1314,1,1
40165,12,9804029001,17,537,58100,1,8321,1,1
40167,12,9804035000,1,25,50400,1,8321,1,1


## EDA Questions

* How does the average salary of the top job type in each region compare to the region's overall average salary?
* Which region has the highest overall average salary for non-agricultural jobs?
* What are the most common occupations in the dataset, and how many people are employed in each occupation across different regions?
* Is there a notable difference in the salary distribution by gender for sole earners in non-agricultural roles?


Q1: How does the average salary of the top job type in each region compare to the region's overall average salary?

Q2: Which Region has the highest overall average salary for non-agricultural jobs?

In [86]:
grouped_regions = clean_df.groupby(["W_REGN"])
grouped_regions = grouped_regions.agg({"NONAGRI_SAL" : "mean"})
grouped_regions

Unnamed: 0_level_0,NONAGRI_SAL
W_REGN,Unnamed: 1_level_1
1,83418.048077
2,101628.540761
3,108091.371648
5,67089.026899
6,84732.004329
7,82392.659409
8,68829.128596
9,80824.521459
10,93660.757112
11,95320.280639


In [87]:
highest_mean_salary = grouped_regions["NONAGRI_SAL"].max()
highest_region = grouped_regions["NONAGRI_SAL"].idxmax()
print(f"Region {highest_region} has the highest mean non-agricultural salary: ", "{:.2f}".format(highest_mean_salary))

Region 13 has the highest mean non-agricultural salary:  159717.67


Q3: What are the top 10 common occupations in the dataset, and how many people are employed in each occupation across different regions?
-- mode for each occupation
-- sort
-- get top 10

In [92]:
grouped_occupations = clean_df.groupby(["W_REGN","NONAGRI_SAL"])
grouped_occupations = grouped_occupations.agg({"NONAGRI_SAL" : "count"})
grouped_occupations.head(20)


Unnamed: 0_level_0,Unnamed: 1_level_0,NONAGRI_SAL
W_REGN,NONAGRI_SAL,Unnamed: 2_level_1
1,750,1
1,1000,1
1,1500,1
1,1860,1
1,2220,1
1,2394,1
1,2700,2
1,2720,1
1,3000,3
1,3600,1


Q4: Is there a notable difference in the salary distribution by gender for sole earners in non-agricultural roles?

In [None]:
grouped_sex = clean_df.groupby(["SEX"])

## Research Question
__1. What type of job has the highest average salary in each region, considering only individuals who work in non-agricultural jobs?__