## Family Income and Expenditure Survey 2012
__Group 5 Members:__
* Justine Valdes
* Joshua Esleta
* John Liong
* Mark Musngi


# Import Libraries

In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm
from scipy.stats import ttest_ind

## DataFrame

Variables and Dictionaries are in the file "fies_2012_v1_metadata(dictionary)"

In [44]:
df = pd.read_csv("FIES PUF 2012 Vol.1.csv")
df

Unnamed: 0,W_REGN,W_OID,W_SHSN,W_HCN,URB,RSTR,PSU,BWEIGHT,RFACT,FSIZE,...,PC_QTY,OVEN_QTY,MOTOR_BANCA_QTY,MOTORCYCLE_QTY,POP_ADJ,PCINC,NATPC,NATDC,REGDC,REGPC
0,14,101001000,2,25,2,21100,415052,138.25,200.6576,3.0,...,01,01,,,0.946172,108417.00,9,8,8,9
1,14,101001000,3,43,2,21100,415052,138.25,200.6576,12.5,...,,01,,01,0.946172,30631.60,5,9,9,4
2,14,101001000,4,62,2,21100,415052,138.25,200.6576,2.0,...,,01,,,0.946172,86992.50,9,6,6,8
3,14,101001000,5,79,2,21100,415052,138.25,200.6576,4.0,...,,01,,,0.946172,43325.75,6,6,6,6
4,14,101001000,10,165,2,21100,415052,138.25,200.6576,5.0,...,,,,01,0.946172,37481.80,6,6,6,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40166,12,9804029001,18,568,1,22000,114062,271.25,963.2264,5.0,...,00,00,00,00,0.902863,30101.60,5,5,7,7
40167,12,9804035000,1,25,2,22000,414067,271.25,588.6253,9.0,...,00,01,00,00,0.902863,14368.89,1,5,7,3
40168,12,9804035000,2,51,2,22000,414067,271.25,588.6253,6.0,...,00,00,00,00,0.902863,19137.33,3,4,6,4
40169,12,9804035000,3,75,2,22000,414067,271.25,588.6253,5.0,...,00,01,00,00,0.902863,30985.00,5,6,7,7


## Data Cleaning
* W_REGN
* NONAGRI_SAL
* EMPLOYED_PAY
* OCCUP
* JOB

In [45]:
# Select only wanted variables
selected_df = df[["W_REGN","W_OID","W_SHSN","W_HCN","NONAGRI_SAL","EMPLOYED_PAY","OCCUP","JOB"]]

In [51]:
selected_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21427 entries, 4 to 40169
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   W_REGN        21427 non-null  int64 
 1   W_OID         21427 non-null  int64 
 2   W_SHSN        21427 non-null  int64 
 3   W_HCN         21427 non-null  int64 
 4   NONAGRI_SAL   21427 non-null  int64 
 5   EMPLOYED_PAY  21427 non-null  object
 6   OCCUP         21427 non-null  object
 7   JOB           21427 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 1.5+ MB


## Dropped Duplicates


In [46]:
selected_df = selected_df.drop_duplicates()

In [47]:
selected_df = selected_df.dropna(subset=["OCCUP"])
selected_df

Unnamed: 0,W_REGN,W_OID,W_SHSN,W_HCN,NONAGRI_SAL,EMPLOYED_PAY,OCCUP,JOB
0,14,101001000,2,25,0,,,2
1,14,101001000,3,43,0,,1314,1
2,14,101001000,4,62,0,,,2
3,14,101001000,5,79,0,,6111,1
4,14,101001000,10,165,32000,01,5220,1
...,...,...,...,...,...,...,...,...
40166,12,9804029001,18,568,0,00,1314,1
40167,12,9804035000,1,25,50400,01,8321,1
40168,12,9804035000,2,51,0,00,1314,1
40169,12,9804035000,3,75,100761,01,1120,1


## Drop those who doesnt have a job

In [48]:
selected_df = selected_df[selected_df["JOB"] != 2]
selected_df

Unnamed: 0,W_REGN,W_OID,W_SHSN,W_HCN,NONAGRI_SAL,EMPLOYED_PAY,OCCUP,JOB
1,14,101001000,3,43,0,,1314,1
3,14,101001000,5,79,0,,6111,1
4,14,101001000,10,165,32000,01,5220,1
5,14,101001000,14,229,749628,03,6212,1
7,14,101001000,18,295,0,02,6111,1
...,...,...,...,...,...,...,...,...
40166,12,9804029001,18,568,0,00,1314,1
40167,12,9804035000,1,25,50400,01,8321,1
40168,12,9804035000,2,51,0,00,1314,1
40169,12,9804035000,3,75,100761,01,1120,1


## Drop Variable who aren't in agriculture

In [49]:
selected_df = selected_df[selected_df["NONAGRI_SAL"] > 0]
selected_df

Unnamed: 0,W_REGN,W_OID,W_SHSN,W_HCN,NONAGRI_SAL,EMPLOYED_PAY,OCCUP,JOB
4,14,101001000,10,165,32000,01,5220,1
5,14,101001000,14,229,749628,03,6212,1
11,14,101001000,24,392,82204,01,6111,1
13,14,101002000,2,44,6900,02,5132,1
17,14,101002000,7,162,98100,01,6111,1
...,...,...,...,...,...,...,...,...
40163,12,9804029001,13,8004,54300,,8321,1
40164,12,9804029001,15,8005,54300,01,1314,1
40165,12,9804029001,17,537,58100,01,8321,1
40167,12,9804035000,1,25,50400,01,8321,1


## Convert Employed Pay from string to int

In [52]:
selected_df["EMPLOYED_PAY"] = selected_df["EMPLOYED_PAY"].astype(int)
employed_df = selected_df[["EMPLOYED_PAY"]]
employed_df.head(40)

ValueError: invalid literal for int() with base 10: '  '

## Drop Rows who has 2 or more people who are working in a household

In [50]:
selected_df = selected_df[selected_df["EMPLOYED_PAY"] <= 2]
selected_df

TypeError: '<=' not supported between instances of 'str' and 'int'