In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# ignore warnings
import scipy.stats as stats
warnings.filterwarnings('ignore')
##### Read & preprocess the data (from kostat) #####
df_raw23 = pd.read_csv('data/2023_가구마스터_20240501_50723.csv',encoding='CP949')
selected_cols = ["조사연도","수도권여부","MD제공용_가구고유번호","가구주_성별코드","가구주_만연령","가구원수","가구주_교육정도_학력코드","가구주_혼인상태코드","자산","부채","처분가능소득(보완)[경상소득(보완)-비소비지출(보완)]","가구주_산업대분류코드","가구주_직업대분류코드","입주형태코드"]
df23 = df_raw23[selected_cols]
df23.rename(columns={"조사연도":"year","수도권여부":"metro","MD제공용_가구고유번호":"id","가구주_성별코드":"sex","가구주_만연령":"age","가구원수":"number","가구주_교육정도_학력코드":"education","가구주_혼인상태코드":"marriage","자산":"asset","부채":"debt","처분가능소득(보완)[경상소득(보완)-비소비지출(보완)]":"income","가구주_산업대분류코드":"industry","가구주_직업대분류코드":"job","입주형태코드":"house"},inplace=True)
print(df23.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18094 entries, 0 to 18093
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   year       18094 non-null  int64 
 1   metro      18094 non-null  object
 2   id         18094 non-null  int64 
 3   sex        18094 non-null  int64 
 4   age        18094 non-null  int64 
 5   number     18094 non-null  int64 
 6   education  18094 non-null  int64 
 7   marriage   18094 non-null  int64 
 8   asset      18094 non-null  int64 
 9   debt       18094 non-null  int64 
 10  income     18094 non-null  int64 
 11  industry   13356 non-null  object
 12  job        13356 non-null  object
 13  house      18094 non-null  int64 
dtypes: int64(11), object(3)
memory usage: 1.9+ MB
None


In [4]:
##### Test of income difference according to occupation #####
##### occupational types #####
# 1 : 관리자
# 2 : 전문가 및 관련 종사자
# 3 : 사무 종사자
# 4 : 서비스 종사자
# 5 : 판매 종사자
# 6 : 농림어업 숙련？종사자
# 7 : 기능원 및 관련 기능 종사자
# 8 : 장치,기계조작 및 조립 종사자
# 9 : 단순노무 종사자
# A : 군인
print("Job category : ", df23['job'].unique())
df23 = df23[['sex','job']]
print(df23.info())

Job category :  ['4' '5' '2' '9' nan '3' '8' '7' '1' '6' 'A']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18094 entries, 0 to 18093
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   sex     18094 non-null  int64 
 1   job     13356 non-null  object
dtypes: int64(1), object(1)
memory usage: 282.8+ KB
None


In [5]:
print("# of nulls :", df23['job'].isnull().sum())
df23 = df23.dropna()
print(df23.info())

# of nulls : 4738
<class 'pandas.core.frame.DataFrame'>
Index: 13356 entries, 0 to 18093
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   sex     13356 non-null  int64 
 1   job     13356 non-null  object
dtypes: int64(1), object(1)
memory usage: 313.0+ KB
None


In [6]:
##### Option: Removal of groups (soldiers) comprised mostly of men. (as a duty in Korea)#####
df23 = df23.drop(df23[(df23.job=="A")].index)
print(df23.info())

<class 'pandas.core.frame.DataFrame'>
Index: 13330 entries, 0 to 18093
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   sex     13330 non-null  int64 
 1   job     13330 non-null  object
dtypes: int64(1), object(1)
memory usage: 312.4+ KB
None


In [7]:
##### Display the observation frequency: Gender distribution by job #####
print("##### Observation Frequency #####")
tab1 = pd.crosstab(df23.sex, df23.job, margins=True) #margins=True : Include total values
print(tab1)

##### Observation Frequency #####
job    1     2     3     4     5     6     7     8     9    All
sex                                                            
1    285  1572  1519   581   808  1024  1371  1716  1339  10215
2     17   542   388   701   327   192    76    78   794   3115
All  302  2114  1907  1282  1135  1216  1447  1794  2133  13330


Exercise 4.

In [13]:
# 데이터 로드 및 전처리
df_raw23 = pd.read_csv('data/2023_가구마스터_20240501_50723.csv', encoding='CP949')
selected_cols = ["조사연도","수도권여부","MD제공용_가구고유번호","가구주_성별코드","가구주_만연령","가구원수",
                 "가구주_교육정도_학력코드","가구주_혼인상태코드","자산","부채",
                 "처분가능소득(보완)[경상소득(보완)-비소비지출(보완)]","가구주_산업대분류코드",
                 "가구주_직업대분류코드","입주형태코드"]
df23 = df_raw23[selected_cols]
df23.rename(columns={"조사연도":"year","수도권여부":"metro","MD제공용_가구고유번호":"id",
                     "가구주_성별코드":"sex","가구주_만연령":"age","가구원수":"number",
                     "가구주_교육정도_학력코드":"education","가구주_혼인상태코드":"marriage",
                     "자산":"asset","부채":"debt",
                     "처분가능소득(보완)[경상소득(보완)-비소비지출(보완)]":"income",
                     "가구주_산업대분류코드":"industry","가구주_직업대분류코드":"job",
                     "입주형태코드":"house"}, inplace=True)


# 필요한 컬럼만 선택
df23 = df23[['income', 'education']]


In [18]:
df23.describe()

Unnamed: 0,income,education
count,18094.0,18094.0
mean,5053.512656,4.258428
std,4935.612087,1.626067
min,-22848.0,1.0
25%,1977.0,3.0
50%,3890.5,4.0
75%,6751.0,6.0
max,223958.0,8.0


In [15]:
# 교육 수준에 따라 그룹화
target1 = [1, 2, 3]
target2 = [4, 5]
target3 = [6, 7, 8]

df23_L = df23[df23['education'].isin(target1)]
df23_M = df23[df23['education'].isin(target2)]
df23_H = df23[df23['education'].isin(target3)]

# 그룹별 평균 계산
mean_L = df23_L['income'].mean()
mean_M = df23_M['income'].mean()
mean_H = df23_H['income'].mean()

# 그룹 간 분산(SSb) 계산
overall_mean = df23['income'].mean()
SSb_L = len(df23_L) * (mean_L - overall_mean)**2
SSb_M = len(df23_M) * (mean_M - overall_mean)**2
SSb_H = len(df23_H) * (mean_H - overall_mean)**2
SSb = SSb_L + SSb_M + SSb_H

# 그룹 내 분산(SSw) 계산
SSw_L = ((df23_L['income'] - mean_L)**2).sum()
SSw_M = ((df23_M['income'] - mean_M)**2).sum()
SSw_H = ((df23_H['income'] - mean_H)**2).sum()
SSw = SSw_L + SSw_M + SSw_H

# 자유도 계산
DFb = 2  # 그룹 수 - 1
DFw = len(df23) - 3  # 전체 샘플 수 - 그룹 수

# 평균제곱 계산
MSb = SSb / DFb
MSw = SSw / DFw

# F-통계량 계산
f_score = MSb / MSw

# P-value 계산
p_value = 1 - stats.f.cdf(f_score, DFb, DFw)

# 결과 출력
print(f"F-Score: {f_score}")
print(f"P-Value: {p_value}")

# 유의 수준 0.05와 비교하여 결과 해석
alpha = 0.05
if p_value < alpha:
    print("귀무가설을 기각합니다. 교육 수준에 따른 소득 차이가 유의미합니다.")
else:
    print("귀무가설을 기각할 수 없습니다. 교육 수준에 따른 소득 차이가 유의미하지 않습니다.")

F-Score: 1315.73855791482
P-Value: 1.1102230246251565e-16
귀무가설을 기각합니다. 교육 수준에 따른 소득 차이가 유의미합니다.
