In [215]:
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
import numpy  as np
import seaborn as sns
%matplotlib inline

# Notebook-wide settings: data directory, pandas display limits, plot font.
path = './data/'
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', 350)
# Malgun Gothic is used for Korean labels; unicode_minus is disabled so the
# minus sign is drawn with a plain hyphen glyph.
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

# 데이터 로드

In [216]:
# Read both source tables from the data directory; both files are read as cp949.
sigungu, sigungu_3 = (
    pd.read_csv(path + csv_name, encoding='cp949')
    for csv_name in ('sigungu_imp_10.csv', '시군구별지역안전지표_2015.csv')
)

In [217]:
# Split the tables purely by column position.
name = sigungu.iloc[:, :4]         # first 4 columns: names
X = sigungu.iloc[:, 4:-7]          # everything in between: independent variables X
y_bin = sigungu.iloc[:, -7:]       # last 7 columns: grades (binary)
y_deci = sigungu_3.iloc[:, -7:]    # last 7 columns of the 2015 table: grades (1-5)

In [218]:
# Short English labels for the seven binary grade columns, in column order.
grade_labels = ['fire', 'transport', 'disaster', 'crime', 'accident', 'suicide', 'infection']
y_bin.columns = grade_labels

In [219]:
# Features + binary grades first, then names on the left and the 1-5 grades on the right.
sigungu_xy = pd.concat([X, y_bin], axis='columns')
sigungu_nxy = pd.concat([name, sigungu_xy, y_deci], axis='columns')
sigungu_nxy.shape

(226, 162)

# 지표추가

In [220]:
# Add the sex-ratio indicator.
성비_2015 = pd.read_csv(path + '성비_2015.csv', encoding='cp949').drop(columns=['연령별'])
성비_2015.columns = ['시도', '시군구', '성비']
# NOTE(review): no `on=` is given, so the join uses every column name the two
# frames share -- confirm that is only 시도/시군구.
sigungu_nxy = sigungu_nxy.merge(성비_2015)
성비_2015.shape

(281, 3)

In [221]:
# Add the foreign-resident ratio.
# NOTE(review): absolute, machine-specific path -- the notebook cannot run on
# another machine unless this folder exists there.
transpath = 'C:/Users/COM/Desktop/교통사고/'
주민등록인구 = pd.read_csv(transpath + '주민등록인구.csv', encoding='cp949').rename(
    columns={'전국': '시도', '소계': '시군구', '주민등록인구': '전체인구'}
)
# Registered foreigners as a percentage of the total population.
주민등록인구['외국인비율'] = 주민등록인구['등록외국인수'] / 주민등록인구['전체인구'] * 100
sigungu_nxy = sigungu_nxy.merge(주민등록인구, on=['시도', '시군구'])
주민등록인구.shape

(245, 8)

In [222]:
# Add the manufacturing share.
manufact = pd.read_csv(transpath + '2017_제조업체_사업체.csv', encoding='cp949')
# Manufacturing establishments as a fraction of all establishments.
manufact['제조업비율'] = manufact['제조사업체수'].div(manufact['사업체수'])
sigungu_nxy = sigungu_nxy.merge(manufact, on=['시도', '시군구'])
sigungu_nxy.shape

(225, 172)

In [223]:
# Add the drunk-driving share (drunk-driving accidents / all traffic accidents).
drunk_accidents = sigungu_nxy['음주교통사고 발생건수']
all_accidents = sigungu_nxy['교통사고 발생건수']
sigungu_nxy['음주교통사고 발생비율'] = drunk_accidents / all_accidents

# 함수선언
### 중앙값선과 회귀선을 그리는 산점도 함수

In [224]:
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm
def scatter(x, y, color, df, hovername=None,text=False, medianline=False, palette=['dimgrey','mediumorchid'], trendline=None):
    """Show a plotly scatter of ``df[y]`` against ``df[x]`` with an overall OLS line.

    Parameters
    ----------
    x, y : str
        Column names in ``df`` for the horizontal and vertical axes.
    color : str
        Column name used to colour the points.
    df : pandas.DataFrame
        Source data.
    hovername : str, optional
        Column name shown as the hover title.
    text : bool
        When truthy, every point is labelled with its '시군구' value.
    medianline : bool
        When truthy, thin black lines are drawn at the medians of x and y.
    palette : list of str
        Discrete colour sequence passed to plotly (not modified here).
    trendline : str, optional
        Forwarded unchanged to ``px.scatter``.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    def _outward(bound, factor):
        # Multiplying a negative bound by `factor` moves it towards the data
        # and would clip points, so mirror the factor around 1 in that case.
        return bound * factor if bound >= 0 else bound * (2 - factor)

    # Visible axis window: padded data range (same padding factors as before
    # for non-negative data).
    x_range = [_outward(df[x].min(), 0.8), _outward(df[x].max(), 1.05)]
    y_range = [_outward(df[y].min(), 0.5), _outward(df[y].max(), 1.05)]

    # Scatter points -----
    fig = px.scatter(df, x=x, y=y, text='시군구' if text else None, trendline=trendline,
                     hover_name=hovername, color=color, color_discrete_sequence=palette)
    fig.update_traces(textposition='top center')

    # Median lines -----
    if medianline:
        thin_black = dict(color='rgb(0,0,0)', width=0.5)
        x_median = df[x].median()
        y_median = df[y].median()
        # The lines cover at least the previous fixed +/-10000 span and are
        # widened whenever the visible axis window is larger than that.
        fig.add_trace(go.Scatter(x=[x_median] * 2,
                                 y=[min(y_range[0], -10000), max(y_range[1], 10000)],
                                 name="median x: " + str(round(x_median, 2)),
                                 mode="lines",
                                 line=thin_black,
                                 visible=True))
        fig.add_trace(go.Scatter(x=[min(x_range[0], -10000), max(x_range[1], 10000)],
                                 y=[y_median] * 2,
                                 name="median y: " + str(round(y_median, 2)),
                                 mode="lines",
                                 line=thin_black,
                                 visible=True))
        # To move the legend: fig.update_layout(legend=dict(x=-.05, y=1.15))

    # Overall trend line (OLS of y on x over every row of df) -----
    regline = sm.OLS(df[y], sm.add_constant(df[x])).fit().fittedvalues
    fig.add_trace(go.Scatter(x=df[x], y=regline,
                             mode='lines',
                             marker_color='black',
                             line=dict(color='rgb(0,0,0)', width=0.5, dash='7px'),
                             name='trend all'))

    # Layout (x/y window) -----
    fig.update_xaxes(range=x_range)
    fig.update_yaxes(range=y_range)
    fig.show()

In [225]:
# Palette definitions: grey plus one accent colour each.
darkcyan, darkorange = ['dimgrey', 'darkcyan'], ['dimgrey', 'darkorange']

### 회귀선 기준 위/아래 개수를 세는 함수

In [226]:
from scipy import stats
def total_regression(x_,y_, totaldf,groupdf):
    """Fit ``y_ ~ x_`` on ``totaldf`` and count ``groupdf`` rows on/above vs. below that line.

    The ``scipy.stats.linregress`` result and its R-squared are printed for
    the overall fit.

    Parameters
    ----------
    x_, y_ : str
        Column names of the predictor and the response (present in both frames).
    totaldf : pandas.DataFrame
        Rows used to fit the regression line.
    groupdf : pandas.DataFrame
        Rows that are compared against the fitted line.

    Returns
    -------
    pandas.DataFrame
        Rows 'total', 'upper' (y >= predicted) and 'lower' with columns
        '', 'count' and 'ratio'. For an empty ``groupdf`` the counts are 0
        and the ratios are NaN.
    """
    # Regression over the whole table; fitted once and reused for the prints.
    fit = stats.linregress(totaldf[x_].tolist(), totaldf[y_].tolist())
    print(fit)
    print('Rsquared=', fit[2]**2)

    # Predicted y for every row of the group, using the overall line.
    group_x = np.asarray(groupdf[x_].tolist(), dtype=float)
    group_y = np.asarray(groupdf[y_].tolist(), dtype=float)
    predicted = fit[0] * group_x + fit[1]

    total = len(group_y)
    # Rows exactly on the line count as 'upper'; NaN comparisons are False and
    # therefore fall into 'lower', as in the element-wise loop this replaces.
    upper = int(np.count_nonzero(group_y >= predicted))
    lower = total - upper

    if total:
        ratios = [total / total, upper / total, lower / total]
    else:
        # An empty group has no meaningful proportions.
        ratios = [np.nan, np.nan, np.nan]

    return pd.DataFrame({'': ['total', 'upper', 'lower'],
                         'count': [total, upper, lower],
                         'ratio': ratios})

# 그룹분류

In [227]:
# 2) Flag the targeted groups in new columns of the DataFrame.
# group1: every row above the median on both manufacturing measures (first quadrant);
# group2: the rows of group1 whose `transport` value equals 1.
above_median_firms = sigungu_nxy['제조업 업체수'] > sigungu_nxy['제조업 업체수'].median()
above_median_workers = sigungu_nxy['제조업 종사자수'] > sigungu_nxy['제조업 종사자수'].median()
cond = above_median_firms & above_median_workers
# `&` binds tighter than `==`, so the comparison needs its own parentheses;
# without them the expression is evaluated as (cond & transport) == 1.
in_group2 = cond & (sigungu_nxy.transport == 1)
# Row labels of each group.
group1_index = sigungu_nxy.loc[cond].index
group2_index = sigungu_nxy.loc[in_group2].index
# Membership stored as 0.0 / 1.0 floats, the same dtype the zero-filled columns had.
sigungu_nxy['group1'] = cond.astype(float)
sigungu_nxy['group2'] = in_group2.astype(float)

In [228]:
# Split group1 / group2 into their own DataFrames.
# .copy() makes them independent of sigungu_nxy, so renaming columns or adding
# columns to them afterwards does not trigger SettingWithCopyWarning.
group1 = sigungu_nxy.loc[sigungu_nxy.group1 == 1].copy()
group2 = sigungu_nxy.loc[sigungu_nxy.group2 == 1].copy()

# 회귀식

In [229]:
import statsmodels.api as sm

In [230]:
# Fit drunk-driving accidents on all traffic accidents over every row, then
# count how many group2 rows lie on/above and below that line.
total_regression(
    x_='교통사고 발생건수',
    y_='음주교통사고 발생건수',
    totaldf=sigungu_nxy,
    groupdf=group2,
)

LinregressResult(slope=0.1338744307398634, intercept=19.903327511470827, rvalue=0.6445062175128073, pvalue=8.339258322897315e-28, stderr=0.010635351767405677)
Rsquared= 0.41538826441266613


Unnamed: 0,Unnamed: 1,count,ratio
0,total,40,1.0
1,upper,28,0.7
2,lower,12,0.3


In [231]:
# Column names containing spaces or parentheses cannot be written directly in
# the formula strings below, so they are replaced by compact names.
# The frame is rebound instead of renamed in place: group2 was taken as a
# slice of sigungu_nxy, and an in-place rename raises SettingWithCopyWarning.
group2 = group2.rename(columns={'음주교통사고 발생비율': '음주교통사고발생비율',
                                '음주교통사고 발생건수': '음주교통사고발생건수',
                                '주민등록인구(여자)': '주민등록인구여자',
                                '제조업 업체수': '제조업업체수',
                                '제조업 종사자수': '제조업종사자수',
                                '교통사고 발생건수': '교통사고발생건수'})
# NOTE: this first fit is overwritten by the next assignment and never inspected.
reg = sm.OLS.from_formula("음주교통사고발생비율 ~ 제조업비율+성비+외국인비율", group2).fit()
reg = sm.OLS.from_formula("음주교통사고발생건수 ~ 제조업업체수+주민등록인구여자+외국인수", group2).fit()
reg.summary()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



0,1,2,3
Dep. Variable:,음주교통사고발생건수,R-squared:,0.329
Model:,OLS,Adj. R-squared:,0.273
Method:,Least Squares,F-statistic:,5.874
Date:,"Wed, 04 Dec 2019",Prob (F-statistic):,0.00226
Time:,16:21:49,Log-Likelihood:,-174.86
No. Observations:,40,AIC:,357.7
Df Residuals:,36,BIC:,364.5
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-445.5705,192.752,-2.312,0.027,-836.490,-54.651
제조업업체수,0.0613,0.028,2.221,0.033,0.005,0.117
주민등록인구여자,0.0971,0.038,2.540,0.016,0.020,0.175
외국인수,0.0344,0.024,1.452,0.155,-0.014,0.083

0,1,2,3
Omnibus:,35.937,Durbin-Watson:,2.025
Prob(Omnibus):,0.0,Jarque-Bera (JB):,120.741
Skew:,2.092,Prob(JB):,6.0400000000000005e-27
Kurtosis:,10.412,Cond. No.,301000.0


In [232]:
# Compact replacements for column names that contain spaces or parentheses,
# so the formula strings below can reference them.
formula_safe_names = {
    '음주교통사고 발생비율': '음주교통사고발생비율',
    '제조업 업체수': '제조업업체수',
    '제조업 종사자수': '제조업종사자수',
    '교통사고 발생건수': '교통사고발생건수',
    '교통사고 사망자수': '교통사고사망자수',
    '교통사고 부상자수': '교통사고부상자수',
    '의료보장 사업장수': '의료보장사업장수',
    '음주교통사고 발생건수': '음주교통사고발생건수',
    '주민등록인구(여자)': '주민등록인구여자',
    '1인가구수': '일인가구수',
    '학교수(중학교)': '중학교수',
    '음식점 및 주점업 종사자수': '음식점및주점업종사자수',
    '음식점 및 주점업 업체수': '음식점및주점업업체수',
    '지역안전도(점수)': '지역안전도',
}
sigungu_nxy = sigungu_nxy.rename(columns=formula_safe_names)
# Each model is fitted in turn on the full table; only the last one stays
# bound to `reg` and is summarised.
for formula in ("음주교통사고발생비율 ~ 제조업업체수+성비+외국인수",
                "음주교통사고발생비율 ~ 제조업비율+성비+외국인비율",
                "음주교통사고발생건수 ~ 제조업업체수+주민등록인구여자+외국인수"):
    reg = sm.OLS.from_formula(formula, sigungu_nxy).fit()
reg.summary()

0,1,2,3
Dep. Variable:,음주교통사고발생건수,R-squared:,0.207
Model:,OLS,Adj. R-squared:,0.196
Method:,Least Squares,F-statistic:,19.23
Date:,"Wed, 04 Dec 2019",Prob (F-statistic):,4.06e-11
Time:,16:21:49,Log-Likelihood:,-905.26
No. Observations:,225,AIC:,1819.0
Df Residuals:,221,BIC:,1832.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-87.2742,38.859,-2.246,0.026,-163.856,-10.692
제조업업체수,0.0724,0.013,5.666,0.000,0.047,0.098
주민등록인구여자,0.0253,0.008,3.276,0.001,0.010,0.040
외국인수,0.0069,0.007,1.039,0.300,-0.006,0.020

0,1,2,3
Omnibus:,113.389,Durbin-Watson:,1.699
Prob(Omnibus):,0.0,Jarque-Bera (JB):,985.456
Skew:,1.753,Prob(JB):,1.03e-214
Kurtosis:,12.634,Cond. No.,213000.0


In [233]:
# Column names containing spaces or parentheses cannot be written directly in
# the formula strings below, so they are replaced by compact names.
# The frame is rebound instead of renamed in place: group1 was taken as a
# slice of sigungu_nxy, and an in-place rename raises SettingWithCopyWarning.
group1 = group1.rename(columns={'음주교통사고 발생비율': '음주교통사고발생비율',
                                '제조업 업체수': '제조업업체수',
                                '제조업 종사자수': '제조업종사자수',
                                '교통사고 발생건수': '교통사고발생건수',
                                '교통사고 사망자수': '교통사고사망자수',
                                '교통사고 부상자수': '교통사고부상자수',
                                '의료보장 사업장수': '의료보장사업장수',
                                '1인가구수': '일인가구수',
                                '학교수(중학교)': '중학교수',
                                '음식점 및 주점업 종사자수': '음식점및주점업종사자수',
                                '음식점 및 주점업 업체수': '음식점및주점업업체수',
                                '음주교통사고 발생건수': '음주교통사고발생건수',
                                '주민등록인구(여자)': '주민등록인구여자',
                                '지역안전도(점수)': '지역안전도'})
# NOTE: the first two fits are overwritten before being inspected; only the
# last model is summarised.
reg = sm.OLS.from_formula("음주교통사고발생비율 ~ 제조업업체수+성비+외국인수", group1).fit()
reg = sm.OLS.from_formula("음주교통사고발생비율 ~ 제조업비율+성비+외국인비율", group1).fit()
reg = sm.OLS.from_formula("음주교통사고발생건수 ~ 제조업업체수+주민등록인구여자+외국인수", group1).fit()
reg.summary()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



0,1,2,3
Dep. Variable:,음주교통사고발생건수,R-squared:,0.257
Model:,OLS,Adj. R-squared:,0.23
Method:,Least Squares,F-statistic:,9.248
Date:,"Wed, 04 Dec 2019",Prob (F-statistic):,2.54e-05
Time:,16:21:49,Log-Likelihood:,-350.23
No. Observations:,84,AIC:,708.5
Df Residuals:,80,BIC:,718.2
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-349.3191,120.848,-2.891,0.005,-589.815,-108.824
제조업업체수,0.0612,0.018,3.325,0.001,0.025,0.098
주민등록인구여자,0.0777,0.024,3.233,0.002,0.030,0.126
외국인수,0.0205,0.012,1.662,0.100,-0.004,0.045

0,1,2,3
Omnibus:,66.437,Durbin-Watson:,1.644
Prob(Omnibus):,0.0,Jarque-Bera (JB):,505.195
Skew:,2.315,Prob(JB):,1.99e-110
Kurtosis:,14.086,Cond. No.,343000.0


In [234]:
# String copies of the grade / group flags (presumably so that plotly colours
# them as discrete categories -- they are passed as `color=` to scatter()).
sigungu_nxy['transport_'] = sigungu_nxy.transport.astype(str)
sigungu_nxy['group1_'] = sigungu_nxy.group1.astype(str)
sigungu_nxy['group2_'] = sigungu_nxy.group2.astype(str)
# assign() returns new frames instead of writing into slices of sigungu_nxy,
# which is what raised SettingWithCopyWarning here.
group1 = group1.assign(transport_=group1.transport.astype(str),
                       group1_=group1.group1.astype(str))
group2 = group2.assign(transport_=group2.transport.astype(str),
                       group2_=group2.group2.astype(str))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stab

In [235]:
### 1. Group the population that the hypothesis test targets
# 1) The two variables and their medians split the plane into four quadrants.
scatter(
    x="제조업업체수",
    y="제조업종사자수",
    color="transport_",
    df=sigungu_nxy,
    medianline=True,
    hovername='지역',
    trendline='ols',
)


Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead.



In [236]:
# 1) Scatter plot to check how the targeted regions are distributed around the regression line.
scatter(
    x="교통사고발생건수",
    y="음주교통사고발생건수",
    color="group2_",
    df=sigungu_nxy,
    medianline=True,
    palette=darkorange,
    hovername='지역',
    trendline='ols',
)


Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead.



In [237]:
# Variables most strongly correlated with the drunk-driving accident ratio.
# Numeric/bool columns are selected explicitly: from pandas 2.0 on, corr() no
# longer drops text columns by default and raises on them instead.
numeric_columns = sigungu_nxy.select_dtypes(include=['number', 'bool'])
numeric_columns.corr(method='pearson').loc[:, '음주교통사고발생비율'].sort_values(ascending=False).head(30)

음주교통사고발생비율               1.000000
독거노인수                    0.773474
고령인구수                    0.729859
재난약자수                    0.725344
주민등록인구(60세이상)            0.719405
장애인수                     0.710104
중학교수                     0.683018
경찰관서수                    0.673253
공무원수(정원)                 0.663760
기초수급자수(65세이상)            0.663001
노인 교통사고 사망자수             0.659646
감염병 사망자수                 0.657864
질병이환 및 사망외인으로 인한 사망자수    0.655797
건강보험급여실적                 0.652297
일인가구수                    0.631732
교통사고사망자수                 0.628827
학교수(초등학교)                0.606483
기초수급자수                   0.573017
학교수(고등학교)                0.542116
건설업 업체수                  0.536210
빈집수                      0.534480
추락 사망자수                  0.483413
자연재해위험개선지구 수             0.471563
구조구급대원수                  0.468498
화재 발생건수                  0.450371
임야 면적                    0.444690
노인 교통사고 부상자수             0.420745
감염병                      0.399138
행정구역 면적                  0.389994
익사자수          