In [71]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt

import wordcloud
from wordcloud import WordCloud
import json
import folium
%matplotlib inline
# 한글 설정
# pip install koreanize_matplotlib
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)
root = 'C:/workspace/python/project/data/'

# 전처리 완료한 데이터파일 이동경로
pre_root = 'C:/workspace/python/project/data/_전처리/'

# 구글드라이브 : https://drive.google.com/drive/folders/1zIzm1o8-3uxcWSU2DoWpB8aV0Oxdfz_P?usp=sharing

In [72]:
'''
                         /////--     데이터 불러오기     --/////
'''
# 공원
park_2021_df = pd.read_csv(pre_root + '공원_Data/전처리_공원수_2021.csv', encoding = 'cp949')
park_2022_df = pd.read_csv(pre_root + '공원_Data/전처리_공원수_2022.csv', encoding = 'cp949')
park_2023_df = pd.read_csv(pre_root + '공원_Data/전처리_공원수_2023.csv', encoding = 'cp949')

# 교통
## 버스
bus_2021_df = pd.read_csv(pre_root + '교통_Data/전처리_버스수_2021.csv', encoding = 'cp949')
bus_2022_df = pd.read_csv(pre_root + '교통_Data/전처리_버스수_2022.csv', encoding = 'cp949')
bus_2023_df = pd.read_csv(pre_root + '교통_Data/전처리_버스수_2023.csv', encoding = 'cp949')
## 지하철
train_2021_df = pd.read_csv(pre_root + '교통_Data/전처리_지하철수_2021.csv', encoding = 'cp949')
train_2022_df = pd.read_csv(pre_root + '교통_Data/전처리_지하철수_2022.csv', encoding = 'cp949')
train_2023_df = pd.read_csv(pre_root + '교통_Data/전처리_지하철수_2023.csv', encoding = 'cp949')

# 부동산
## 서울시 집값
house_2021_df = pd.read_csv(pre_root + '부동산_Data/전처리_부동산_2021.csv', encoding = 'cp949')
house_2022_df = pd.read_csv(pre_root + '부동산_Data/전처리_부동산_2022.csv', encoding = 'cp949')
house_2023_df = pd.read_csv(pre_root + '부동산_Data/전처리_부동산_2023.csv', encoding = 'cp949')
## 개발계획
develop_2021_df = pd.read_csv(pre_root + '부동산_Data/전처리_개발계획_2021.csv', encoding = 'cp949')
develop_2022_df = pd.read_csv(pre_root + '부동산_Data/전처리_개발계획_2022.csv', encoding = 'cp949')
develop_2023_df = pd.read_csv(pre_root + '부동산_Data/전처리_개발계획_2023.csv', encoding = 'cp949')

# 수요공급
demandSupply_2021_df = pd.read_csv(pre_root + '수요공급_Data/전처리_수요공급지수_2021.csv', encoding = 'cp949')
demandSupply_2022_df = pd.read_csv(pre_root + '수요공급_Data/전처리_수요공급지수_2022.csv', encoding = 'cp949')
demandSupply_2023_df = pd.read_csv(pre_root + '수요공급_Data/전처리_수요공급지수_2023.csv', encoding = 'cp949')

# 유통업체
distribute_2021_df = pd.read_csv(pre_root + '유통업체_Data/전처리_유통업체_2021.csv', encoding = 'cp949')
distribute_2022_df = pd.read_csv(pre_root + '유통업체_Data/전처리_유통업체_2022.csv', encoding = 'cp949')
distribute_2023_df = pd.read_csv(pre_root + '유통업체_Data/전처리_유통업체_2023.csv', encoding = 'cp949')

# 의료기관
hospital_2021_df = pd.read_csv(pre_root + '의료기관_Data/전처리_병원수_2021.csv', encoding = 'cp949')
hospital_2022_df = pd.read_csv(pre_root + '의료기관_Data/전처리_병원수_2022.csv', encoding = 'cp949')
hospital_2023_df = pd.read_csv(pre_root + '의료기관_Data/전처리_병원수_2023.csv', encoding = 'cp949')

# 인구수
population_2021_df = pd.read_csv(pre_root + '인구수_Data/전처리_인구수_2021.csv', encoding = 'cp949')
population_2022_df = pd.read_csv(pre_root + '인구수_Data/전처리_인구수_2022.csv', encoding = 'cp949')
population_2023_df = pd.read_csv(pre_root + '인구수_Data/전처리_인구수_2023.csv', encoding = 'cp949')

# 주거실태 (거래량)
volume_2021_df = pd.read_csv(pre_root + '주거실태_Data/전처리_거래량_2021.csv', encoding = 'cp949')
volume_2022_df = pd.read_csv(pre_root + '주거실태_Data/전처리_거래량_2022.csv', encoding = 'cp949')
volume_2023_df = pd.read_csv(pre_root + '주거실태_Data/전처리_거래량_2023.csv', encoding = 'cp949')

In [73]:
'''
                         /////--     데이터 병합 함수     --/////
          공원, 버스, 지하철, 부동산, 개발계획, 수요공급, 유통업체, 의료기관, 인구수, 거래량
'''
def merge_year_dataFrame(park_df, bus_df, train_df, house_df, develop_df, demandSupply_df,
                         distribute_df, hospital_df, population_df, volume_df, year) :
  # 공원
  park_df = park_df[['구', '합계_공원수 (개소)']]
  
  # 버스
  bus_df = bus_df['구'].value_counts().reset_index().rename(columns = {'count' : '버스 수'})
  
  # 지하철
  train_df = train_df[['구', '역개수']]
  
  # 부동산
  house_df = house_df[['구', '평균시세']]
  
  # 개발계획
  develop_df = develop_df[['구', '계_구역수 (개)']].rename(columns = {'계_구역수 (개)' : '개발계획_합계'})
  
  # 수요공급
  demandSupply_df = round(demandSupply_df.groupby('구')['수급등급'].mean(), 2).reset_index().rename(columns = {'수급등급' : '수요공급지수'})
  
  # 유통업체
  distribute_df = distribute_df.iloc[1:, :].reset_index(drop = True)
  condition = distribute_df.columns.str.contains('_개소') # 원하는 컬럼만 조회
  condition[0] = True
  distribute_df = distribute_df.loc[:, condition]
  column_names = {} # 컬럼 이름 변경용 딕셔너리
  columns = distribute_df.columns
  for column in columns :
    column_names[column] = column.split('_개소')[0]
  distribute_df = distribute_df.rename(columns = column_names)
  distribute_df = distribute_df.replace('-', 0)
  
  # 의료기관
  hospital_df = hospital_df[['구', '소계_병원수']].rename(columns = {'소계_병원수' : '병원수'})
  hospital_df = hospital_df.iloc[1:, :].reset_index(drop = True)
  
  # 인구수
  population_df
  
  # 주거실태 (거래량)
  volume_df = volume_df.drop(columns = '년도').rename(columns = {'동(호)수' : '거래량'})[['구', '거래량']]

  ### 데이터 병합
  merge_df = house_df.merge(park_df, on = '구').merge(bus_df, on = '구').merge(train_df, on = '구')
  merge_df = merge_df.merge(develop_df, on = '구').merge(demandSupply_df, on = '구')
  merge_df = merge_df.merge(distribute_df, on = '구').merge(hospital_df, on = '구')
  merge_df = merge_df.merge(population_df, on = '구').merge(volume_df, on = '구')

  merge_df['연도'] = year

  return merge_df

                    #######                            #######
                    ##  데이터프레임 데이터 타입 변환 함수  ##
                    #######                            #######
def df_to_float(df) : 
  df['대형마트'] = df['대형마트'].astype(float)
  df['백화점'] = df['백화점'].astype(float)
  df['전문점'] = df['전문점'].astype(float)
  df['쇼핑센터'] = df['쇼핑센터'].astype(float)
  df['복합쇼핑몰'] = df['복합쇼핑몰'].astype(float)
  return df

In [74]:
'''
                              /////--     데이터 병합     --/////
          공원, 버스, 지하철, 부동산, 개발계획, 수요공급, 유통업체, 의료기관, 인구수, 거래량
'''
merge_year_2021 = merge_year_dataFrame(park_2021_df, bus_2021_df, train_2021_df, house_2021_df,
                                       develop_2021_df, demandSupply_2021_df, distribute_2021_df,
                                       hospital_2021_df, population_2021_df, volume_2021_df,
                                       year = 2021)

merge_year_2022 = merge_year_dataFrame(park_2022_df, bus_2022_df, train_2022_df, house_2022_df,
                                       develop_2022_df, demandSupply_2022_df, distribute_2022_df,
                                       hospital_2022_df, population_2022_df, volume_2022_df,
                                       year = 2022)

merge_year_2023 = merge_year_dataFrame(park_2023_df, bus_2023_df, train_2023_df, house_2023_df,
                                       develop_2023_df, demandSupply_2023_df, distribute_2023_df,
                                       hospital_2023_df, population_2023_df, volume_2023_df,
                                       year = 2023)

In [75]:
ai_concat = pd.concat([merge_year_2021, merge_year_2022, merge_year_2023], ignore_index = True)
ai_concat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 20 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   구            75 non-null     object 
 1   평균시세         75 non-null     float64
 2   합계_공원수 (개소)  75 non-null     int64  
 3   버스 수         75 non-null     int64  
 4   역개수          75 non-null     int64  
 5   개발계획_합계      75 non-null     int64  
 6   수요공급지수       75 non-null     float64
 7   합계           75 non-null     int64  
 8   대형마트         75 non-null     object 
 9   백화점          75 non-null     object 
 10  전문점          75 non-null     object 
 11  쇼핑센터         75 non-null     object 
 12  복합쇼핑몰        75 non-null     object 
 13  그밖의 대규모점포    75 non-null     int64  
 14  병원수          75 non-null     int64  
 15  총인구          75 non-null     int64  
 16  내국인-계        75 non-null     int64  
 17  외국인-계        75 non-null     int64  
 18  거래량          75 non-null     int64  
 19  연도        

In [76]:
# 특정 데이터 타입 실수형으로 전환
ai_concat = df_to_float(ai_concat)

In [77]:
# 평균시세 값을 천만원 단위로 변환
# 평균시세 값이 너무 크면 오차도 너무 크게 나오기 때문
ai_concat['평균시세'] = round(ai_concat['평균시세'] / 10000, 6)

# 컬럼명을 '평균시세(천만원)'으로 변경
ai_concat.rename(columns={'평균시세': '평균시세(천만원)'}, inplace=True)
ai_concat.rename(columns={'합계_공원수 (개소)': '합계_공원수'}, inplace=True)

In [78]:
ai_concat

Unnamed: 0,구,평균시세(천만원),합계_공원수,버스 수,역개수,개발계획_합계,수요공급지수,합계,대형마트,백화점,전문점,쇼핑센터,복합쇼핑몰,그밖의 대규모점포,병원수,총인구,내국인-계,외국인-계,거래량,연도
0,강남구,13.801004,164,526,21,21,114.34,32,1.0,6.0,0.0,3.0,1.0,21,2790,503019,494171,8848,20748,2021
1,강동구,6.254912,129,396,14,11,114.34,10,3.0,1.0,0.0,0.0,0.0,6,852,451099,444799,6300,13813,2021
2,강북구,3.588932,84,411,3,8,115.90,12,1.0,1.0,0.0,2.0,0.0,8,467,292611,287693,4918,7839,2021
3,강서구,4.036603,173,597,9,20,111.83,15,3.0,1.0,0.0,1.0,0.0,10,887,559837,551470,8367,20483,2021
4,관악구,4.842737,134,466,4,12,111.83,21,1.0,1.0,0.0,1.0,3.0,15,710,495777,473263,22514,8169,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,용산구,15.290378,107,348,10,9,85.18,15,2.0,1.0,8.0,3.0,0.0,1,327,214791,200568,14223,3241,2023
71,은평구,4.966598,136,504,13,13,81.61,16,2.0,0.0,0.0,5.0,0.0,9,709,452988,447341,5647,7692,2023
72,종로구,7.549121,106,388,15,28,85.18,13,0.0,0.0,2.0,0.0,0.0,11,482,146179,133520,12659,2600,2023
73,중구,7.496633,72,224,23,14,85.18,54,2.0,3.0,2.0,9.0,0.0,38,595,127576,115926,11650,3253,2023


In [79]:
# 불필요한 변수 제거
ai_concat = ai_concat.drop(columns=["합계", "총인구"], errors="ignore")

# 파생변수 생성
ai_concat["1인당_공원수"] = ai_concat["합계_공원수"] / ai_concat["내국인-계"]
ai_concat["1인당_병원수"] = ai_concat["병원수"] / ai_concat["내국인-계"]
ai_concat["1인당_유통시설"] = (ai_concat["대형마트"] + ai_concat["백화점"] + ai_concat["전문점"] + ai_concat["쇼핑센터"] + ai_concat["복합쇼핑몰"]) / ai_concat["내국인-계"]

In [166]:
ai_concat

Unnamed: 0,구,평균시세(천만원),합계_공원수,버스 수,역개수,개발계획_합계,수요공급지수,대형마트,백화점,전문점,...,그밖의 대규모점포,병원수,내국인-계,외국인-계,거래량,연도,1인당_공원수,1인당_병원수,1인당_유통시설,증감률
0,강남구,13.801004,164,526,21,21,114.34,1.0,6.0,0.0,...,21,2790,494171,8848,20748,2021,0.000332,0.005646,0.000022,
1,강동구,6.254912,129,396,14,11,114.34,3.0,1.0,0.0,...,6,852,444799,6300,13813,2021,0.000290,0.001915,0.000009,
2,강북구,3.588932,84,411,3,8,115.90,1.0,1.0,0.0,...,8,467,287693,4918,7839,2021,0.000292,0.001623,0.000014,
3,강서구,4.036603,173,597,9,20,111.83,3.0,1.0,0.0,...,10,887,551470,8367,20483,2021,0.000314,0.001608,0.000009,
4,관악구,4.842737,134,466,4,12,111.83,1.0,1.0,0.0,...,15,710,473263,22514,8169,2021,0.000283,0.001500,0.000013,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,용산구,15.290378,107,348,10,9,85.18,2.0,1.0,8.0,...,1,327,200568,14223,3241,2023,0.000533,0.001630,0.000070,-0.059744
71,은평구,4.966598,136,504,13,13,81.61,2.0,0.0,0.0,...,9,709,447341,5647,7692,2023,0.000304,0.001585,0.000016,0.311845
72,종로구,7.549121,106,388,15,28,85.18,0.0,0.0,2.0,...,11,482,133520,12659,2600,2023,0.000794,0.003610,0.000015,0.232566
73,중구,7.496633,72,224,23,14,85.18,2.0,3.0,2.0,...,38,595,115926,11650,3253,2023,0.000621,0.005133,0.000138,0.454230


In [80]:
'''
              ///////    선택 1)   21 ~ 23 년도 데이터를 전부 섞은 후    ///////
                   8 : 2 비율로 무작위로 나눠서 훈련용, 테스트용으로 분리하기
'''
from sklearn.model_selection import train_test_split

# 독립변수와 종속변수 분리
X = ai_concat.drop(columns=['평균시세(천만원)', '구', '연도'])  # 종속변수와 제외할 컬럼 제거
y = ai_concat['평균시세(천만원)']  # 종속변수

# 데이터 분리: 80% 훈련용, 20% 테스트용
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 확인
print("훈련 데이터 크기:", X_train.shape, y_train.shape)
print("테스트 데이터 크기:", X_test.shape, y_test.shape)

훈련 데이터 크기: (60, 18) (60,)
테스트 데이터 크기: (15, 18) (15,)


In [186]:
'''
              ///////    선택 2)   21 ~ 22 년도 데이터를 테스트 용으로    ///////
                            23 년도 데이터를 훈련용으로 분리하기
'''
# 훈련용 데이터 : 2021, 2022
train_data = ai_concat[ai_concat['연도'] < 2023]

# 테스트용 데이터 : 2023
test_data = ai_concat[ai_concat['연도'] == 2023]

# X, y 분리
X_train = train_data.drop(['평균시세(천만원)', '구', '연도'], axis = 1)
y_train = train_data['평균시세(천만원)']

X_test = test_data.drop(['평균시세(천만원)', '구', '연도'], axis = 1)
y_test = test_data['평균시세(천만원)']

# 확인
print("훈련 데이터 크기:", X_train.shape, y_train.shape)
print("테스트 데이터 크기:", X_test.shape, y_test.shape)


훈련 데이터 크기: (50, 19) (50,)
테스트 데이터 크기: (25, 19) (25,)
MSE : 10.397211070932382
R2 Score : 0.3032534610929145


In [188]:
'''
(yg) 선형 회귀 코드             선택 1번
'''


# 독립변수와 종속변수 분리
X = ai_concat.drop(columns=['평균시세(천만원)', '구', '연도'])  # 종속변수와 제외할 컬럼 제거
y = ai_concat['평균시세(천만원)']  # 종속변수

# 데이터 분리: 80% 훈련용, 20% 테스트용
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 선형 회귀
model = LinearRegression()
model.fit(X_train, y_train)
pred = model.predict(X_test)
print("MSE :" , mean_squared_error(y_test, pred))
print("R2 Score :", r2_score(y_test, pred))

MSE : 2.5060485807198734
R2 Score : 0.8012488343129899


In [190]:
'''
(yg) 선형 회귀 코드           선택 2
'''
# 훈련용 데이터 : 2021, 2022
train_data = ai_concat[ai_concat['연도'] < 2023]

# 테스트용 데이터 : 2023
test_data = ai_concat[ai_concat['연도'] == 2023]

# X, y 분리
X_train = train_data.drop(['평균시세(천만원)', '구', '연도'], axis = 1)
y_train = train_data['평균시세(천만원)']

X_test = test_data.drop(['평균시세(천만원)', '구', '연도'], axis = 1)
y_test = test_data['평균시세(천만원)']

# 선형 회귀
model = LinearRegression()
model.fit(X_train, y_train)
pred = model.predict(X_test)
print("MSE :" , mean_squared_error(y_test, pred))
print("R2 Score :", r2_score(y_test, pred))

훈련 데이터 크기: (50, 19) (50,)
테스트 데이터 크기: (25, 19) (25,)
MSE : 10.397211070932382
R2 Score : 0.3032534610929145


In [182]:
# 연도별 집값 증감률 계산 pct_change() = 백분률
ai_concat['증감률'] = ai_concat.groupby('구')['평균시세(천만원)'].pct_change()
ai_concat['증감률'] = ai_concat['증감률'].fillna(0)
display(ai_concat['증감률'])

0     0.000000
1     0.000000
2     0.000000
3     0.000000
4     0.000000
        ...   
70   -0.059744
71    0.311845
72    0.232566
73    0.454230
74    0.010363
Name: 증감률, Length: 75, dtype: float64

In [184]:
gangnam_df = ai_concat[ai_concat['구'] == '강남구']
# 결과 확인
display(gangnam_df)

'''
증감률은 현재값과 이전값의 변화를 계산하므로,
2021년도 증감률데이터는 NaN 으로 나와 fillna(0) 실행
'''

Unnamed: 0,구,평균시세(천만원),합계_공원수,버스 수,역개수,개발계획_합계,수요공급지수,대형마트,백화점,전문점,...,그밖의 대규모점포,병원수,내국인-계,외국인-계,거래량,연도,1인당_공원수,1인당_병원수,1인당_유통시설,증감률
0,강남구,13.801004,164,526,21,21,114.34,1.0,6.0,0.0,...,21,2790,494171,8848,20748,2021,0.000332,0.005646,2.2e-05,0.0
25,강남구,12.369329,165,543,21,21,87.78,1.0,6.0,0.0,...,21,2815,491411,8738,11153,2022,0.000336,0.005728,2.2e-05,-0.103737
50,강남구,18.337926,165,547,21,21,84.83,1.0,6.0,0.0,...,21,2866,502796,8288,7045,2023,0.000328,0.0057,2.2e-05,0.482532


In [192]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# 종속변수, 독립변수 분리 
X = ai_concat[[ '1인당_공원수', '1인당_병원수', '1인당_유통시설']]
y = ai_concat['평균시세(천만원)']

# 데이터 분리: 80% 훈련용, 20% 테스트용
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 선형 회귀
model = LinearRegression()
model.fit(X_train, y_train)
pred = model.predict(X_test)
print("MSE :" , mean_squared_error(y_test, pred))
print("R2 Score :", r2_score(y_test, pred))

MSE : 5.807746261054821
R2 Score : 0.5393958647571614
