# 모듈 불러오기

#### 기본

In [1]:
import numpy as np
import pandas as pd
import joblib

In [2]:
# Optional one-time environment setup, kept commented out so it does not run
# on every execution. Uncomment a line to install that package.
# !conda install numpy 
# !conda install pandas
# !conda install scikit-learn
# !conda install scipy
# !conda install tensorflow
# !conda install matplotlib
# !conda install seaborn

# !pip install xgboost
# !pip install lightgbm
# !pip install catboost
# !pip install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension
# !pip install hyperopt
# !pip install -U imbalanced-learn
# !pip install missingno

#### 전처리

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance

from sklearn import impute
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer

#### 리샘플링

In [4]:
from imblearn.over_sampling import (
    RandomOverSampler, 
    ADASYN, 
    SMOTE
)
from imblearn.under_sampling import (
    RandomUnderSampler, 
    TomekLinks, 
    CondensedNearestNeighbour, 
    OneSidedSelection, 
    EditedNearestNeighbours, 
    NeighbourhoodCleaningRule
)

#### 분석

In [5]:
from scipy.stats import skew, kurtosis
from scipy.stats import ttest_ind, f_oneway, pearsonr

#### 회귀

In [6]:
from sklearn.linear_model import LinearRegression as RL
from sklearn.neighbors import KNeighborsRegressor as KNNR
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor as RFR
from xgboost import XGBRegressor as XGBR
from lightgbm import LGBMRegressor as LGBMR
from catboost import CatBoostRegressor as CBR

from lightgbm import plot_importance as lgbm_plot_importance
from xgboost import plot_importance as xgb_plot_importance
from catboost import Pool

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score

#### 분류

In [7]:
from sklearn.linear_model import LogisticRegression as LR
from sklearn.neighbors import KNeighborsClassifier as KNNC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RFC
from xgboost import XGBClassifier as XGBC
from lightgbm import LGBMClassifier as LGBMC
from catboost import CatBoostClassifier as CBC

from sklearn.metrics import confusion_matrix as cmatrix
from sklearn.metrics import classification_report as creport
from sklearn.metrics import recall_score as recall
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer

#### 교차검증

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from catboost import cv
import hyperopt

from sklearn.model_selection import (
    StratifiedKFold, # 분류
    KFold, # 회귀
    # GroupKFold, 
    # RepeatedKFold, 
    # StratifiedGroupKFold, 
    # RepeatedStratifiedKFold
)

#### 시각화

In [9]:
import matplotlib.pyplot as plt
import seaborn as sns
# Show matplotlib figures inline in the notebook output.
%matplotlib inline
# Set the matplotlib font family (presumably chosen so Korean text renders).
plt.rc('font', family='Malgun Gothic')
# sns.set() takes its own `font` argument, so the family is passed here as well.
sns.set(font="Malgun Gothic",
        rc={"axes.unicode_minus":False}, # fix broken minus-sign rendering
        style='darkgrid')  

import warnings
# NOTE(review): this hides every warning for the rest of the session,
# including pandas chained-assignment warnings that the preprocessing cells
# below could raise - consider narrowing the filter.
warnings.filterwarnings(action='ignore')
# Ask the inline backend for 'retina' figures.
%config InlineBackend.figure_format = 'retina'

# 데이터 로드

In [10]:
# Read the '_fix' workbook into df_all_origin.
# NOTE(review): the preprocessing section later writes a file with this same
# name (drop + rename of the raw columns), so this cell presumably re-loads
# that result - confirm.
folder_name = '건물'
file_name = 'GIS건물통합정보_fix'

df_all_origin = pd.read_excel(
    f'./data/{folder_name}/{file_name}.xlsx',
    engine='openpyxl',
)

In [11]:
# Read the Gangseo-gu workbook into df_origin.
# NOTE(review): the export at the end of this notebook writes
# '강서구_GIS건물통합정보.xlsx', which is a different file name from the one
# read here - confirm which file is the intended input.
folder_name = '건물'
file_name = 'GIS건물통합정보_강서구'

df_origin = pd.read_excel(
    f'./data/{folder_name}/{file_name}.xlsx',
    engine='openpyxl',
)

In [12]:
# Work on a copy so df_origin keeps the data exactly as loaded.
# NOTE(review): per the df.info() output saved below, this frame already has
# the Korean column names, while the later drop/rename cells refer to raw
# names 'A0'..'A22'. Running the notebook top to bottom would presumably
# raise KeyError at the drop - confirm which input those cells expect.
df = df_origin.copy()

# 데이터 이해

In [13]:
# Print column names, non-null counts, dtypes and memory usage of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28900 entries, 0 to 28899
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Unnamed: 0  28900 non-null  int64         
 1   법정동명        28900 non-null  object        
 2   지번          28900 non-null  object        
 3   용도명         28900 non-null  object        
 4   구조명         28900 non-null  object        
 5   면적          28900 non-null  float64       
 6   사용승인일자      28900 non-null  datetime64[ns]
 7   연면적         28900 non-null  float64       
 8   대지면적        28900 non-null  float64       
 9   높이          28900 non-null  float64       
 10  건폐율         28900 non-null  float64       
 11  용적율         28900 non-null  float64       
 12  사용승인년도      28900 non-null  int64         
dtypes: datetime64[ns](1), float64(6), int64(2), object(4)
memory usage: 2.9+ MB


In [14]:
# Number of missing values in each column of df.
df.isnull().sum()

Unnamed: 0    0
법정동명          0
지번            0
용도명           0
구조명           0
면적            0
사용승인일자        0
연면적           0
대지면적          0
높이            0
건폐율           0
용적율           0
사용승인년도        0
dtype: int64

# 전처리

In [None]:
# Raw columns that are not carried into the cleaned dataset.
drop_col = [
    'A0', 'A1', 'A2', 'A3',
    'A6', 'A7', 'A8',
    'A10',
    'A19', 'A20', 'A21', 'A22',
]

# Remove them from df in place; a missing label raises KeyError.
df.drop(columns=drop_col, inplace=True)

In [None]:
# Mapping from the remaining raw column codes to their Korean column names.
rename_col = {
    'A4': '법정동명',
    'A5': '지번',
    'A9': '용도명',
    'A11': '구조명',
    'A12': '면적',
    'A13': '사용승인일자',
    'A14': '연면적',
    'A15': '대지면적',
    'A16': '높이',
    'A17': '건폐율',
    'A18': '용적율',
}

# Apply the mapping to the column labels of df in place.
df.rename(columns=rename_col, inplace=True)

In [None]:
# Write the column-reduced, renamed frame to '<file_name>_fix.xlsx' without
# the index column.
folder_name = '건물'
file_name = 'GIS건물통합정보'

df.to_excel(
    f'./data/{folder_name}/{file_name}_fix.xlsx',
    index=False,
    engine='openpyxl',
)

In [None]:
# Keep only the rows whose '법정동명' value contains '강서구' (Gangseo-gu).
# na=False treats a missing name as "no match", so the boolean mask never
# contains NaN (which .loc would reject). .copy() makes gangeo_df an
# independent frame, so the in-place edits in the following cells cannot
# touch df or trigger chained-assignment warnings.
gangeo_df = df.loc[df['법정동명'].str.contains('강서구', na=False), :].copy()

In [None]:
# Renumber the rows from 0 and discard the old index labels.
gangeo_df = gangeo_df.reset_index(drop=True)

In [None]:
# Non-null entries per column (the same numbers DataFrame.count() reports).
gangseo_count = gangeo_df.notna().sum()
gangseo_count

법정동명      28900
지번        28900
용도명       23605
구조명       23604
면적        28900
사용승인일자    24092
연면적       28900
대지면적      28900
높이        28900
건폐율       28900
용적율       28900
dtype: int64

In [None]:
# Missing entries per column.
# NOTE(review): the saved output below is all zeros although the count cell
# above reports fewer non-null values for three columns; presumably this
# cell was last executed after the fill step - re-run in order to confirm.
gangseo_isna = gangeo_df.isnull().sum()
gangseo_isna

법정동명      0
지번        0
용도명       0
구조명       0
면적        0
사용승인일자    0
연면적       0
대지면적      0
높이        0
건폐율       0
용적율       0
dtype: int64

In [None]:
# Replace missing usage / structure names with the placeholder '미상'
# ("unknown") and missing approval dates with the sentinel '1900-01-01'.
# The filled column is assigned back instead of calling
# fillna(..., inplace=True) on the selected column: that form is chained
# assignment through an in-place method, which pandas deprecates and which
# leaves the DataFrame unchanged when Copy-on-Write is enabled.
gangeo_df['용도명'] = gangeo_df['용도명'].fillna('미상')
gangeo_df['구조명'] = gangeo_df['구조명'].fillna('미상')
gangeo_df['사용승인일자'] = gangeo_df['사용승인일자'].fillna('1900-01-01')

In [None]:
# Rows whose approval-date text contains '200205' are set to '2002-05-01'.
has_200205 = gangeo_df['사용승인일자'].str.contains('200205')
gangeo_df.loc[has_200205, '사용승인일자'] = '2002-05-01'

In [None]:
# Rows whose approval-date text contains '199710' are set to '1997-10-01'.
has_199710 = gangeo_df['사용승인일자'].str.contains('199710')
gangeo_df.loc[has_199710, '사용승인일자'] = '1997-10-01'

In [None]:
# Rows whose approval-date text contains '200212' are set to '2002-12-01'.
has_200212 = gangeo_df['사용승인일자'].str.contains('200212')
gangeo_df.loc[has_200212, '사용승인일자'] = '2002-12-01'

In [None]:
# Values shorter than 5 characters (presumably bare years - confirm) get
# '-01-01' appended.
year_only = gangeo_df['사용승인일자'].str.len() < 5
padded_year = gangeo_df.loc[year_only, '사용승인일자'] + '-01-01'
gangeo_df.loc[year_only, '사용승인일자'] = padded_year

In [None]:
# Show the approval-date values that are still shorter than 7 characters.
shorter_than_7 = gangeo_df['사용승인일자'].str.len() < 7
gangeo_df.loc[shorter_than_7, '사용승인일자']

3159     199510
3160     199510
3293     199512
17871    199410
17995    199304
17996    199304
19455    200111
26929    199406
Name: 사용승인일자, dtype: object

In [None]:
# Values shorter than 7 characters are rebuilt as
# '<first 4 chars>-<remaining chars>-01' (e.g. '199510' -> '1995-10-01').
compact = gangeo_df['사용승인일자'].str.len() < 7
compact_text = gangeo_df.loc[compact, '사용승인일자']
gangeo_df.loc[compact, '사용승인일자'] = (
    compact_text.str.slice(0, 4) + '-' + compact_text.str.slice(4) + '-01'
)

In [None]:
# Every value still shorter than 8 characters is overwritten with '2004-10-04'.
# NOTE(review): hard-coded date; presumably targets one specific malformed
# record in this dataset - confirm before reusing on other data.
under_8_chars = gangeo_df['사용승인일자'].str.len() < 8
gangeo_df.loc[under_8_chars, '사용승인일자'] = '2004-10-04'

In [None]:
# For values containing '-00', keep the first 8 characters and append '01'.
# NOTE(review): presumably meant for a '00' day part; a '00' month would
# also match and stay '00' - verify against the data.
has_zero_part = gangeo_df['사용승인일자'].str.contains('-00')
date_prefix = gangeo_df.loc[has_zero_part, '사용승인일자'].str.slice(0, 8)
gangeo_df.loc[has_zero_part, '사용승인일자'] = date_prefix + '01'

In [None]:
# Parse the cleaned 'YYYY-MM-DD' text into datetime values.
approval_dates = pd.to_datetime(gangeo_df['사용승인일자'], format='%Y-%m-%d')
gangeo_df['사용승인일자'] = approval_dates

In [None]:
# Print column names, non-null counts and dtypes of the cleaned subset.
gangeo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28900 entries, 0 to 28899
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   법정동명    28900 non-null  object        
 1   지번      28900 non-null  object        
 2   용도명     28900 non-null  object        
 3   구조명     28900 non-null  object        
 4   면적      28900 non-null  float64       
 5   사용승인일자  28900 non-null  datetime64[ns]
 6   연면적     28900 non-null  float64       
 7   대지면적    28900 non-null  float64       
 8   높이      28900 non-null  float64       
 9   건폐율     28900 non-null  float64       
 10  용적율     28900 non-null  float64       
dtypes: datetime64[ns](1), float64(6), object(4)
memory usage: 2.4+ MB


# 파일 저장

In [None]:
# Write the cleaned Gangseo-gu subset to an Excel workbook without the index
# column.
folder_name = '건물'
file_name = '강서구_GIS건물통합정보'

gangeo_df.to_excel(
    f'./data/{folder_name}/{file_name}.xlsx',
    index=False,
    engine='openpyxl',
)