In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# 데이터셋을 불러오기 위해 판다스 라이브러리를 불러옵니다
import pandas as pd

train_url = 'https://ds-lecture-data.s3.ap-northeast-2.amazonaws.com/food_inspection_sc23x/food_ins_train.csv'
test_url  = 'https://ds-lecture-data.s3.ap-northeast-2.amazonaws.com/food_inspection_sc23x/food_ins_test.csv'

# train, test 데이터셋을 불러옵니다
train = pd.read_csv(train_url)
test  = pd.read_csv(test_url)

# 데이터셋 확인
assert train.shape == (60000, 17)
assert test.shape  == (20000, 17)

In [3]:
train.head(2)

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Violations,Latitude,Longitude,Location,Inspection Fail
0,2050629,MY SWEET STATION INC,MY SWEET STATION,2327223.0,Restaurant,Risk 1 (High),2511 N LINCOLN AVE,CHICAGO,IL,60614.0,2017-05-18,Canvass,,41.927577,-87.651528,"(-87.65152817242594, 41.92757677830966)",0
1,2078428,OUTTAKES,RED MANGO,2125004.0,Restaurant,Risk 2 (Medium),10 S DEARBORN ST FL,CHICAGO,IL,60603.0,2017-08-14,Canvass,"34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO...",41.881807,-87.629543,"(-87.62954311539407, 41.88180696006542)",0


In [4]:
train.columns

Index(['Inspection ID', 'DBA Name', 'AKA Name', 'License #', 'Facility Type',
       'Risk', 'Address', 'City', 'State', 'Zip', 'Inspection Date',
       'Inspection Type', 'Violations', 'Latitude', 'Longitude', 'Location',
       'Inspection Fail'],
      dtype='object')

In [5]:
for i in range(7):
    train[f'Violation{i}'] = train.Violations.str.split('|').str[i]
    train[f'Violation{i}'] = train[f'Violation{i}'].str[:2]
    train[f'Violation{i}'] = pd.to_numeric(train[f'Violation{i}']).fillna(0)

In [6]:
for i in range(7):
    test[f'Violation{i}'] = test.Violations.str.split('|').str[i]
    test[f'Violation{i}'] = test[f'Violation{i}'].str[:2]
    test[f'Violation{i}'] = pd.to_numeric(test[f'Violation{i}']).fillna(0)

In [7]:
train.columns

Index(['Inspection ID', 'DBA Name', 'AKA Name', 'License #', 'Facility Type',
       'Risk', 'Address', 'City', 'State', 'Zip', 'Inspection Date',
       'Inspection Type', 'Violations', 'Latitude', 'Longitude', 'Location',
       'Inspection Fail', 'Violation0', 'Violation1', 'Violation2',
       'Violation3', 'Violation4', 'Violation5', 'Violation6'],
      dtype='object')

In [8]:
# risk에 대한 feature 변경 
train.loc[(train.Risk =="Risk 1 (High)"), "Risk"] = 3
train.loc[(train.Risk =='Risk 2 (Medium)'), "Risk"] = 2
train.loc[(train.Risk =='Risk 3 (Low)'), "Risk"] = 1
train.loc[(train.Risk =="All"), "Risk"] = 4

test.loc[(train.Risk =="Risk 1 (High)"), "Risk"] = 3
test.loc[(train.Risk =='Risk 2 (Medium)'), "Risk"] = 2
test.loc[(train.Risk =='Risk 3 (Low)'), "Risk"] = 1
test.loc[(train.Risk =="All"), "Risk"] = 4

In [9]:
train.head(2)

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,...,Longitude,Location,Inspection Fail,Violation0,Violation1,Violation2,Violation3,Violation4,Violation5,Violation6
0,2050629,MY SWEET STATION INC,MY SWEET STATION,2327223.0,Restaurant,3,2511 N LINCOLN AVE,CHICAGO,IL,60614.0,...,-87.651528,"(-87.65152817242594, 41.92757677830966)",0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2078428,OUTTAKES,RED MANGO,2125004.0,Restaurant,2,10 S DEARBORN ST FL,CHICAGO,IL,60603.0,...,-87.629543,"(-87.62954311539407, 41.88180696006542)",0,34.0,3.0,4.0,0.0,0.0,0.0,0.0


In [10]:
train["License #"].isnull().sum()

4

In [11]:
train = train.drop(["Inspection ID", "Zip", "Location"], axis=1)
test = test.drop(["Inspection ID", "Zip", "Location"], axis=1)

In [12]:
from category_encoders import OneHotEncoder # 테스트 목적을 위한 import
from category_encoders import TargetEncoder # 테스트목적을 위한 import
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

target = 'Inspection Fail'
features = train.columns.drop([target])

train, val = train_test_split(train, train_size=0.80, test_size=0.2, random_state=42)

X_train = train[features]
y_train = train[target]
X_val = val[features]
y_val = val[target]

X_test = test[features]
y_test = test[target]

In [13]:
from category_encoders import OrdinalEncoder
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

encoder = OrdinalEncoder()
X_train_encoded = encoder.fit_transform(X_train) # 학습데이터
X_val_encoded = encoder.transform(X_val) # 검증데이터

boosting = XGBRegressor(
    n_estimators=1000,
    objective='reg:squarederror', # default
    learning_rate=0.2,
    n_jobs=-1
)

eval_set = [(X_train_encoded, y_train), 
            (X_val_encoded, y_val)]

boosting.fit(X_train_encoded, y_train, 
          eval_set=eval_set,
          early_stopping_rounds=50
         )


[0]	validation_0-rmse:0.42922	validation_1-rmse:0.44614
[1]	validation_0-rmse:0.37537	validation_1-rmse:0.39225
[2]	validation_0-rmse:0.33654	validation_1-rmse:0.35351
[3]	validation_0-rmse:0.30904	validation_1-rmse:0.32614
[4]	validation_0-rmse:0.28862	validation_1-rmse:0.30561
[5]	validation_0-rmse:0.27498	validation_1-rmse:0.29223
[6]	validation_0-rmse:0.26466	validation_1-rmse:0.28230
[7]	validation_0-rmse:0.25712	validation_1-rmse:0.27471
[8]	validation_0-rmse:0.25145	validation_1-rmse:0.26897
[9]	validation_0-rmse:0.24704	validation_1-rmse:0.26607
[10]	validation_0-rmse:0.24423	validation_1-rmse:0.26350
[11]	validation_0-rmse:0.24164	validation_1-rmse:0.26165
[12]	validation_0-rmse:0.23890	validation_1-rmse:0.25910
[13]	validation_0-rmse:0.23683	validation_1-rmse:0.25705
[14]	validation_0-rmse:0.23502	validation_1-rmse:0.25567
[15]	validation_0-rmse:0.23368	validation_1-rmse:0.25465
[16]	validation_0-rmse:0.23265	validation_1-rmse:0.25440
[17]	validation_0-rmse:0.23178	validation

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.2, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=-1, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [14]:
y_pred = boosting.predict(X_val_encoded)
print('R^2', r2_score(y_val, y_pred))

R^2 0.603689303540047


In [15]:
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from category_encoders import OrdinalEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

pipe_rf_tag = make_pipeline(
    TargetEncoder(), 
    SimpleImputer(), 
    RandomForestClassifier(max_depth=6,
                          criterion='entropy',
                          n_jobs=-1,
                          min_samples_leaf=3,
                          random_state=100)
)

pipe_rf_tag.fit(X_train, y_train)
print('rf 훈련세트 정확도', pipe_rf_tag.score(X_train, y_train))
print('rf 훈련세트 f1 score', f1_score(y_train, pipe_rf_tag.predict(X_train)))

rf 훈련세트 정확도 0.9202291666666667
rf 훈련세트 f1 score 0.7959499067412736


In [16]:
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from category_encoders import OrdinalEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.linear_model import Ridge

pipe_rf_ordi = make_pipeline(
    OrdinalEncoder(), 
    SimpleImputer(), 
    RandomForestClassifier(max_depth=6,
                          criterion='entropy',
                          n_jobs=-1,
                          min_samples_leaf=3,
                          random_state=100)
)

pipe_rf_ordi.fit(X_train, y_train)
print('rf 훈련세트 정확도', pipe_rf_ordi.score(X_train, y_train))
print('rf 훈련세트 f1 score', f1_score(y_train, pipe_rf_ordi.predict(X_train)))

rf 훈련세트 정확도 0.8892916666666667
rf 훈련세트 f1 score 0.6697738006462838


In [17]:
pipe_rf_ordi.fit(X_train, y_train)
print('rf 훈련세트 정확도', pipe_rf_ordi.score(X_train, y_train))
print('rf 훈련세트 f1 score', f1_score(y_train, pipe_rf_ordi.predict(X_train)))

rf 훈련세트 정확도 0.8892916666666667
rf 훈련세트 f1 score 0.6697738006462838


In [18]:
print('pipe_rf_tag 훈련 정확도 : ',pipe_rf_tag.score(X_train, y_train))
print('pipe_rf_tag 검증 정확도 : ',pipe_rf_tag.score(X_val, y_val))
print('pipe_rf_ordi 훈련 정확도 : ',pipe_rf_ordi.score(X_train, y_train))
print('pipe_rf_ordi 검증 정확도 : ',pipe_rf_ordi.score(X_val, y_val))

pipe_rf_tag 훈련 정확도 :  0.9202291666666667
pipe_rf_tag 검증 정확도 :  0.8486666666666667
pipe_rf_ordi 훈련 정확도 :  0.8892916666666667
pipe_rf_ordi 검증 정확도 :  0.8695833333333334


In [19]:
print('pipe_rf_tag 훈련 f1 score : ',f1_score(y_train, pipe_rf_tag.predict(X_train)))
print('pipe_rf_tag 검증 f1 score : ',f1_score(y_val, pipe_rf_tag.predict(X_val)))
print('pipe_rf_ordi 훈련 f1 score : ',f1_score(y_train, pipe_rf_ordi.predict(X_train)))
print('pipe_rf_ordi 검증 f1 score : ',f1_score(y_val, pipe_rf_ordi.predict(X_val)))

pipe_rf_tag 훈련 f1 score :  0.7959499067412736
pipe_rf_tag 검증 f1 score :  0.5551200391964722
pipe_rf_ordi 훈련 f1 score :  0.6697738006462838
pipe_rf_ordi 검증 f1 score :  0.5433323606653049
