# 분류

## 서비스 이탈 예측 데이터

1) 범주형 변수들 더미화
2) train test split
3) 랜덤포레스트 분류기 모델 정의, fit, predict
4) 문제에서 제시한 지표로 성능 확인
5) test 데이터로 predict

In [2]:
import pandas as pd

# Load the churn training features, training labels, and test features.
# The generator reads the CSVs in the same order as the unpacking targets.
x_train, y_train, x_test = (
    pd.read_csv(csv_url)
    for csv_url in (
        "https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_train.csv",
        "https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/y_train.csv",
        "https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_test.csv",
    )
)


In [5]:
# Preview the first rows of the churn training features.
x_train.head()

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,15799217,Zetticci,791,Germany,Female,35,7,52436.2,1,1,0,161051.75
1,15748986,Bischof,705,Germany,Male,42,8,166685.92,2,1,1,55313.51
2,15722004,Hsiung,543,France,Female,31,4,138317.94,1,0,0,61843.73
3,15780966,Pritchard,709,France,Female,32,2,0.0,2,0,0,109681.29
4,15636731,Ts'ai,714,Germany,Female,36,1,101609.01,2,1,1,447.73


In [8]:
# Display the churn label frame (CustomerId plus the Exited target).
y_train

Unnamed: 0,CustomerId,Exited
0,15799217,0
1,15748986,0
2,15722004,0
3,15780966,0
4,15636731,0
...,...,...
6494,15702806,0
6495,15674179,0
6496,15790204,1
6497,15690772,0


In [10]:
# Columns that are removed before one-hot encoding (they are not dummified).
drop_col = ['CustomerId', 'Surname']

x_train_drop = x_train.drop(columns=drop_col)
x_test_drop = x_test.drop(columns=drop_col)

# One-hot encode the remaining categorical columns; the target is 'Exited'.
x_train_dummies = pd.get_dummies(x_train_drop)
y = y_train['Exited']

# Align the test dummies to the training schema. Plain column selection
# (x_test_dummies[x_train_dummies.columns]) raises KeyError when a category
# seen in train never occurs in test. reindex instead adds such columns filled
# with 0, drops test-only columns, and enforces the training column order.
x_test_dummies = pd.get_dummies(x_test_drop).reindex(
    columns=x_train_dummies.columns, fill_value=0
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Hold out 33% of the labelled data for validation (seeded for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    x_train_dummies, y, test_size=0.33, random_state=42
)

# fit() returns the estimator itself, so construction and training can be chained.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predictions: use hard labels or positive-class probabilities, whichever the
# problem statement asks for.
predict_train_label, predict_val_label = (
    rf.predict(features) for features in (X_train, X_val)
)
predict_train_proba, predict_val_proba = (
    rf.predict_proba(features)[:, 1] for features in (X_train, X_val)
)

In [13]:
# 성능 평가

from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score, precision_score

# Each row: (train label, validation label, metric function,
#            train predictions, validation predictions).
# Label-based metrics receive the hard predictions; AUC receives probabilities.
metric_rows = [
    ('train accuracy:', 'validation accuracy:', accuracy_score,
     predict_train_label, predict_val_label),
    ('\ntrain f1 score:', 'validation f1 score:', f1_score,
     predict_train_label, predict_val_label),
    ('\ntrain recall score:', 'validation recall score:', recall_score,
     predict_train_label, predict_val_label),
    ('\ntrain precision score:', 'validation precision score:', precision_score,
     predict_train_label, predict_val_label),
    ('\ntrain auc:', 'validation auc:', roc_auc_score,
     predict_train_proba, predict_val_proba),
]

for train_tag, val_tag, metric, train_pred, val_pred in metric_rows:
    print(train_tag, metric(y_train, train_pred))
    print(val_tag, metric(y_val, val_pred))

train accuracy: 1.0
validation accuracy: 0.8657342657342657

train f1 score: 1.0
validation f1 score: 0.5920679886685553

train recall score: 1.0
validation recall score: 0.4543478260869565

train precision score: 1.0
validation precision score: 0.8495934959349594

train auc: 1.0
validation auc: 0.8542568700812798


In [14]:
# Score the test features: hard labels and positive-class probabilities.
predict_test_label, predict_test_proba = (
    rf.predict(x_test_dummies),
    rf.predict_proba(x_test_dummies)[:, 1],
)

## 이직여부 판단 데이터

In [1]:
import pandas as pd

# Load the HR training features, test features, and training labels.
# The generator reads the CSVs in the same order as the unpacking targets.
X_train, X_test, y_train = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/X_train.csv',
        'https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/X_test.csv',
        'https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/y_train.csv',
    )
)

In [2]:
# Preview the first rows of the HR training features.
X_train.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,25298,city_138,0.836,Male,No relevent experience,Full time course,High School,,5,100-500,Pvt Ltd,1,45
1,4241,city_160,0.92,Male,No relevent experience,Full time course,High School,,5,,,1,17
2,24086,city_57,0.866,Male,No relevent experience,no_enrollment,Graduate,STEM,10,,,1,50
3,26773,city_16,0.91,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,>4,135
4,32325,city_143,0.74,,No relevent experience,Full time course,Graduate,STEM,5,,,never,17


In [3]:
# Preview the first rows of the HR labels (enrollee_id plus target).
y_train.head()

Unnamed: 0,enrollee_id,target
0,25298,0.0
1,4241,1.0
2,24086,0.0
3,26773,0.0
4,32325,1.0


In [4]:
# Print the (rows, columns) shape of the train and test features, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(12452, 13)
(6706, 13)


In [7]:
# Per-column count of missing values. Missing values exist, but they are
# deliberately left unhandled in this walkthrough.
print(X_train.isna().sum())

enrollee_id                  0
city                         0
city_development_index       0
gender                    2917
relevent_experience          0
enrolled_university        257
education_level            315
major_discipline          1866
experience                  37
company_size              3852
company_type              3981
last_new_job               273
training_hours               0
dtype: int64


In [8]:
# Number of distinct values per column; used below to spot high-cardinality columns.
X_train.nunique()

enrollee_id               12452
city                        122
city_development_index       92
gender                        3
relevent_experience           2
enrolled_university           3
education_level               5
major_discipline              6
experience                   22
company_size                  8
company_type                  6
last_new_job                  6
training_hours              241
dtype: int64

In [13]:
# Drop the identifier and the columns with too many categories before encoding.
drop_col = ['enrollee_id', 'city', 'experience']
X_train_drop = X_train.drop(columns=drop_col)
X_test_drop = X_test.drop(columns=drop_col)

# One-hot encode the categorical columns.
X_train_dummies = pd.get_dummies(X_train_drop)

# Align the test dummies to the training schema. Plain column selection raises
# KeyError when a category seen in train never occurs in test. reindex adds
# such columns filled with 0, drops test-only columns, and enforces the
# training column order.
X_test_dummies = pd.get_dummies(X_test_drop).reindex(
    columns=X_train_dummies.columns, fill_value=0
)
y_train_target = y_train['target']

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Hold out 33% of the labelled data for validation (seeded for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_dummies, y_train_target, test_size=0.33, random_state=42
)

# fit() returns the estimator itself, so construction and training can be chained.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Hard labels and positive-class probabilities for both splits.
predict_train_label, predict_val_label = (
    rf.predict(features) for features in (X_train, X_val)
)
predict_train_proba, predict_val_proba = (
    rf.predict_proba(features)[:, 1] for features in (X_train, X_val)
)

In [17]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score, precision_score

# Each row: (train label, validation label, metric function,
#            train predictions, validation predictions).
# Label-based metrics receive the hard predictions; AUC receives probabilities.
metric_rows = [
    ('train accuracy:', 'validation accuracy:', accuracy_score,
     predict_train_label, predict_val_label),
    ('\ntrain f1 score:', 'validation f1 score:', f1_score,
     predict_train_label, predict_val_label),
    ('\ntrain recall score:', 'validation recall score:', recall_score,
     predict_train_label, predict_val_label),
    ('\ntrain auc:', 'validation auc:', roc_auc_score,
     predict_train_proba, predict_val_proba),
    ('\ntrain precision score:', 'validation precision score:', precision_score,
     predict_train_label, predict_val_label),
]

for train_tag, val_tag, metric, train_pred, val_pred in metric_rows:
    print(train_tag, metric(y_train, train_pred))
    print(val_tag, metric(y_val, val_pred))

train accuracy: 0.9971229920882282
validation accuracy: 0.7652068126520681

train f1 score: 0.9941832283082889
validation f1 score: 0.46714522363335176

train recall score: 0.9937015503875969
validation recall score: 0.40634005763688763

train auc: 0.9999269937347351
validation auc: 0.7511406713786558

train precision score: 0.9946653734238603
validation precision score: 0.5493506493506494


In [None]:
# Predictions for submission.
predict_test_label = rf.predict(X_test_dummies)

# predict_proba returns one column per class; [:, 1] selects the positive-class
# probability for every row. (The previous [:1] sliced the first *row* instead,
# yielding both class probabilities of a single sample.)
predict_test_proba = rf.predict_proba(X_test_dummies)[:, 1]

## 정시 배송 여부 판단

In [1]:
import pandas as pd

# Load the shipping training features, test features, and training labels.
# The generator reads the CSVs in the same order as the unpacking targets.
X_train, X_test, y_train = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/X_train.csv',
        'https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/X_test.csv',
        'https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/y_train.csv',
    )
)

In [2]:
# Preview the first rows of the shipping training features.
X_train.head()

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms
0,6045,A,Flight,4,3,266,5,high,F,5,1590
1,44,F,Ship,3,1,174,2,low,M,44,1556
2,7940,F,Road,4,1,154,10,high,M,10,5674
3,1596,F,Ship,4,3,158,3,medium,F,27,1207
4,4395,A,Flight,5,3,175,3,low,M,7,4833


In [5]:
# Preview the first rows of the shipping labels (ID plus Reached.on.Time_Y.N).
y_train.head()

Unnamed: 0,ID,Reached.on.Time_Y.N
0,6045,0
1,44,1
2,7940,1
3,1596,1
4,4395,1


In [4]:
# Distinct-value counts followed by missing-value counts, one report per line.
print(X_train.nunique(), X_train.isna().sum(), sep='\n')

ID                     6598
Warehouse_block           5
Mode_of_Shipment          3
Customer_care_calls       6
Customer_rating           5
Cost_of_the_Product     215
Prior_purchases           8
Product_importance        3
Gender                    2
Discount_offered         65
Weight_in_gms          3365
dtype: int64
ID                     0
Warehouse_block        0
Mode_of_Shipment       0
Customer_care_calls    0
Customer_rating        0
Cost_of_the_Product    0
Prior_purchases        0
Product_importance     0
Gender                 0
Discount_offered       0
Weight_in_gms          0
dtype: int64


In [9]:
# Drop the identifier column before one-hot encoding.
drop_col = ['ID']
X_train_drop = X_train.drop(columns=drop_col)
X_test_drop = X_test.drop(columns=drop_col)

X_train_dum = pd.get_dummies(X_train_drop)

# Align the test dummies to the training schema. The author notes that train
# and test produce the same number of dummy columns for this dataset, so no
# extra handling was needed. reindex keeps that case working (it just enforces
# the column order) and additionally zero-fills any category missing from
# test instead of raising KeyError.
X_test_dum = pd.get_dummies(X_test_drop).reindex(
    columns=X_train_dum.columns, fill_value=0
)
y_train_target = y_train['Reached.on.Time_Y.N']

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Hold out 30% of the labelled data for validation (seeded for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_dum, y_train_target, test_size=0.3, random_state=42
)

# fit() returns the estimator itself, so construction and training can be chained.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Hard labels and positive-class probabilities for both splits.
predict_train_label, predict_val_label = (
    rf.predict(features) for features in (X_train, X_val)
)
predict_train_proba, predict_val_proba = (
    rf.predict_proba(features)[:, 1] for features in (X_train, X_val)
)

In [13]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score

# Each row: (train label, validation label, metric function,
#            train predictions, validation predictions).
# Label-based metrics receive the hard predictions; AUC receives probabilities.
metric_rows = [
    ('train accuracy:', 'validation accuracy:', accuracy_score,
     predict_train_label, predict_val_label),
    ('\ntrain f1 score:', 'validation f1 score:', f1_score,
     predict_train_label, predict_val_label),
    ('\ntrain recall score:', 'validation recall score:', recall_score,
     predict_train_label, predict_val_label),
    ('\ntrain precision score:', 'validation precision score:', precision_score,
     predict_train_label, predict_val_label),
    ('\ntrain auc:', 'validation auc:', roc_auc_score,
     predict_train_proba, predict_val_proba),
]

for train_tag, val_tag, metric, train_pred, val_pred in metric_rows:
    print(train_tag, metric(y_train, train_pred))
    print(val_tag, metric(y_val, val_pred))

train accuracy: 1.0
validation accuracy: 0.6434343434343435

train f1 score: 1.0
validation f1 score: 0.6752529898804048

train recall score: 1.0
validation recall score: 0.6252129471890971

train precision score: 1.0
validation precision score: 0.734

train auc: 0.9999999999999999
validation auc: 0.726206454149247


# 회귀

## 학생 성적 예측 데이터

In [2]:
import pandas as pd

# Load the student-score training features, training labels, and test features.
# The generator reads the CSVs in the same order as the unpacking targets.
x_train, y_train, x_test = (
    pd.read_csv(csv_url)
    for csv_url in (
        "https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/X_train.csv",
        "https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/y_train.csv",
        "https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/X_test.csv",
    )
)


In [18]:
# Preview the first rows of the student-score training features.
x_train.head()

Unnamed: 0,StudentID,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,...,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2
0,1714,GP,F,18,U,GT3,T,4,3,other,...,no,4,3,3,1,1,3,0,14,13
1,1254,GP,F,17,U,GT3,T,4,3,health,...,yes,4,4,3,1,3,4,0,13,15
2,1639,GP,F,16,R,GT3,T,4,4,health,...,no,2,4,4,2,3,4,6,10,11
3,1118,GP,M,16,U,GT3,T,4,4,services,...,no,5,3,3,1,3,5,0,15,13
4,1499,GP,M,19,U,GT3,T,3,2,services,...,yes,4,5,4,1,1,4,0,5,0


In [17]:
# Display the student-score label frame (StudentID plus the G3 target).
y_train

Unnamed: 0,StudentID,G3
0,1714,14
1,1254,15
2,1639,11
3,1118,13
4,1499,0
...,...,...
673,1074,14
674,1044,11
675,1078,13
676,1055,10


In [None]:
# Drop the identifier column before one-hot encoding; the target is 'G3'.
drop_col = ['StudentID']

X_train_drop = x_train.drop(columns=drop_col)
X_test_drop = x_test.drop(columns=drop_col)
y_train_target = y_train['G3']

X_train_dum = pd.get_dummies(X_train_drop)

# Align the test dummies to the training schema. Plain column selection raises
# KeyError when a category seen in train never occurs in test. reindex adds
# such columns filled with 0, drops test-only columns, and enforces the
# training column order.
X_test_dum = pd.get_dummies(X_test_drop).reindex(
    columns=X_train_dum.columns, fill_value=0
)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Split with scikit-learn's default test size (seeded for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_dum,
    y_train_target,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training can be chained.
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)

y_val_pred = rf.predict(X_val)

In [10]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score

# There is no dedicated RMSE function imported here, so RMSE is obtained as
# np.sqrt of the mean squared error (which is why numpy is imported).
mse = mean_squared_error(y_val, y_val_pred)

# NOTE(review): G3 contains zeros (see the label preview), and MAPE divides by
# the true value, which presumably explains the enormous MAPE this cell
# reports. Prefer the other metrics for this dataset.
scores = [
    ('validation MSE:', mse),
    ('\nvalidation MAE:', mean_absolute_error(y_val, y_val_pred)),
    ('\nvalidation MAPE:', mean_absolute_percentage_error(y_val, y_val_pred)),
    ('\nvalidation RMSE:', np.sqrt(mse)),
    ('\nvalidation r2 score:', r2_score(y_val, y_val_pred)),
]

for tag, value in scores:
    print(tag, value)

validation MSE: 1.2731176470588235

validation MAE: 0.8254117647058823

validation MAPE: 395786931958324.8

validation RMSE: 1.1283251513011767

validation r2 score: 0.9085269833850866


## 중고차 가격 예측 데이터

In [13]:
import pandas as pd

# Load the used-car training features, training labels, and test features.
# The generator reads the CSVs in the same order as the unpacking targets.
X_train, y_train, X_test = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/X_train.csv',
        'https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/y_train.csv',
        'https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/X_test.csv',
    )
)


In [14]:
# Preview the first rows of the used-car training features.
X_train.head()

Unnamed: 0,carID,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,13207,hyundi,Santa Fe,2019,Semi-Auto,4223,Diesel,145.0,39.8,2.2
1,17314,vauxhall,GTC,2015,Manual,47870,Diesel,125.0,60.1,2.0
2,12342,audi,RS4,2019,Automatic,5151,Petrol,145.0,29.1,2.9
3,13426,vw,Scirocco,2016,Automatic,20423,Diesel,30.0,57.6,2.0
4,16004,skoda,Scala,2020,Semi-Auto,3569,Petrol,145.0,47.1,1.0


In [11]:
# Preview the first rows of the used-car test features.
X_test.head()

Unnamed: 0,carID,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,12000,merc,GLS Class,2017,Automatic,12046,Diesel,150.0,37.2,3.0
1,12001,vw,Amarok,2017,Automatic,37683,Diesel,260.0,36.2,3.0
2,12004,merc,GLS Class,2019,Automatic,10000,Diesel,145.0,34.0,3.0
3,12013,skoda,Scala,2019,Manual,3257,Petrol,145.0,49.6,1.0
4,12017,audi,RS6,2015,Semi-Auto,20982,Petrol,325.0,29.4,4.0


In [3]:
# Display the used-car label frame (carID plus the price target).
y_train

Unnamed: 0,carID,price
0,13207,31995
1,17314,7700
2,12342,58990
3,13426,12999
4,16004,16990
...,...,...
4955,16898,17999
4956,14416,28900
4957,15453,8998
4958,14666,23198


In [16]:
# Remove the identifier column from both feature frames; the target is 'price'.
drop_col = ['carID']
X_train_drop, X_test_drop = (
    frame.drop(columns=drop_col) for frame in (X_train, X_test)
)
y_train_target = y_train['price']

In [None]:
# One-hot encode train and test independently, then compare the column counts.
X_train_dum, X_test_dum = (
    pd.get_dummies(frame) for frame in (X_train_drop, X_test_drop)
)

print(X_train_dum.shape[1])
print(X_test_dum.shape[1])

# Issue: the train and test dummy column counts differ.

113
112


칼럼 개수가 다를 때:
- 정석 방법:
  - train에 있고 test에 없는 경우: test에 같은 칼럼을 만들고 0으로 채우기
  - train에 없고 test에 있는 경우: test에서 삭제
- 빅분기에서 쓸 수 있는 방법:
  - train과 test 데이터를 concat -> 더미 생성 -> 데이터 분리

In [None]:
# Standard method, step 1: list the dummy columns present in train but absent
# from test.
train_cols = set(X_train_dum.columns)
test_cols = set(X_test_dum.columns)
list(train_cols.difference(test_cols))

['model_ M6']

In [None]:
# Standard method, step 2: give test exactly the training columns.
# The missing dummy is named 'model_ M6' (note the space after the underscore),
# so hand-typing the name as 'model_M6' creates the wrong column and the
# following column selection still raises KeyError. reindex avoids naming
# columns by hand: it adds every train-only column filled with 0, drops any
# test-only column, and enforces the training column order.
X_test_dum = X_test_dum.reindex(columns=X_train_dum.columns, fill_value=0)

113

In [None]:
# Exam-friendly shortcut: stack train and test, one-hot encode them together so
# both share one set of dummy columns, then split them back by position.
combined = pd.concat([X_train_drop, X_test_drop])
combined_dum = pd.get_dummies(combined)

# The first n_train rows are the training rows; the rest are the test rows.
n_train = len(X_train_drop)
X_train_dum, X_test_dum = combined_dum.iloc[:n_train], combined_dum.iloc[n_train:]

In [27]:
from sklearn.model_selection import train_test_split

# Seed the split so the validation scores are reproducible between runs, as in
# every other split in this notebook. Scores recorded from the earlier unseeded
# run will not match exactly.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_dum, y_train_target, random_state=42
)

In [28]:
from sklearn.ensemble import RandomForestRegressor

# fit() returns the estimator itself, so construction and training can be chained.
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Price predictions for the validation split.
y_val_pred = rf.predict(X_val)

In [30]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score

# Compute MSE once and reuse it for RMSE (np.sqrt of the mean squared error).
mse = mean_squared_error(y_val, y_val_pred)

scores = [
    ('validation MSE:', mse),
    ('\nvalidation MAE:', mean_absolute_error(y_val, y_val_pred)),
    ('\nvalidation MAPE:', mean_absolute_percentage_error(y_val, y_val_pred)),
    ('\nvalidation RMSE:', np.sqrt(mse)),
    ('\nvalidation R2 score:', r2_score(y_val, y_val_pred)),
]

for tag, value in scores:
    print(tag, value)

validation MSE: 10983794.970905306

validation MAE: 1903.4307805235535

validation MAPE: 0.08665934180564046

validation RMSE: 3314.180889888979

validation R2 score: 0.9575510687991071


## 의료 비용 예측 데이터