### ⚠️ roc_auc_score는 ROC 곡선의 아래 면적(AUC) 을 계산 -> ROC 곡선은 분류 확률의 연속적인 변화에 따라 만들어짐 -> 0, 1로 분류하는 predict가 아니라 각 클래스의 확률값을 계산하는 predict_proba 사용해야 함 ⚠️

# 분류

## 서비스 이탈예측 데이터
데이터 설명 : 고객의 신상정보 데이터를 통한 회사 서비스 이탈 예측 (종속변수 : Exited)  
x_train : https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_train.csv  
y_train : https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/y_train.csv  
x_test : https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_test.csv  
x_label(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/y_test.csv  
데이터 출처 : https://www.kaggle.com/shubh0799/churn-modelling 에서 변형

In [1]:
import pandas as pd
# Load the churn data: training features, training labels and test features.
churn_root = "https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk"
x_train = pd.read_csv(churn_root + "/X_train.csv")
y_train = pd.read_csv(churn_root + "/y_train.csv")
x_test = pd.read_csv(churn_root + "/X_test.csv")

# Preview the first rows of the features and of the labels.
for preview_frame in (x_train, y_train):
    display(preview_frame.head())

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,15799217,Zetticci,791,Germany,Female,35,7,52436.2,1,1,0,161051.75
1,15748986,Bischof,705,Germany,Male,42,8,166685.92,2,1,1,55313.51
2,15722004,Hsiung,543,France,Female,31,4,138317.94,1,0,0,61843.73
3,15780966,Pritchard,709,France,Female,32,2,0.0,2,0,0,109681.29
4,15636731,Ts'ai,714,Germany,Female,36,1,101609.01,2,1,1,447.73


Unnamed: 0,CustomerId,Exited
0,15799217,0
1,15748986,0
2,15722004,0
3,15780966,0
4,15636731,0


In [2]:
# Remove the CustomerId and Surname columns from both feature frames.
drop_col = ['CustomerId','Surname']
x_train_drop, x_test_drop = (
    frame.drop(columns=drop_col) for frame in (x_train, x_test)
)
display(x_train_drop.head(), x_test_drop.head())

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,791,Germany,Female,35,7,52436.2,1,1,0,161051.75
1,705,Germany,Male,42,8,166685.92,2,1,1,55313.51
2,543,France,Female,31,4,138317.94,1,0,0,61843.73
3,709,France,Female,32,2,0.0,2,0,0,109681.29
4,714,Germany,Female,36,1,101609.01,2,1,1,447.73


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,802,France,Female,60,3,92887.06,1,1,0,39473.63
1,602,France,Female,56,3,115895.22,3,1,0,4176.17
2,801,France,Female,32,4,75170.54,1,1,1,37898.5
3,693,Spain,Female,34,10,107556.06,2,0,0,154631.35
4,592,France,Female,62,5,0.0,1,1,1,100941.57


In [3]:
# One-hot encode the categorical columns of the training features and
# take the target column.
x_train_dummies = pd.get_dummies(x_train_drop)
y = y_train["Exited"]

x_test_dummies = pd.get_dummies(x_test_drop)
# Give the test frame exactly the training columns, in the training order.
# reindex (rather than plain column selection) adds a zero-filled column for
# a category that appears only in train and drops a category that appears
# only in test, so a category mismatch no longer raises KeyError.
x_test_dummies = x_test_dummies.reindex(columns=x_train_dummies.columns, fill_value=0)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Hold out 33% of the labelled rows as a validation split (fixed seed).
split_options = {"test_size": 0.33, "random_state": 42}
X_train, X_validation, Y_train, Y_validation = train_test_split(
    x_train_dummies, y, **split_options
)

# Fit a random forest with a fixed seed on the training split.
rf = RandomForestClassifier(random_state=23)
rf.fit(X_train, Y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [5]:
from sklearn.metrics import accuracy_score , f1_score, recall_score, roc_auc_score ,precision_score

# predict() returns the final predicted class for each row (for a classifier,
# the most probable class). predict_proba() returns one probability column
# per class; [:, 1] keeps the second column (the positive class) as a 1-D array.
train_class_proba = rf.predict_proba(X_train)
predict_train_label = rf.predict(X_train)
predict_train_proba = train_class_proba[:, 1]

validation_class_proba = rf.predict_proba(X_validation)
predict_validation_label = rf.predict(X_validation)
predict_validation_prob = validation_class_proba[:, 1]

In [6]:
# Check model performance according to what the question asks for.
# accuracy, f1_score, recall, precision -> use the class labels from model.predict
# auc, or any wording about "probability" -> use model.predict_proba and take
# the second COLUMN (positive-class probability): model.predict_proba()[:, 1]
print('train accuracy :', accuracy_score(Y_train,predict_train_label))
print('validation accuracy :', accuracy_score(Y_validation,predict_validation_label))
print('\n')

print('train f1_score :', f1_score(Y_train,predict_train_label))
# This line was previously labelled 'validation accuracy' although it prints the F1 score.
print('validation f1_score :', f1_score(Y_validation,predict_validation_label))

print('\n')
print('train recall_score :', recall_score(Y_train,predict_train_label))
print('validation recall_score :', recall_score(Y_validation,predict_validation_label))

print('\n')
print('train precision_score :', precision_score(Y_train,predict_train_label))
print('validation precision_score :', precision_score(Y_validation,predict_validation_label))

print('\n')
# roc_auc_score is computed from the class probabilities, not from hard 0/1 labels.
print('train auc :', roc_auc_score(Y_train,predict_train_proba))
print('validation auc :', roc_auc_score(Y_validation,predict_validation_prob))

train accuracy : 1.0
validation accuracy : 0.8652680652680653


train f1_score : 1.0
validation accuracy : 0.5912305516265912


train recall_score : 1.0
validation recall_score : 0.4543478260869565


train precision_score : 1.0
validation precision_score : 0.8461538461538461


train auc : 1.0
validation auc : 0.8497613211198555


In [7]:
# Score the test features the same way as the validation split above.
test_class_proba = rf.predict_proba(x_test_dummies)
predict_test_label = rf.predict(x_test_dummies)
predict_test_proba = test_class_proba[:, 1]

## 이직여부 판단 데이터
데이터 설명 : 이직여부 판단 데이터 (target: 1: 이직 , 0 : 이직 x)  
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/X_train.csv  
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/y_train.csv  
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/X_test.csv  
x_label(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/y_test.csv  
데이터 출처 :https://www.kaggle.com/datasets/arashnic/hr-analytics-job-change-of-data-scientists (참고, 데이터 수정)

In [8]:
# Load the HR job-change data: training features, training labels and test features.
hr_root = "https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata"
x_train = pd.read_csv(hr_root + "/X_train.csv")
y_train = pd.read_csv(hr_root + "/y_train.csv")
x_test = pd.read_csv(hr_root + "/X_test.csv")

# Preview the first rows of the features and of the labels.
for preview_frame in (x_train, y_train):
    display(preview_frame.head())

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,25298,city_138,0.836,Male,No relevent experience,Full time course,High School,,5,100-500,Pvt Ltd,1,45
1,4241,city_160,0.92,Male,No relevent experience,Full time course,High School,,5,,,1,17
2,24086,city_57,0.866,Male,No relevent experience,no_enrollment,Graduate,STEM,10,,,1,50
3,26773,city_16,0.91,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,>4,135
4,32325,city_143,0.74,,No relevent experience,Full time course,Graduate,STEM,5,,,never,17


Unnamed: 0,enrollee_id,target
0,25298,0.0
1,4241,1.0
2,24086,0.0
3,26773,0.0
4,32325,1.0


In [9]:
# Show dtypes and non-null counts, then the number of missing values per column.
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
x_train.info()
print(x_train.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12452 entries, 0 to 12451
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             12452 non-null  int64  
 1   city                    12452 non-null  object 
 2   city_development_index  12452 non-null  float64
 3   gender                  9535 non-null   object 
 4   relevent_experience     12452 non-null  object 
 5   enrolled_university     12195 non-null  object 
 6   education_level         12137 non-null  object 
 7   major_discipline        10586 non-null  object 
 8   experience              12415 non-null  object 
 9   company_size            8600 non-null   object 
 10  company_type            8471 non-null   object 
 11  last_new_job            12179 non-null  object 
 12  training_hours          12452 non-null  int64  
dtypes: float64(1), int64(2), object(10)
memory usage: 1.2+ MB
None
enrollee_id                 

In [10]:
# Remove the listed columns from both feature frames and take the target column.
drop_col = ["enrollee_id", "city", "company_type", "experience"]
x_train_drop, x_test_drop = (
    frame.drop(columns=drop_col) for frame in (x_train, x_test)
)
y = y_train.loc[:, "target"]

In [11]:
# One-hot encode the categorical columns of both feature frames.
x_train_dummies = pd.get_dummies(x_train_drop)
x_test_dummies = pd.get_dummies(x_test_drop)
# Give the test frame exactly the training columns, in the training order.
# reindex adds a zero-filled column for a category seen only in train and drops
# one seen only in test, instead of raising KeyError on a mismatch.
x_test_dummies = x_test_dummies.reindex(columns=x_train_dummies.columns, fill_value=0)

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Hold out 33% of the labelled rows as a validation split (fixed seed).
split_options = {"test_size": 0.33, "random_state": 42}
X_train, X_validation, Y_train, Y_validation = train_test_split(
    x_train_dummies, y, **split_options
)
# Fit a random forest with a fixed seed on the training split.
rf = RandomForestClassifier(random_state=23)
rf.fit(X_train, Y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [13]:
# Hard labels plus second-column (positive-class) probabilities for each split.
train_class_proba = rf.predict_proba(X_train)
predict_train_label = rf.predict(X_train)
predict_train_proba = train_class_proba[:, 1]

validation_class_proba = rf.predict_proba(X_validation)
predict_validation_label = rf.predict(X_validation)
predict_validation_prob = validation_class_proba[:, 1]

In [14]:
from sklearn.metrics import accuracy_score , f1_score, recall_score, roc_auc_score ,precision_score
# Metrics computed from hard labels, printed as train/validation pairs,
# each pair followed by a blank separator.
label_metrics = (
    ('accuracy', accuracy_score),
    ('f1_score', f1_score),
    ('recall_score', recall_score),
    ('precision_score', precision_score),
)
for metric_name, metric_fn in label_metrics:
    print(f'train {metric_name} :', metric_fn(Y_train, predict_train_label))
    print(f'validation {metric_name} :', metric_fn(Y_validation, predict_validation_label))
    print('\n')

# AUC is computed from the positive-class probabilities rather than the labels.
print('train auc :', roc_auc_score(Y_train, predict_train_proba))
print('validation auc :', roc_auc_score(Y_validation, predict_validation_prob))

train accuracy : 0.9965236154399425
validation accuracy : 0.7535279805352798


train f1_score : 0.9929731039496001
validation f1_score : 0.42736009044657997


train recall_score : 0.9927325581395349
validation recall_score : 0.3631123919308357


train precision_score : 0.9932137663596704
validation precision_score : 0.5192307692307693


train auc : 0.9998907607098495
validation auc : 0.740677513569584


In [15]:
# Hard labels and second-column probabilities for the test features.
test_class_proba = rf.predict_proba(x_test_dummies)
predict_test_label = rf.predict(x_test_dummies)
predict_test_proba = test_class_proba[:, 1]

## 정시 배송 여부 판단 (2회기출)
데이터 설명 : e-commerce 배송의 정시 도착여부 (1: 정시배송 0 : 정시미배송)  
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/X_train.csv  
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/y_train.csv  
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/X_test.csv  
x_label(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/y_test.csv  
데이터 출처 :https://www.kaggle.com/datasets/prachi13/customer-analytics (참고, 데이터 수정)

In [16]:
import pandas as pd

# Load the shipping data: training features, training labels and test features.
shipping_root = "https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping"
x_train = pd.read_csv(shipping_root + "/X_train.csv")
y_train = pd.read_csv(shipping_root + "/y_train.csv")
x_test = pd.read_csv(shipping_root + "/X_test.csv")

# Preview the first rows of the features and of the labels.
for preview_frame in (x_train, y_train):
    display(preview_frame.head())

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms
0,6045,A,Flight,4,3,266,5,high,F,5,1590
1,44,F,Ship,3,1,174,2,low,M,44,1556
2,7940,F,Road,4,1,154,10,high,M,10,5674
3,1596,F,Ship,4,3,158,3,medium,F,27,1207
4,4395,A,Flight,5,3,175,3,low,M,7,4833


Unnamed: 0,ID,Reached.on.Time_Y.N
0,6045,0
1,44,1
2,7940,1
3,1596,1
4,4395,1


In [17]:
# Remove the ID column from both feature frames.
drop_col = ["ID"]

x_train_drop, x_test_drop = (
    frame.drop(columns=drop_col) for frame in (x_train, x_test)
)
# The target is taken by position: the second column of y_train
# (Reached.on.Time_Y.N in the preview above).
y = y_train.iloc[:, 1]

In [18]:
# One-hot encode the categorical columns of both feature frames.
x_train_dummies = pd.get_dummies(x_train_drop)
x_test_dummies = pd.get_dummies(x_test_drop)
# Give the test frame exactly the training columns, in the training order.
# reindex adds a zero-filled column for a category seen only in train and drops
# one seen only in test, instead of raising KeyError on a mismatch.
x_test_dummies = x_test_dummies.reindex(columns=x_train_dummies.columns, fill_value=0)

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Hold out 33% of the labelled rows as a validation split (fixed seed).
split_options = {"test_size": 0.33, "random_state": 42}
X_train, X_validation, Y_train, Y_validation = train_test_split(
    x_train_dummies, y, **split_options
)

# Fit a random forest with a fixed seed on the training split.
rf = RandomForestClassifier(random_state=23)
rf.fit(X_train, Y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [20]:
# Model outputs used for scoring: hard labels plus second-column
# (positive-class) probabilities for the training and validation splits.
train_class_proba = rf.predict_proba(X_train)
predict_train_label = rf.predict(X_train)
predict_train_proba = train_class_proba[:, 1]

validation_class_proba = rf.predict_proba(X_validation)
predict_validation_label = rf.predict(X_validation)
predict_validation_proba = validation_class_proba[:, 1]

In [21]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score, precision_score

# Train/validation metrics. Label-based metrics use the predict() output;
# ROC AUC uses the positive-class probabilities.
print("train accuracy : ", accuracy_score(Y_train, predict_train_label))
# Label typo fixed: this line used to print "validation accurace".
print("validation accuracy : ", accuracy_score(Y_validation, predict_validation_label))
print("\n")

print("train f1_score : ", f1_score(Y_train, predict_train_label))
print("validation f1_score : ", f1_score(Y_validation, predict_validation_label))
print("\n")

print("train recall_score : ", recall_score(Y_train, predict_train_label))
print("validation recall_score : ", recall_score(Y_validation, predict_validation_label))
print("\n")

print("train roc_auc_score : ", roc_auc_score(Y_train, predict_train_proba))
print("validation roc_auc_score : ", roc_auc_score(Y_validation, predict_validation_proba))
print("\n")

print("train precision : ", precision_score(Y_train, predict_train_label))
print("validation precision : ", precision_score(Y_validation, predict_validation_label))

train accuracy :  1.0
validation accurace :  0.6395775941230487


train f1_score :  1.0
validation f1_score :  0.6744089589382


train recall_score :  1.0
validation recall_score :  0.630721489526765


train roc_auc_score :  1.0
validation roc_auc_score :  0.7261997118475008


train precision :  1.0
validation precision :  0.7245989304812834


## 성인 건강검진 데이터
데이터 설명 : 2018년도 성인의 건강검진 데이터 (종속변수 : 흡연상태 1- 흡연, 0-비흡연 )  
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/smoke/x_train.csv  
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/smoke/y_train.csv  
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/smoke/x_test.csv  
x_label(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/smoke/y_test.csv  
데이터 출처 :https://www.data.go.kr/data/15007122/fileData.do (참고, 데이터 수정)

In [22]:
import pandas as pd

# Load the health check-up data: training features, training labels and test features.
smoke_root = "https://raw.githubusercontent.com/Datamanim/datarepo/main/smoke"
x_train = pd.read_csv(smoke_root + "/x_train.csv")
y_train = pd.read_csv(smoke_root + "/y_train.csv")
x_test = pd.read_csv(smoke_root + "/x_test.csv")


# Preview the first rows of the features and of the labels.
for preview_frame in (x_train, y_train):
    display(preview_frame.head())

Unnamed: 0,ID,성별코드,연령대코드(5세단위),신장(5Cm단위),체중(5Kg단위),허리둘레,시력(좌),시력(우),청력(좌),청력(우),...,LDL콜레스테롤,혈색소,요단백,혈청크레아티닌,(혈청지오티)AST,(혈청지오티)ALT,감마지티피,구강검진수검여부,치아우식증유무,치석
0,0,F,40,155,60,81.3,1.2,1.0,1.0,1.0,...,126.0,12.9,1.0,0.7,18.0,19.0,27.0,Y,0.0,Y
1,1,F,40,160,60,81.0,0.8,0.6,1.0,1.0,...,127.0,12.7,1.0,0.6,22.0,19.0,18.0,Y,0.0,Y
2,2,M,55,170,60,80.0,0.8,0.8,1.0,1.0,...,151.0,15.8,1.0,1.0,21.0,16.0,22.0,Y,0.0,N
3,3,M,40,165,70,88.0,1.5,1.5,1.0,1.0,...,226.0,14.7,1.0,1.0,19.0,26.0,18.0,Y,0.0,Y
4,4,F,40,155,60,86.0,1.0,1.0,1.0,1.0,...,107.0,12.5,1.0,0.6,16.0,14.0,22.0,Y,0.0,N


Unnamed: 0,ID,흡연상태
0,0,0
1,1,0
2,2,1
3,3,0
4,4,0


In [23]:
# Remove the listed columns from both feature frames and take the target column.
drop_col = ["ID", "구강검진수검여부"]

x_train_drop, x_test_drop = (
    frame.drop(columns=drop_col) for frame in (x_train, x_test)
)
y = y_train.loc[:, '흡연상태']

In [24]:
# One-hot encode the categorical columns of both feature frames.
x_train_dummies = pd.get_dummies(x_train_drop)
x_test_dummies = pd.get_dummies(x_test_drop)
# Give the test frame exactly the training columns, in the training order.
# reindex adds a zero-filled column for a category seen only in train and drops
# one seen only in test, instead of raising KeyError on a mismatch.
x_test_dummies = x_test_dummies.reindex(columns=x_train_dummies.columns, fill_value=0)

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# stratify=y: stratified sampling, so the training and validation splits keep
# the class ratio of the original data.
split_options = {"test_size": 0.33, "random_state": 42, "stratify": y}
X_train, X_valid, Y_train, Y_valid = train_test_split(
    x_train_dummies, y, **split_options
)

# Fit a random forest with a fixed seed on the training split.
rf = RandomForestClassifier(random_state=23)
rf.fit(X_train, Y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [26]:
# Hard labels plus second-column (positive-class) probabilities for the training split.
train_class_proba = rf.predict_proba(X_train)
predict_train_label = rf.predict(X_train)
predict_train_proba = train_class_proba[:, 1]

# Note: despite the "test" in their names, these hold predictions for the
# validation split X_valid, not for x_test.
valid_class_proba = rf.predict_proba(X_valid)
predict_test_label = rf.predict(X_valid)
predict_test_proba = valid_class_proba[:, 1]

In [27]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score
# Metrics computed from hard labels, printed as train/"test" pairs, each pair
# followed by a blank separator. The "test" rows are scored against Y_valid.
label_metrics = (
    ("accuracy", accuracy_score),
    ("f1_score", f1_score),
    ("recall_score", recall_score),
    ("precision_score", precision_score),
)
for metric_name, metric_fn in label_metrics:
    print(f"train {metric_name} : ", metric_fn(Y_train, predict_train_label))
    print(f"test {metric_name} : ", metric_fn(Y_valid, predict_test_label))
    print("\n")

# ROC AUC is computed from the positive-class probabilities.
print("train roc_auc_score : ", roc_auc_score(Y_train, predict_train_proba))
print("test roc_auc_score : ", roc_auc_score(Y_valid, predict_test_proba))

train accuracy :  1.0
test accuracy :  0.75637624974495


train f1_score :  1.0
test f1_score :  0.6791472590469366


train recall_score :  1.0
test recall_score :  0.7025574499629355


train precision_score :  1.0
test precision_score :  0.657246879334258


train roc_auc_score :  1.0
test roc_auc_score :  0.834807387299372


## 자동차 보험가입 예측데이터
데이터 설명 : 자동차 보험 가입 예측 (종속변수 Response: 1 : 가입 , 0 :미가입)  
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/insurance/x_train.csv  
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/insurance/y_train.csv  
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/insurance/x_test.csv  
x_label(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/insurance/y_test.csv  
데이터 출처 : https://www.kaggle.com/anmolkumar/health-insurance-cross-sell-prediction (참고, 데이터 수정)

In [28]:
import pandas as pd

# Load the insurance data: training features, training labels and test features.
insurance_root = "https://raw.githubusercontent.com/Datamanim/datarepo/main/insurance"
x_train = pd.read_csv(insurance_root + "/x_train.csv")
y_train = pd.read_csv(insurance_root + "/y_train.csv")
x_test = pd.read_csv(insurance_root + "/x_test.csv")

# Preview the first rows of the features and of the labels.
for preview_frame in (x_train, y_train):
    display(preview_frame.head())

Unnamed: 0,ID,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,id
0,0,Female,23,1,8.0,0,< 1 Year,Yes,61354.0,152.0,235,
1,1,Male,27,1,28.0,1,< 1 Year,No,38036.0,152.0,207,
2,2,Female,23,1,45.0,0,< 1 Year,Yes,25984.0,152.0,217,
3,3,Male,22,1,46.0,0,< 1 Year,No,39499.0,152.0,277,
4,4,Male,32,1,30.0,1,< 1 Year,No,38771.0,152.0,251,


Unnamed: 0,ID,Response
0,0,0
1,1,0
2,2,1
3,3,0
4,4,0


In [29]:
# Remove both identifier columns ("ID" and "id") from the feature frames.
drop_col = ["ID", "id"]

x_train_drop, x_test_drop = (
    frame.drop(columns=drop_col) for frame in (x_train, x_test)
)
# The target is taken by position: the second column of y_train
# (Response in the preview above).
y = y_train.iloc[:, 1]

In [30]:
# One-hot encode the categorical columns of both feature frames.
x_train_dummies = pd.get_dummies(x_train_drop)
x_test_dummies = pd.get_dummies(x_test_drop)
# Give the test frame exactly the training columns, in the training order.
# reindex adds a zero-filled column for a category seen only in train and drops
# one seen only in test, instead of raising KeyError on a mismatch.
x_test_dummies = x_test_dummies.reindex(columns=x_train_dummies.columns, fill_value=0)

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Stratified 67/33 split with a fixed seed.
split_options = {"test_size": 0.33, "random_state": 42, "stratify": y}
X_train, X_valid, Y_train, Y_valid = train_test_split(
    x_train_dummies, y, **split_options
)

# Fit a random forest with a fixed seed on the training split.
rf = RandomForestClassifier(random_state=23)
rf.fit(X_train, Y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [32]:
# Hard labels plus second-column (positive-class) probabilities for the training split.
train_class_proba = rf.predict_proba(X_train)
predict_train_label = rf.predict(X_train)
predict_train_proba = train_class_proba[:, 1]

# Note: despite the "test" in their names, these hold predictions for the
# validation split X_valid, not for x_test.
valid_class_proba = rf.predict_proba(X_valid)
predict_test_label = rf.predict(X_valid)
predict_test_proba = valid_class_proba[:, 1]

In [33]:
from sklearn.metrics import accuracy_score , f1_score, recall_score, roc_auc_score ,precision_score

# One row per metric: (label, scorer, train predictions, validation predictions).
# ROC AUC is scored on probabilities; every other metric on hard labels.
report_rows = [
    ("accuracy", accuracy_score, predict_train_label, predict_test_label),
    ("f1_score", f1_score, predict_train_label, predict_test_label),
    ("recall_score", recall_score, predict_train_label, predict_test_label),
    ("roc_auc_score", roc_auc_score, predict_train_proba, predict_test_proba),
    ("precision_score", precision_score, predict_train_label, predict_test_label),
]
for row_index, (metric_name, metric_fn, train_pred, valid_pred) in enumerate(report_rows):
    if row_index > 0:
        print("\n")  # blank separator between metric groups
    print(f"train {metric_name} : ", metric_fn(Y_train, train_pred))
    print(f"test {metric_name} : ", metric_fn(Y_valid, valid_pred))

train accuracy :  0.9999314646014666
test accuracy :  0.8655143967479352


train f1_score :  0.9997203691127712
test f1_score :  0.18118003025718607


train recall_score :  0.9995606502376483
test recall_score :  0.12140134620063255


train roc_auc_score :  0.9999998704194665
test roc_auc_score :  0.8340167689531695


train precision_score :  0.9998801390387151
test precision_score :  0.3569384835479256


## 비행탑승 경험 만족도 데이터
데이터 설명 : 비행탑승 경험 만족도 (satisfaction 컬럼 : ‘neutral or dissatisfied’ or satisfied ) (83123, 24) shape  
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/airline/x_train.csv  
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/airline/y_train.csv  
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/airline/x_test.csv  
x_label(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/airline/y_test.csv  
데이터 출처 :https://www.kaggle.com/teejmahal20/airline-passenger-satisfaction?select=train.csv (참고, 데이터 수정)

### test 데이터에 대해서 neutral or dissatisfied라고 예측할 확률을 구하고 그 확률 값을 제출하라

In [34]:
import pandas as pd

# Load the airline data: training features, training labels and test features.
airline_root = "https://raw.githubusercontent.com/Datamanim/datarepo/main/airline"
x_train = pd.read_csv(airline_root + "/x_train.csv")
y_train = pd.read_csv(airline_root + "/y_train.csv")
x_test = pd.read_csv(airline_root + "/x_test.csv")

# Preview the first rows of the features and of the labels.
for preview_frame in (x_train, y_train):
    display(preview_frame.head())

Unnamed: 0,ID,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,id
0,0,Female,Loyal Customer,54,Personal Travel,Eco,1068,3,4,3,...,5,5,3,5,3,5,3,47,22.0,
1,2,Male,Loyal Customer,20,Personal Travel,Eco,1546,4,4,4,...,4,3,3,4,4,4,4,5,2.0,
2,3,Male,Loyal Customer,59,Business travel,Business,2962,0,4,0,...,1,1,1,1,5,1,4,54,46.0,
3,4,Male,Loyal Customer,35,Business travel,Eco Plus,106,5,4,4,...,5,2,1,5,4,4,5,130,121.0,
4,5,Female,Loyal Customer,9,Business travel,Business,2917,3,3,3,...,4,4,4,5,4,3,4,0,0.0,


Unnamed: 0,ID,satisfaction
0,0,neutral or dissatisfied
1,2,neutral or dissatisfied
2,3,satisfied
3,4,satisfied
4,5,satisfied


In [35]:
# Remove both identifier columns ("ID" and "id") from the feature frames.
drop_col = ["ID", "id"]
x_train_drop, x_test_drop = (
    frame.drop(columns=drop_col) for frame in (x_train, x_test)
)
# The target is taken by position: the second column of y_train
# (satisfaction in the preview above).
y = y_train.iloc[:, 1]

In [36]:
# One-hot encode the categorical columns of both feature frames.
x_train_dummies = pd.get_dummies(x_train_drop)
x_test_dummies = pd.get_dummies(x_test_drop)
# Give the test frame exactly the training columns, in the training order.
# reindex adds a zero-filled column for a category seen only in train and drops
# one seen only in test, instead of raising KeyError on a mismatch.
x_test_dummies = x_test_dummies.reindex(columns=x_train_dummies.columns, fill_value=0)

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Hold out 33% of the labelled rows as a validation split (fixed seed, no stratification).
split_options = {"test_size": 0.33, "random_state": 42}
X_train, X_valid, Y_train, Y_valid = train_test_split(
    x_train_dummies, y, **split_options
)

# Fit a random forest with a fixed seed on the training split.
rf = RandomForestClassifier(random_state=23)
rf.fit(X_train, Y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [38]:
# Hard labels plus second-column probabilities for the training split.
# NOTE(review): predict_proba columns follow rf.classes_. With the two string
# labels shown in the preview, column 1 is presumably 'satisfied', while the
# task statement asks for the probability of 'neutral or dissatisfied'
# (presumably column 0) -- confirm against rf.classes_ before submitting.
train_class_proba = rf.predict_proba(X_train)
predict_train_label = rf.predict(X_train)
predict_train_proba = train_class_proba[:, 1]

# Despite the "test" in their names, these hold predictions for the
# validation split X_valid, not for x_test.
valid_class_proba = rf.predict_proba(X_valid)
predict_test_label = rf.predict(X_valid)
predict_test_proba = valid_class_proba[:, 1]

In [39]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score, precision_score

# pos_label names the class treated as positive for f1/recall/precision.
positive_class = {"pos_label": "neutral or dissatisfied"}

# One row per metric: (label, scorer, train predictions, validation predictions, extra kwargs).
report_rows = [
    ("accuracy_score", accuracy_score, predict_train_label, predict_test_label, {}),
    ("f1_score", f1_score, predict_train_label, predict_test_label, positive_class),
    ("recall_score", recall_score, predict_train_label, predict_test_label, positive_class),
    ("roc_auc_score", roc_auc_score, predict_train_proba, predict_test_proba, {}),
    ("precision_score", precision_score, predict_train_label, predict_test_label, positive_class),
]
for row_index, (metric_name, metric_fn, train_pred, valid_pred, extra) in enumerate(report_rows):
    if row_index > 0:
        print("\n")  # blank separator between metric groups
    print(f"train {metric_name} : ", metric_fn(Y_train, train_pred, **extra))
    print(f"test {metric_name} : ", metric_fn(Y_valid, valid_pred, **extra))

train accuracy_score :  1.0
test accuracy_score :  0.9598629288031789


train f1_score :  1.0
test f1_score :  0.9652033753674031


train recall_score :  1.0
test recall_score :  0.9773425499231951


train roc_auc_score :  1.0
test roc_auc_score :  0.9930296167395469


train precision_score :  1.0
test precision_score :  0.95336205281888


## 수질 음용성 여부 데이터
데이터 설명 : 수질 음용성 여부 (Potability 컬럼 : 0 ,1 )  
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/waters/x_train.csv  
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/waters/y_train.csv  
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/waters/x_test.csv  
x_label(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/waters/y_test.csv  
데이터 출처 :https://www.kaggle.com/adityakadiwal/water-potability

In [40]:
import pandas as pd

# Load the water-potability data: training features, training labels and test features.
waters_root = "https://raw.githubusercontent.com/Datamanim/datarepo/main/waters"
x_train = pd.read_csv(waters_root + "/x_train.csv")
y_train = pd.read_csv(waters_root + "/y_train.csv")
x_test = pd.read_csv(waters_root + "/x_test.csv")

# Preview the first rows of the features and of the labels.
for preview_frame in (x_train, y_train):
    display(preview_frame.head())

Unnamed: 0,ID,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
0,0,8.66271,173.531947,20333.079495,5.636388,439.787938,459.63312,16.283311,89.924253,5.120103
1,1,,226.270824,15380.124079,6.661474,,392.558205,14.08311,50.286395,4.51687
2,2,7.58377,217.283262,36343.407055,8.532726,375.964391,393.877683,17.442301,77.722257,3.642289
3,3,6.584813,182.375456,24723.106296,6.23892,,414.350751,17.582615,78.213738,4.404132
4,4,7.179864,180.854211,10859.553752,8.263503,341.302486,358.056264,12.065317,83.329918,3.878447


Unnamed: 0,ID,Potability
0,0,0
1,1,1
2,2,0
3,3,0
4,4,0


In [41]:
# Number of missing values in each training column.
x_train.isna().sum()

ID                   0
ph                 395
Hardness             0
Solids               0
Chloramines          0
Sulfate            617
Conductivity         0
Organic_carbon       0
Trihalomethanes    132
Turbidity            0
dtype: int64

In [42]:
# Mean-impute missing values using statistics computed on the TRAIN set only.
# The same train means are applied to the test set, so no information from
# the test data is used during preprocessing (avoids data leakage).
train_means = x_train.mean(numeric_only=True)
x_train = x_train.fillna(train_means)

# Filling every numeric column (not only those with gaps in train) also covers
# a column that is complete in train but has missing values in test.
x_test = x_test.fillna(train_means)

In [43]:
# Remove the ID column from both feature frames and take the target column.
drop_col = ["ID"]

x_train_drop, x_test_drop = (
    frame.drop(columns=drop_col) for frame in (x_train, x_test)
)
y = y_train.loc[:, "Potability"]

In [44]:
# One-hot encode any categorical columns of both feature frames.
x_train_dummies = pd.get_dummies(x_train_drop)
x_test_dummies = pd.get_dummies(x_test_drop)
# Give the test frame exactly the training columns, in the training order.
# reindex adds a zero-filled column for a column present only in train and drops
# one present only in test, instead of raising KeyError on a mismatch.
x_test_dummies = x_test_dummies.reindex(columns=x_train_dummies.columns, fill_value=0)

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Stratified 67/33 split with a fixed seed.
split_options = {"test_size": 0.33, "random_state": 42, "stratify": y}
X_train, X_valid, Y_train, Y_valid = train_test_split(
    x_train_dummies, y, **split_options
)

# Fit a random forest with a fixed seed on the training split.
rf = RandomForestClassifier(random_state=23)
rf.fit(X_train, Y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [46]:
# Hard labels plus second-column (positive-class) probabilities for each split.
train_class_proba = rf.predict_proba(X_train)
predict_train_label = rf.predict(X_train)
predict_train_proba = train_class_proba[:, 1]

valid_class_proba = rf.predict_proba(X_valid)
predict_valid_label = rf.predict(X_valid)
predict_valid_proba = valid_class_proba[:, 1]

In [47]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score, precision_score

# One row per metric: (label, scorer, train predictions, validation predictions).
# ROC AUC is scored on probabilities; every other metric on hard labels.
report_rows = [
    ("accuracy_score", accuracy_score, predict_train_label, predict_valid_label),
    ("f1_score", f1_score, predict_train_label, predict_valid_label),
    ("recall_score", recall_score, predict_train_label, predict_valid_label),
    ("roc_auc_score", roc_auc_score, predict_train_proba, predict_valid_proba),
    ("precision_score", precision_score, predict_train_label, predict_valid_label),
]
for row_index, (metric_name, metric_fn, train_pred, valid_pred) in enumerate(report_rows):
    if row_index > 0:
        print("\n")  # blank separator between metric groups
    print(f"train {metric_name} : ", metric_fn(Y_train, train_pred))
    print(f"validation {metric_name} : ", metric_fn(Y_valid, valid_pred))

train accuracy_score :  1.0
validation accuracy_score :  0.6497109826589595


train f1_score :  1.0
validation f1_score :  0.4116504854368932


train recall_score :  1.0
validation recall_score :  0.314540059347181


train roc_auc_score :  1.0
validation roc_auc_score :  0.6309515780954951


train precision_score :  1.0
validation precision_score :  0.5955056179775281


## 약물 분류 데이터
데이터 설명 : 투약하는 약을 분류 (종속변수 :Drug)  
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/drug/x_train.csv  
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/drug/y_train.csv  
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/drug/x_test.csv  
x_label(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/drug/y_test.csv  
데이터 출처 :https://www.kaggle.com/prathamtripathi/drug-classification (참고, 데이터 수정)

In [48]:
import pandas as pd

# Load the drug-classification data: training features, training labels and test features.
drug_root = "https://raw.githubusercontent.com/Datamanim/datarepo/main/drug"
x_train = pd.read_csv(drug_root + "/x_train.csv")
y_train = pd.read_csv(drug_root + "/y_train.csv")
x_test = pd.read_csv(drug_root + "/x_test.csv")

# Preview the first rows of the features and of the labels.
for preview_frame in (x_train, y_train):
    display(preview_frame.head())

Unnamed: 0,ID,Age,Sex,BP,Cholesterol,Na_to_K
0,0,36,F,NORMAL,HIGH,16.753
1,1,47,F,LOW,HIGH,11.767
2,2,69,F,NORMAL,HIGH,10.065
3,3,35,M,LOW,NORMAL,9.17
4,4,49,M,LOW,NORMAL,11.014


Unnamed: 0,ID,Drug
0,0,0
1,1,3
2,2,4
3,3,4
4,4,4


In [49]:
# Remove the ID column from both feature frames and take the target column.
drop_col = ["ID"]

x_train_drop, x_test_drop = (
    frame.drop(columns=drop_col) for frame in (x_train, x_test)
)
y = y_train.loc[:, "Drug"]

In [50]:
# One-hot encode the categorical columns of both feature frames.
x_train_dummies = pd.get_dummies(x_train_drop)
x_test_dummies = pd.get_dummies(x_test_drop)
# Give the test frame exactly the training columns, in the training order.
# reindex adds a zero-filled column for a category seen only in train and drops
# one seen only in test, instead of raising KeyError on a mismatch.
x_test_dummies = x_test_dummies.reindex(columns=x_train_dummies.columns, fill_value=0)

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Hold out 33% of the training rows for validation; stratify = y asks the
# splitter to keep the Drug class proportions the same in both parts.
X_train, X_valid, Y_train, Y_valid = train_test_split(x_train_dummies, y, test_size = 0.33, random_state = 42, stratify = y)

# Random forest with default hyperparameters; random_state fixes the seed.
rf = RandomForestClassifier(random_state = 23)
rf.fit(X_train, Y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [52]:
# Hard labels feed accuracy / F1 / recall / precision; probabilities feed ROC AUC.
predict_train_label = rf.predict(X_train)
predict_train_proba = rf.predict_proba(X_train) # no [:, 1] slice: the multiclass AUC needs the probability of every class

predict_valid_label = rf.predict(X_valid)
predict_valid_proba = rf.predict_proba(X_valid) # no [:, 1] slice: the multiclass AUC needs the probability of every class

In [53]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score, precision_score

# Print each metric for the training split and the validation split side by side.
print("train accuracy_score : ", accuracy_score(Y_train, predict_train_label))
print("validation accuracy_score : ", accuracy_score(Y_valid, predict_valid_label))
print("\n")

print("train f1_score : ", f1_score(Y_train, predict_train_label, average = "macro")) # average = "macro": compute F1 per class, then take the unweighted mean over classes
print("validation f1_score : ", f1_score(Y_valid, predict_valid_label, average = "macro"))
print("\n")

print("train recall_score : ", recall_score(Y_train, predict_train_label, average = "macro"))
print("validation recall_score : ", recall_score(Y_valid, predict_valid_label, average = "macro"))
print("\n")

print("train roc_auc_score : ", roc_auc_score(Y_train, predict_train_proba, multi_class='ovr')) # multi_class='ovr': one-vs-rest, each class is treated as positive against all the others
print("validation roc_auc_score : ", roc_auc_score(Y_valid, predict_valid_proba, multi_class='ovr'))
print("\n")

print("train precision_score : ", precision_score(Y_train, predict_train_label, average = "macro"))
print("validation precision_score : ", precision_score(Y_valid, predict_valid_label, average = "macro"))

train accuracy_score :  1.0
validation accuracy_score :  0.9807692307692307


train f1_score :  1.0
validation f1_score :  0.956043956043956


train recall_score :  1.0
validation recall_score :  0.95


train roc_auc_score :  1.0
validation roc_auc_score :  0.9947010869565217


train precision_score :  1.0
validation precision_score :  0.9714285714285715


## 사기회사 분류 데이터
데이터 설명 : 사기회사 분류 (종속변수 : Risk 1: 사기 , 0 : 정상)  
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/audit/x_train.csv  
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/audit/y_train.csv  
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/audit/x_test.csv  
x_label(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/audit/y_test.csv  
데이터 출처 :https://www.kaggle.com/sid321axn/audit-data(참고, 데이터 수정)

In [54]:
# Load the audit (fraud-firm) training features, training labels and test features.
import pandas as pd

x_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/audit/x_train.csv")
y_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/audit/y_train.csv")
x_test= pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/audit/x_test.csv")

# Preview the first rows of the features and the labels.
display(x_train.head())
display(y_train.head())


Unnamed: 0,ID,Sector_score,LOCATION_ID,PARA_A,Score_A,Risk_A,PARA_B,Score_B,Risk_B,TOTAL,...,PROB,RiSk_E,History,Prob,Risk_F,Score,Inherent_Risk,CONTROL_RISK,Detection_Risk,Audit_Risk
0,0,2.37,16,0.01,0.2,0.002,0.007,0.2,0.0014,0.017,...,0.2,0.4,0,0.2,0.0,2.0,1.4034,0.4,0.5,0.28068
1,2,55.57,9,1.06,0.4,0.424,0.0,0.2,0.0,1.06,...,0.2,0.4,0,0.2,0.0,2.2,1.824,0.4,0.5,0.3648
2,3,55.57,16,2.42,0.6,1.452,3.53,0.6,2.118,5.95,...,0.2,0.4,0,0.2,0.0,3.8,7.494,0.4,0.5,1.4988
3,4,2.37,9,0.31,0.2,0.062,0.69,0.2,0.138,1.0,...,0.2,0.4,0,0.2,0.0,2.0,1.6,0.4,0.5,0.32
4,5,55.57,6,0.62,0.2,0.124,0.42,0.2,0.084,1.04,...,0.2,0.4,0,0.2,0.0,2.0,1.608,0.4,0.5,0.3216


Unnamed: 0,ID,Risk
0,0,0
1,2,0
2,3,1
3,4,0
4,5,0


In [55]:
# Remove the ID column from both feature frames and keep Risk as the target.
drop_col = ["ID"]

x_train_drop = x_train.drop(columns = drop_col)
x_test_drop = x_test.drop(columns = drop_col)
y = y_train["Risk"]

In [56]:
# One-hot encode train and test separately, then force test onto the training
# column layout: columns missing from test are added as 0, extras are dropped.
x_train_dummies = pd.get_dummies(x_train_drop)
x_test_dummies = pd.get_dummies(x_test_drop)
x_test_dummies = x_test_dummies.reindex(columns=x_train_dummies.columns, fill_value=0)

In [57]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the training rows for validation (no stratification is requested here).
X_train, X_valid, Y_train, Y_valid = train_test_split(x_train_dummies, y, test_size = 0.33, random_state = 42)

from sklearn.ensemble import RandomForestClassifier
# Default-hyperparameter random forest; it is fitted in the next cell.
rf = RandomForestClassifier(random_state = 23)

In [58]:
# Fit the forest on the training split only; the validation split stays unseen.
rf.fit(X_train, Y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [59]:
# Hard labels feed the threshold metrics; the [:, 1] column of predict_proba
# (probability of the second class, presumably Risk = 1) feeds ROC AUC.
predict_train_label = rf.predict(X_train)
predict_train_proba = rf.predict_proba(X_train)[:, 1]

predict_valid_label = rf.predict(X_valid)
predict_valid_proba = rf.predict_proba(X_valid)[:, 1]

In [60]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score, precision_score

# Print each binary-classification metric for the training and validation splits.
# NOTE(review): the recorded validation scores are all exactly 1.0. Feature
# columns such as Audit_Risk / Inherent_Risk may encode the target directly —
# check for label leakage before trusting these numbers.
print("train accuracy_score : ", accuracy_score(Y_train, predict_train_label))
print("validation accuracy_score : ", accuracy_score(Y_valid, predict_valid_label))
print("\n")

print("train f1_score : ", f1_score(Y_train, predict_train_label))
print("validation f1_score : ", f1_score(Y_valid, predict_valid_label))
print("\n")

print("train recall_score : ", recall_score(Y_train, predict_train_label))
print("validation recall_score : ", recall_score(Y_valid, predict_valid_label))
print("\n")

# ROC AUC is computed from probabilities, not from hard labels.
print("train roc_auc_score : ", roc_auc_score(Y_train, predict_train_proba))
print("validation roc_auc_score : ", roc_auc_score(Y_valid, predict_valid_proba))
print("\n")

print("train precision_score : ", precision_score(Y_train, predict_train_label))
print("validation precision_score : ", precision_score(Y_valid, predict_valid_label))

train accuracy_score :  1.0
validation accuracy_score :  1.0


train f1_score :  1.0
validation f1_score :  1.0


train recall_score :  1.0
validation recall_score :  1.0


train roc_auc_score :  1.0
validation roc_auc_score :  1.0


train precision_score :  1.0
validation precision_score :  1.0


## 센서데이터 동작유형 분류 데이터
데이터 설명 : 센서데이터로 동작 유형 분류 (종속변수 pose : 0 ,1 구분)  
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/muscle/x_train.csv  
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/muscle/y_train.csv  
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/muscle/x_test.csv  
x_label(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/muscle/y_test.csv  
데이터 출처 :https://www.kaggle.com/kyr7plus/emg-4(참고, 데이터 수정)

In [61]:
# Load the sensor (muscle) training features, training labels and test features.
import pandas as pd

x_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/muscle/x_train.csv")
y_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/muscle/y_train.csv")
x_test= pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/muscle/x_test.csv")

# Preview the first rows of the features and the labels.
display(x_train.head())
display(y_train.head())

Unnamed: 0,ID,motion_0,motion_1,motion_2,motion_3,motion_4,motion_5,motion_6,motion_7,motion_8,...,motion_54,motion_55,motion_56,motion_57,motion_58,motion_59,motion_60,motion_61,motion_62,motion_63
0,0,1.0,-2.0,-1.0,4.0,-5.0,-4.0,1.0,0.0,-15.0,...,0.0,-1.0,-13.0,-3.0,1.0,-1.0,-32.0,-22.0,-2.0,-3.0
1,2,20.0,0.0,0.0,1.0,5.0,6.0,-52.0,18.0,15.0,...,-70.0,-55.0,-38.0,-14.0,-12.0,-8.0,-34.0,-63.0,-87.0,-77.0
2,4,1.0,-1.0,1.0,4.0,-5.0,-8.0,1.0,-3.0,-14.0,...,1.0,12.0,-25.0,0.0,0.0,3.0,2.0,-27.0,1.0,0.0
3,5,13.0,2.0,1.0,-3.0,1.0,3.0,28.0,3.0,12.0,...,0.0,-21.0,-17.0,-2.0,0.0,-4.0,-17.0,-21.0,-21.0,25.0
4,6,-2.0,-7.0,-4.0,-8.0,16.0,44.0,1.0,3.0,-16.0,...,-1.0,2.0,-1.0,1.0,4.0,4.0,-17.0,-38.0,-3.0,3.0


Unnamed: 0,ID,pose
0,0,1
1,2,0
2,4,1
3,5,0
4,6,1


In [62]:
# Remove the ID column from both feature frames and keep pose as the target.
drop_col = ["ID"]

x_train_drop = x_train.drop(columns = drop_col)
x_test_drop = x_test.drop(columns = drop_col)
y = y_train["pose"]

In [63]:
# Select the test columns in the training column order (no encoding step is
# applied in this section; the previewed motion_* columns are numeric).
x_test_drop = x_test_drop[x_train_drop.columns]

In [64]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the training rows for validation.
X_train, X_valid, Y_train, Y_valid = train_test_split(x_train_drop, y, test_size = 0.33, random_state = 42)

from sklearn.ensemble import RandomForestClassifier
# Default-hyperparameter random forest with a fixed seed.
rf = RandomForestClassifier(random_state = 23)

rf.fit(X_train, Y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [65]:
# Hard labels feed the threshold metrics; the [:, 1] probability column
# (presumably pose = 1) feeds ROC AUC.
predict_train_label = rf.predict(X_train)
predict_train_proba = rf.predict_proba(X_train)[:, 1]

predict_valid_label = rf.predict(X_valid)
predict_valid_proba = rf.predict_proba(X_valid)[:, 1]

In [66]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score, precision_score

# Print each binary-classification metric for the training and held-out splits.
# NOTE: the lines labelled "test" are computed on the validation split
# (Y_valid / predict_valid_*), not on the x_test file.
print("train accuracy_score : ", accuracy_score(Y_train, predict_train_label))
print("test accuracy_score : ", accuracy_score(Y_valid, predict_valid_label))
print("\n")

print("train f1_score : ", f1_score(Y_train, predict_train_label))
print("test f1_score : ", f1_score(Y_valid, predict_valid_label))
print("\n")

print("train recall_score : ", recall_score(Y_train, predict_train_label))
print("test recall_score : ", recall_score(Y_valid, predict_valid_label))
print("\n")

# ROC AUC is computed from probabilities, not from hard labels.
print("train roc_auc_score : ", roc_auc_score(Y_train, predict_train_proba))
print("test roc_auc_score : ", roc_auc_score(Y_valid, predict_valid_proba))
print("\n")

print("train precision_score : ", precision_score(Y_train, predict_train_label))
print("test precision_score : ", precision_score(Y_valid, predict_valid_label))

train accuracy_score :  1.0
test accuracy_score :  0.9941368078175896


train f1_score :  1.0
test f1_score :  0.9941137998691956


train recall_score :  1.0
test recall_score :  0.9908735332464146


train roc_auc_score :  1.0
test roc_auc_score :  0.9999244553998261


train precision_score :  1.0
test precision_score :  0.9973753280839895


## 당뇨여부판단 데이터
데이터 설명 : 당뇨여부 판단하기 (종속변수 Outcome : 1 당뇨 , 0 :정상)  
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/diabetes/x_train.csv  
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/diabetes/y_train.csv  
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/diabetes/x_test.csv  
x_label(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/diabetes/y_test.csv  
데이터 출처 :https://www.kaggle.com/pritsheta/diabetes-dataset(참고, 데이터 수정)

In [67]:
# Load the diabetes training features, training labels and test features.
import pandas as pd

x_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/diabetes/x_train.csv")
y_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/diabetes/y_train.csv")
x_test= pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/diabetes/x_test.csv")

# Preview the first rows of the features and the labels.
display(x_train.head())
display(y_train.head())

Unnamed: 0,ID,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0,8,126,88,36,108,38.5,0.349,49
1,1,0,74,52,10,36,27.8,0.269,22
2,2,1,140,74,26,180,24.1,0.828,23
3,3,6,162,62,0,0,24.3,0.178,50
4,4,2,94,68,18,76,26.0,0.561,21


Unnamed: 0,ID,Outcome
0,0,0
1,1,0
2,2,0
3,3,1
4,4,0


In [68]:
# Remove the ID column from both feature frames and keep Outcome as the target.
drop_col = ["ID"]

x_train_drop = x_train.drop(columns = drop_col)
x_test_drop = x_test.drop(columns = drop_col)
y = y_train["Outcome"]

In [69]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the training rows for validation; the features are used
# as-is because this section has no encoding step.
X_train, X_valid, Y_train, Y_valid = train_test_split(x_train_drop, y, test_size = 0.33, random_state = 42)

from sklearn.ensemble import RandomForestClassifier
# Default-hyperparameter random forest with a fixed seed.
rf = RandomForestClassifier(random_state = 23)

rf.fit(X_train, Y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [70]:
# Hard labels feed the threshold metrics; the [:, 1] probability column
# (presumably Outcome = 1) feeds ROC AUC.
predict_train_label = rf.predict(X_train)
predict_train_proba = rf.predict_proba(X_train)[:, 1]

predict_valid_label = rf.predict(X_valid)
predict_valid_proba = rf.predict_proba(X_valid)[:, 1]

In [71]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score, precision_score

# Print each binary-classification metric for the training and held-out splits.
# NOTE: the lines labelled "test" are computed on the validation split
# (Y_valid / predict_valid_*), not on the x_test file.
print("train accuracy_score : ", accuracy_score(Y_train, predict_train_label))
print("test accuracy_score : ", accuracy_score(Y_valid, predict_valid_label))
print("\n")

print("train f1_score : ", f1_score(Y_train, predict_train_label))
print("test f1_score : ", f1_score(Y_valid, predict_valid_label))
print("\n")

print("train recall_score : ", recall_score(Y_train, predict_train_label))
print("test recall_score : ", recall_score(Y_valid, predict_valid_label))
print("\n")

# ROC AUC is computed from probabilities, not from hard labels.
print("train roc_auc_score : ", roc_auc_score(Y_train, predict_train_proba))
print("test roc_auc_score : ", roc_auc_score(Y_valid, predict_valid_proba))
print("\n")

print("train precision_score : ", precision_score(Y_train, predict_train_label))
print("test precision_score : ", precision_score(Y_valid, predict_valid_label))

train accuracy_score :  1.0
test accuracy_score :  0.7733990147783252


train f1_score :  1.0
test f1_score :  0.6349206349206349


train recall_score :  1.0
test recall_score :  0.6557377049180327


train roc_auc_score :  1.0
test roc_auc_score :  0.8227314707919648


train precision_score :  1.0
test precision_score :  0.6153846153846154


# 회귀

## 학생성적 예측 데이터
데이터 설명 : 학생성적 예측 (종속변수 :G3)  
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/X_train.csv  
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/y_train.csv  
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/X_test.csv  
x_label(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/y_test.csv  
데이터 출처 :https://www.kaggle.com/datasets/ishandutta/student-performance-data-set (참고, 데이터 수정)

In [72]:
# Load the student-score training features, training labels and test features.
import pandas as pd

x_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/X_train.csv")
y_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/y_train.csv")
x_test= pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/X_test.csv")

# Preview the first rows of the features and the labels.
display(x_train.head())
display(y_train.head())

Unnamed: 0,StudentID,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,...,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2
0,1714,GP,F,18,U,GT3,T,4,3,other,...,no,4,3,3,1,1,3,0,14,13
1,1254,GP,F,17,U,GT3,T,4,3,health,...,yes,4,4,3,1,3,4,0,13,15
2,1639,GP,F,16,R,GT3,T,4,4,health,...,no,2,4,4,2,3,4,6,10,11
3,1118,GP,M,16,U,GT3,T,4,4,services,...,no,5,3,3,1,3,5,0,15,13
4,1499,GP,M,19,U,GT3,T,3,2,services,...,yes,4,5,4,1,1,4,0,5,0


Unnamed: 0,StudentID,G3
0,1714,14
1,1254,15
2,1639,11
3,1118,13
4,1499,0


In [73]:
# Remove the StudentID column from both feature frames and keep G3 as the target.
drop_col = ['StudentID']

x_train_drop = x_train.drop(columns = drop_col)
x_test_drop = x_test.drop(columns = drop_col)
y = y_train['G3']

In [74]:
# One-hot encode the categorical feature columns of train and test separately.
x_train_dummies = pd.get_dummies(x_train_drop)
x_test_dummies = pd.get_dummies(x_test_drop)
# Align test to the training columns. Unlike plain column selection, reindex
# does not raise KeyError when a category seen in train is absent from test:
# the missing dummy column is created and filled with 0, and columns that
# exist only in test are dropped.
x_test_dummies = x_test_dummies.reindex(columns = x_train_dummies.columns, fill_value = 0)

In [75]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the training rows for validation.
X_train, X_valid, Y_train, Y_valid = train_test_split(x_train_dummies, y, test_size = 0.33, random_state = 42)

from sklearn.ensemble import RandomForestRegressor
# Default-hyperparameter random forest regressor with a fixed seed.
rf = RandomForestRegressor(random_state = 42)

rf.fit(X_train, Y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [76]:
# Predict on both splits so train and validation errors can be compared.
y_train_pred = rf.predict(X_train)
y_valid_pred = rf.predict(X_valid)

In [77]:
from sklearn.metrics import mean_squared_error ,mean_absolute_error ,mean_absolute_percentage_error, r2_score
import numpy as np

# Print each regression metric for the training split and the validation split.

# MSE
print('train mse : ', mean_squared_error(Y_train, y_train_pred))
print('validation mse : ', mean_squared_error(Y_valid, y_valid_pred))

# MAE
print('train mae : ', mean_absolute_error(Y_train, y_train_pred))
print('validation mae : ', mean_absolute_error(Y_valid, y_valid_pred))
print("\n")

# MAPE
# NOTE(review): the recorded MAPE values are on the order of 1e14. The y_train
# preview contains G3 = 0, and a percentage error divides by the true value, so
# zero targets presumably blow the metric up — treat MAPE as not meaningful
# for this dataset; confirm.
print('train mape : ', mean_absolute_percentage_error(Y_train, y_train_pred))
print('validation mape : ', mean_absolute_percentage_error(Y_valid, y_valid_pred))
print("\n")

# RMSE, taken as the square root of MSE
print('train rmse : ', np.sqrt(mean_squared_error(Y_train, y_train_pred)))
print('validation rmse : ', np.sqrt(mean_squared_error(Y_valid, y_valid_pred)))
print("\n")

# R^2
print('train r2 score : ', r2_score(Y_train, y_train_pred))
print('validation r2 score : ', r2_score(Y_valid, y_valid_pred))

train mse :  0.2790762114537445
validation mse :  2.3451187500000006
train mae :  0.340396475770925
validation mae :  0.9733035714285715


train mape :  262081722808653.12
validation mape :  516104475154467.1


train rmse :  0.5282766429189771
validation rmse :  1.5313780558699412


train r2 score :  0.9815997199112519
validation r2 score :  0.8276914043886687


## ⚠️ 중고차 가격 예측 데이터
데이터 설명 : 중고차 가격 예측 데이터 (종속변수 : price)  
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/X_train.csv  
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/y_train.csv  
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/X_test.csv  
x_label(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/y_test.csv  
데이터 출처 :https://www.kaggle.com/datasets/adityadesai13/used-car-dataset-ford-and-mercedes?select=vw.csv (참고, 데이터 수정)

In [78]:
# Load the used-car price training features, training labels and test features.
import pandas as pd

x_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/X_train.csv")
y_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/y_train.csv")
x_test= pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/X_test.csv")

# Preview the first rows of the features and the labels.
display(x_train.head())
display(y_train.head())

Unnamed: 0,carID,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,13207,hyundi,Santa Fe,2019,Semi-Auto,4223,Diesel,145.0,39.8,2.2
1,17314,vauxhall,GTC,2015,Manual,47870,Diesel,125.0,60.1,2.0
2,12342,audi,RS4,2019,Automatic,5151,Petrol,145.0,29.1,2.9
3,13426,vw,Scirocco,2016,Automatic,20423,Diesel,30.0,57.6,2.0
4,16004,skoda,Scala,2020,Semi-Auto,3569,Petrol,145.0,47.1,1.0


Unnamed: 0,carID,price
0,13207,31995
1,17314,7700
2,12342,58990
3,13426,12999
4,16004,16990


In [79]:
# Remove the carID column from both feature frames and keep price as the target.
drop_col = ["carID"]

x_train_drop = x_train.drop(columns = drop_col)
x_test_drop = x_test.drop(columns = drop_col)
y = y_train["price"]

#### 방법 1(야매)
- train에만 있고 test에는 없는 model들이 있기에 합쳐서 원핫 인코딩을 진행

In [80]:
# Method 1: one-hot encode train and test together so both end up with the
# same dummy columns, then split the rows back apart by position.
combined = pd.concat([x_train_drop, x_test_drop])
combined_dummies = pd.get_dummies(combined)

# Train rows come first in the concatenation, so the boundary between the two
# parts is len(x_train_drop). Starting the test slice at len(x_test_drop)
# would return the wrong rows whenever the two frames differ in length.
x_train_dummies = combined_dummies.iloc[:len(x_train_drop)]
x_test_dummies = combined_dummies.iloc[len(x_train_drop):]

#### 방법 2(정석)
- test를 train과 동일한 컬럼 구성으로 맞춤: train에는 있으나 test에 없는 컬럼은 0으로 채워서 추가하고, test에는 있으나 train에 없는 컬럼은 제거 (행이 아니라 컬럼을 제거)


In [81]:
# Method 2: encode train and test separately. This reassigns the
# x_train_dummies / x_test_dummies names that the combined-encoding cell set.
x_train_dummies = pd.get_dummies(x_train_drop)
x_test_dummies = pd.get_dummies(x_test_drop)

# Reorder test to the training columns; columns missing from test are added and filled with 0
x_test_dummies = x_test_dummies.reindex(columns = x_train_dummies.columns, fill_value = 0)

In [82]:
# Sanity check: (rows, columns) of the aligned test frame.
x_test_dummies.shape

(2672, 113)

In [83]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the training rows for validation.
X_train, X_valid, Y_train, Y_valid = train_test_split(x_train_dummies, y, test_size = 0.33, random_state = 42)

from sklearn.ensemble import RandomForestRegressor
# Default-hyperparameter random forest regressor with a fixed seed.
rf = RandomForestRegressor(random_state = 23)

rf.fit(X_train, Y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [84]:
# Predict on both splits so train and validation errors can be compared.
y_train_pred = rf.predict(X_train)
y_valid_pred = rf.predict(X_valid)

In [85]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, root_mean_squared_error, r2_score
# Print each regression metric for the training and held-out splits.
# NOTE: the lines labelled "test" are computed on the validation split
# (Y_valid / y_valid_pred), not on the x_test file.
print("train mean_squared_error : ", mean_squared_error(Y_train, y_train_pred))
print("test mean_squared_error : ", mean_squared_error(Y_valid, y_valid_pred))
print("\n")

print("train mean_absolute_error : ", mean_absolute_error(Y_train, y_train_pred))
print("test mean_absolute_error : ", mean_absolute_error(Y_valid, y_valid_pred))
print("\n")

print("train mean_absolute_percentage_error : ", mean_absolute_percentage_error(Y_train, y_train_pred))
print("test mean_absolute_percentage_error : ", mean_absolute_percentage_error(Y_valid, y_valid_pred))
print("\n")

print("train root_mean_squared_error : ", root_mean_squared_error(Y_train, y_train_pred))
print("test root_mean_squared_error : ", root_mean_squared_error(Y_valid, y_valid_pred))
print("\n")

print("train r2_score : ", r2_score(Y_train, y_train_pred))
print("test r2_score : ", r2_score(Y_valid, y_valid_pred))

train mean_squared_error :  1986588.2508342846
test mean_squared_error :  17529155.091796063


train mean_absolute_error :  754.8038318414154
test mean_absolute_error :  2205.895988978373


train mean_absolute_percentage_error :  0.0364904279746303
test mean_absolute_percentage_error :  0.10665481251864602


train root_mean_squared_error :  1409.4638167878893
test root_mean_squared_error :  4186.783382478256


train r2_score :  0.9925293019211969
test r2_score :  0.9363581815226119


## 의료 비용 예측 데이터
데이터 설명 : 의료비용 예측문제 (종속변수 :charges)  
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/x_train.csv  
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/y_train.csv  
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/x_test.csv  
x_label(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/y_test.csv  
데이터 출처 :https://www.kaggle.com/mirichoi0218/insurance/code(참고, 데이터 수정)

In [86]:
# Load the medical-cost training features, training labels and test features.
import pandas as pd

x_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/x_train.csv")
y_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/y_train.csv")
x_test= pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/x_test.csv")

# Preview the first rows of the features and the labels.
display(x_train.head())
display(y_train.head())

Unnamed: 0,ID,age,sex,bmi,children,smoker,region
0,2,35,female,35.86,2,no,southeast
1,3,28,female,23.845,2,no,northwest
2,4,23,female,32.78,2,yes,southeast
3,6,52,female,25.3,2,yes,southeast
4,7,63,male,39.8,3,no,southwest


Unnamed: 0,ID,charges
0,2,5836.5204
1,3,4719.73655
2,4,36021.0112
3,6,24667.419
4,7,15170.069


In [87]:
# Remove the ID column from both feature frames and keep charges as the target.
drop_col = ["ID"]

x_train_drop = x_train.drop(columns = drop_col)
x_test_drop = x_test.drop(columns = drop_col)
y = y_train["charges"]

In [88]:
# One-hot encode the categorical feature columns of train and test separately.
x_train_dummies = pd.get_dummies(x_train_drop)
x_test_dummies = pd.get_dummies(x_test_drop)
# Align test to the training columns. Unlike plain column selection, reindex
# does not raise KeyError when a category seen in train is absent from test:
# the missing dummy column is created and filled with 0, and columns that
# exist only in test are dropped.
x_test_dummies = x_test_dummies.reindex(columns = x_train_dummies.columns, fill_value = 0)

In [89]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the training rows for validation.
X_train, X_valid, Y_train, Y_valid = train_test_split(x_train_dummies, y, test_size = 0.33, random_state = 42)

from sklearn.ensemble import RandomForestRegressor
# Default-hyperparameter random forest regressor with a fixed seed.
rf = RandomForestRegressor(random_state = 23)

rf.fit(X_train, Y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [90]:
# Predict on both splits so train and validation errors can be compared.
y_train_pred = rf.predict(X_train)
y_valid_pred = rf.predict(X_valid)

In [91]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, root_mean_squared_error, r2_score
# Print each regression metric for the training and held-out splits.
# NOTE: the lines labelled "test" are computed on the validation split
# (Y_valid / y_valid_pred), not on the x_test file.
print("train mean_squared_error : ", mean_squared_error(Y_train, y_train_pred))
print("test mean_squared_error : ", mean_squared_error(Y_valid, y_valid_pred))
print("\n")

print("train mean_absolute_error : ", mean_absolute_error(Y_train, y_train_pred))
print("test mean_absolute_error : ", mean_absolute_error(Y_valid, y_valid_pred))
print("\n")

print("train mean_absolute_percentage_error : ", mean_absolute_percentage_error(Y_train, y_train_pred))
print("test mean_absolute_percentage_error : ", mean_absolute_percentage_error(Y_valid, y_valid_pred))
print("\n")

print("train root_mean_squared_error : ", root_mean_squared_error(Y_train, y_train_pred))
print("test root_mean_squared_error : ", root_mean_squared_error(Y_valid, y_valid_pred))
print("\n")

print("train r2_score : ", r2_score(Y_train, y_train_pred))
print("test r2_score : ", r2_score(Y_valid, y_valid_pred))

train mean_squared_error :  3454796.8623625035
test mean_squared_error :  25529911.774004176


train mean_absolute_error :  1025.3255053751948
test mean_absolute_error :  2793.212601012995


train mean_absolute_percentage_error :  0.12639594706192733
test mean_absolute_percentage_error :  0.31589157326620154


train root_mean_squared_error :  1858.7083855092772
test root_mean_squared_error :  5052.713308115173


train r2_score :  0.9770049497653485
test r2_score :  0.8259510698381025


## 킹카운티 주거지 가격예측문제 데이터
데이터 설명 : 킹카운티 주거지 가격 예측문제 (종속변수 :price)  
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/x_train.csv  
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/y_train.csv  
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/x_test.csv  
x_label(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/y_test.csv  
데이터 출처 :https://www.kaggle.com/harlfoxem/housesalesprediction (참고, 데이터 수정)

In [92]:
# Load the King County house-price training features, training labels and test features.
import pandas as pd

x_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/x_train.csv")
y_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/y_train.csv")
x_test= pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/x_test.csv")

# Preview the first rows of the features and the labels.
display(x_train.head())
display(y_train.head())

Unnamed: 0,ID,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,2,8651400730,20150428T000000,3,1.0,840,5525,1.0,0,0,...,6,840,0,1969,0,98042,47.3607,-122.085,920,5330
1,3,3163600130,20150317T000000,3,1.0,1250,8000,1.0,0,0,...,7,1250,0,1956,0,98146,47.5065,-122.337,1040,6973
2,4,5045700330,20140725T000000,4,2.5,2200,6400,2.0,0,0,...,8,2200,0,2010,0,98059,47.4856,-122.156,2600,5870
3,5,1036100130,20140808T000000,3,2.5,1980,39932,2.0,0,0,...,8,1980,0,1994,0,98011,47.7433,-122.196,2610,12769
4,6,7696630080,20140506T000000,3,1.75,1690,7735,1.0,0,0,...,7,1060,630,1976,0,98001,47.3324,-122.28,1580,7503


Unnamed: 0,ID,price
0,2,191000.0
1,3,234900.0
2,4,460000.0
3,5,442000.0
4,6,197000.0


In [93]:
# Parse the date strings (e.g. 20150428T000000 in the preview) into datetimes
# so that calendar parts can be extracted. The same steps are applied to test.
x_train['date'] = pd.to_datetime(x_train['date'])
x_test['date'] = pd.to_datetime(x_test['date'])

# Expose year / month / day as separate numeric feature columns.
x_train["year"] = x_train["date"].dt.year
x_test["year"] = x_test["date"].dt.year

x_train["month"] = x_train["date"].dt.month
x_test["month"] = x_test["date"].dt.month

x_train["day"] = x_train["date"].dt.day
x_test["day"] = x_test["date"].dt.day

In [94]:
# Remove both identifier columns and the raw date column, whose year / month /
# day parts were already extracted into separate columns.
drop_col = ["ID", "id", "date"]
x_train_drop = x_train.drop(columns = drop_col)
x_test_drop = x_test.drop(columns = drop_col)

In [95]:
# One-hot encode any categorical columns of train and test separately.
x_train_dummies = pd.get_dummies(x_train_drop)
x_test_dummies = pd.get_dummies(x_test_drop)
# Align test to the training column set and order, as the other sections do,
# so the fitted model always sees the features in the layout it was trained
# on: columns missing from test are added as 0, test-only columns are dropped.
x_test_dummies = x_test_dummies.reindex(columns = x_train_dummies.columns, fill_value = 0)
# Target column for the regressor.
y = y_train["price"]

In [96]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the training rows for validation.
X_train, X_valid, Y_train, Y_valid = train_test_split(x_train_dummies, y, test_size = 0.33, random_state = 42)

from sklearn.ensemble import RandomForestRegressor
# Default-hyperparameter random forest regressor with a fixed seed.
rf = RandomForestRegressor(random_state = 23)

rf.fit(X_train, Y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [97]:
# Predict on both splits so train and validation errors can be compared.
y_train_pred = rf.predict(X_train)
y_valid_pred = rf.predict(X_valid)

In [98]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, root_mean_squared_error, r2_score
# Print each regression metric for the training and held-out splits.
# NOTE: the lines labelled "test" are computed on the validation split
# (Y_valid / y_valid_pred), not on the x_test file.
print("train mean_squared_error : ", mean_squared_error(Y_train, y_train_pred))
print("test mean_squared_error : ", mean_squared_error(Y_valid, y_valid_pred))
print("\n")

print("train mean_absolute_error : ", mean_absolute_error(Y_train, y_train_pred))
print("test mean_absolute_error : ", mean_absolute_error(Y_valid, y_valid_pred))
print("\n")

print("train mean_absolute_percentage_error : ", mean_absolute_percentage_error(Y_train, y_train_pred))
print("test mean_absolute_percentage_error : ", mean_absolute_percentage_error(Y_valid, y_valid_pred))
print("\n")

print("train root_mean_squared_error : ", root_mean_squared_error(Y_train, y_train_pred))
print("test root_mean_squared_error : ", root_mean_squared_error(Y_valid, y_valid_pred))
print("\n")

print("train r2_score : ", r2_score(Y_train, y_train_pred))
print("test r2_score : ", r2_score(Y_valid, y_valid_pred))

train mean_squared_error :  2631711324.8341956
test mean_squared_error :  20297689288.931293


train mean_absolute_error :  26445.86737051105
test mean_absolute_error :  72271.17606905013


train mean_absolute_percentage_error :  0.049567151764564825
test mean_absolute_percentage_error :  0.13339926249521628


train root_mean_squared_error :  51300.207843966826
test root_mean_squared_error :  142469.95925082345


train r2_score :  0.9803092298195151
test r2_score :  0.8670263922518943


## 대학원 입학가능성 데이터
데이터 설명 : 대학원 입학 가능성 예측 (종속변수 :Chance of Admit)  
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/admission/x_train.csv  
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/admission/y_train.csv  
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/admission/x_test.csv  
x_label(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/admission/y_test.csv  
데이터 출처 :https://www.kaggle.com/mohansacharya/graduate-admissions(참고, 데이터 수정)

In [99]:
# Load the graduate-admission training features, training labels and test features.
import pandas as pd

x_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/admission/x_train.csv")
y_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/admission/y_train.csv")
x_test= pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/admission/x_test.csv")

# Preview the first rows of the features and the labels.
display(x_train.head())
display(y_train.head())

Unnamed: 0,ID,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,0,67,327,114,3,3.0,3.0,9.02,0
1,1,112,321,109,4,4.0,4.0,8.68,1
2,2,495,301,99,3,2.5,2.0,8.45,1
3,3,356,317,106,2,2.0,3.5,8.12,0
4,4,250,321,111,3,3.5,4.0,8.83,1


Unnamed: 0,ID,Chance of Admit
0,0,0.61
1,1,0.69
2,2,0.68
3,3,0.73
4,4,0.77


In [100]:
# Remove the ID and Serial No. columns from both feature frames and keep
# Chance of Admit as the target.
drop_col = ["ID", "Serial No."]

x_train_drop = x_train.drop(columns = drop_col)
x_test_drop = x_test.drop(columns = drop_col)
y = y_train["Chance of Admit"]

In [101]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the training rows for validation. The validation features
# are bound to the notebook-wide name X_valid so that it no longer keeps
# pointing at the previous section's validation set after this cell runs.
X_train, X_valid, Y_train, Y_valid = train_test_split(x_train_drop, y, test_size = 0.33, random_state = 42)
# Backward-compatible alias: the prediction cell of this section refers to the
# capitalised name.
X_Valid = X_valid

from sklearn.ensemble import RandomForestRegressor
# Default-hyperparameter random forest regressor with a fixed seed.
rf = RandomForestRegressor(random_state = 23)

rf.fit(X_train, Y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [102]:
# Predict on both splits so train and validation errors can be compared.
# X_Valid (capital V) is the validation-feature name assigned by this
# section's split cell.
y_train_pred = rf.predict(X_train)
y_valid_pred = rf.predict(X_Valid)

In [103]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, root_mean_squared_error, r2_score
# Regression metrics to report, in display order: (printed label, scorer).
metric_table = (
    ("mean_squared_error", mean_squared_error),
    ("mean_absolute_error", mean_absolute_error),
    ("mean_absolute_percentage_error", mean_absolute_percentage_error),
    ("root_mean_squared_error", root_mean_squared_error),
    ("r2_score", r2_score),
)

for position, (label, metric) in enumerate(metric_table):
    # Blank separator between metric groups (none before the first group).
    if position > 0:
        print("\n")
    # The "test" line is computed on the held-out validation split.
    print(f"train {label} : ", metric(Y_train, y_train_pred))
    print(f"test {label} : ", metric(Y_valid, y_valid_pred))

train mean_squared_error :  0.000596769402985074
test mean_squared_error :  0.004077382575757574


train mean_absolute_error :  0.017350746268656736
test mean_absolute_error :  0.04627121212121217


train mean_absolute_percentage_error :  0.028308942949956506
test mean_absolute_percentage_error :  0.06964620608430996


train root_mean_squared_error :  0.024428864136203182
test root_mean_squared_error :  0.06385438572061886


train r2_score :  0.9713037619053357
test r2_score :  0.7712028992088238


## 레드 와인 퀄리티 예측 데이터
데이터 설명 : 레드 와인 퀄리티 예측문제 (종속변수 :quality)  
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/redwine/x_train.csv  
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/redwine/y_train.csv  
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/redwine/x_test.csv  
x_label(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/redwine/y_test.csv  
데이터 출처 : https://www.kaggle.com/uciml/red-wine-quality-cortez-et-al-2009 (참고, 데이터 수정)

In [104]:
import pandas as pd

# Download the red-wine train features, train targets and test features
# from the Datamanim data repository (evaluated in that order).
x_train, y_train, x_test = (
    pd.read_csv(csv_url)
    for csv_url in (
        "https://raw.githubusercontent.com/Datamanim/datarepo/main/redwine/x_train.csv",
        "https://raw.githubusercontent.com/Datamanim/datarepo/main/redwine/y_train.csv",
        "https://raw.githubusercontent.com/Datamanim/datarepo/main/redwine/x_test.csv",
    )
)

# Preview the first rows of the training features, then the training targets.
display(x_train.head(), y_train.head())

Unnamed: 0,ID,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,1,10.6,0.44,0.68,4.1,0.114,6.0,24.0,0.997,3.06,0.66,13.4
1,2,7.0,0.6,0.3,4.5,0.068,20.0,110.0,0.99914,3.3,1.17,10.2
2,3,8.0,0.43,0.36,2.3,0.075,10.0,48.0,0.9976,3.34,0.46,9.4
3,4,7.9,0.53,0.24,2.0,0.072,15.0,105.0,0.996,3.27,0.54,9.4
4,5,8.0,0.45,0.23,2.2,0.094,16.0,29.0,0.9962,3.21,0.49,10.2


Unnamed: 0,ID,quality
0,1,6
1,2,5
2,3,5
3,4,6
4,5,6


In [105]:
# Column removed from both feature frames before modelling.
drop_col = ["ID"]

# Apply the same column removal to the train and test features.
x_train_drop, x_test_drop = (
    features.drop(columns=drop_col) for features in (x_train, x_test)
)

# Regression target taken from the label frame.
y = y_train["quality"]

In [106]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Hold out 33% of the labelled rows for validation; the seed is fixed so the
# split is reproducible.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    x_train_drop,
    y,
    test_size=0.33,
    random_state=42,
)

# Random forest with default hyper-parameters and a fixed seed.
rf = RandomForestRegressor(random_state=23)

# Kept as the last expression so the notebook displays the fitted estimator.
rf.fit(X_train, Y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [107]:
# Predict on the training split first, then on the held-out validation split.
y_train_pred, y_valid_pred = map(rf.predict, (X_train, X_valid))

In [108]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, root_mean_squared_error, r2_score
# Regression metrics to report, in display order: (printed label, scorer).
metric_table = (
    ("mean_squared_error", mean_squared_error),
    ("mean_absolute_error", mean_absolute_error),
    ("mean_absolute_percentage_error", mean_absolute_percentage_error),
    ("root_mean_squared_error", root_mean_squared_error),
    ("r2_score", r2_score),
)

for position, (label, metric) in enumerate(metric_table):
    # Blank separator between metric groups (none before the first group).
    if position > 0:
        print("\n")
    # The "test" line is computed on the held-out validation split.
    print(f"train {label} : ", metric(Y_train, y_train_pred))
    print(f"test {label} : ", metric(Y_valid, y_valid_pred))

train mean_squared_error :  0.04558364485981308
test mean_squared_error :  0.3847130023640662


train mean_absolute_error :  0.15469626168224293
test mean_absolute_error :  0.45186761229314426


train mean_absolute_percentage_error :  0.028675303182020464
test mean_absolute_percentage_error :  0.0808550320837555


train root_mean_squared_error :  0.21350326662562585
test root_mean_squared_error :  0.6202523698979845


train r2_score :  0.9295817642098162
test r2_score :  0.4182584612264215


## 현대 차량 가격 회귀문제 데이터
데이터 설명 : 현대 차량가격 회귀문제 (종속변수 :price)  
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/hyundai/x_train.csv  
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/hyundai/y_train.csv  
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/hyundai/x_test.csv  
x_label(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/hyundai/y_test.csv  
데이터 출처 : https://www.kaggle.com/mysarahmadbhat/hyundai-used-car-listing (참고, 데이터 수정)

In [109]:
import pandas as pd

# Download the Hyundai listing train features, train targets and test features
# from the Datamanim data repository (evaluated in that order).
x_train, y_train, x_test = (
    pd.read_csv(csv_url)
    for csv_url in (
        "https://raw.githubusercontent.com/Datamanim/datarepo/main/hyundai/x_train.csv",
        "https://raw.githubusercontent.com/Datamanim/datarepo/main/hyundai/y_train.csv",
        "https://raw.githubusercontent.com/Datamanim/datarepo/main/hyundai/x_test.csv",
    )
)

# Preview the first rows of the training features, then the training targets.
display(x_train.head(), y_train.head())

Unnamed: 0,ID,model,year,transmission,mileage,fuelType,tax(£),mpg,engineSize
0,0,I30,2019,Manual,21,Petrol,150,34.0,2.0
1,1,Santa Fe,2018,Semi-Auto,10500,Diesel,145,39.8,2.2
2,2,Tucson,2017,Manual,29968,Diesel,30,61.7,1.7
3,3,Kona,2018,Manual,27317,Petrol,145,52.3,1.0
4,4,Tucson,2018,Semi-Auto,31459,Diesel,145,57.7,1.7


Unnamed: 0,ID,price
0,0,23995
1,1,28490
2,2,13251
3,3,14990
4,4,17591


In [110]:
# Column removed from both feature frames before modelling.
drop_col = ["ID"]

# Apply the same column removal to the train and test features.
x_train_drop, x_test_drop = (
    features.drop(columns=drop_col) for features in (x_train, x_test)
)

# Regression target taken from the label frame.
y = y_train["price"]

In [111]:
# One-hot encode the non-numeric columns of the train and test features
# independently.
x_train_dummies = pd.get_dummies(x_train_drop)
x_test_dummies = pd.get_dummies(x_test_drop)

# Align the test frame to the training columns: dummy columns missing from the
# test data are added and filled with 0, columns seen only in the test data are
# dropped, and the column order matches training. The result is assigned back
# to `x_test_dummies`; it previously went to a misspelled name, which left
# `x_test_dummies` unaligned.
x_test_dummies = x_test_dummies.reindex(columns=x_train_dummies.columns, fill_value=0)

In [112]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Hold out 33% of the one-hot encoded rows for validation; the seed is fixed
# so the split is reproducible.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    x_train_dummies,
    y,
    test_size=0.33,
    random_state=42,
)

# Random forest with default hyper-parameters and a fixed seed.
rf = RandomForestRegressor(random_state=23)

# Kept as the last expression so the notebook displays the fitted estimator.
rf.fit(X_train, Y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [113]:
# Predict on the training split first, then on the held-out validation split.
y_train_pred, y_valid_pred = map(rf.predict, (X_train, X_valid))

In [114]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, root_mean_squared_error, r2_score
# Regression metrics to report, in display order: (printed label, scorer).
metric_table = (
    ("mean_squared_error", mean_squared_error),
    ("mean_absolute_error", mean_absolute_error),
    ("mean_absolute_percentage_error", mean_absolute_percentage_error),
    ("root_mean_squared_error", root_mean_squared_error),
    ("r2_score", r2_score),
)

for position, (label, metric) in enumerate(metric_table):
    # Blank separator between metric groups (none before the first group).
    if position > 0:
        print("\n")
    # The "test" line is computed on the held-out validation split.
    print(f"train {label} : ", metric(Y_train, y_train_pred))
    print(f"test {label} : ", metric(Y_valid, y_valid_pred))

train mean_squared_error :  228437.56107188563
test mean_squared_error :  1564725.6191337807


train mean_absolute_error :  321.2707706725878
test mean_absolute_error :  844.370221594972


train mean_absolute_percentage_error :  0.02695461570100577
test mean_absolute_percentage_error :  0.06812765355214671


train root_mean_squared_error :  477.95142124685185
test root_mean_squared_error :  1250.889930862736


train r2_score :  0.9931718409694613
test r2_score :  0.9574141985048705
