# 2회 기출

전자상거래 배송 데이터

제품 배송 시간에 맞춰 배송되었는지 예측모델 만들기

학습용 데이터 (X_train, y_train)을 이용하여 배송 예측 모형을 만든 후, 이를 평가용 데이터(X_test)에 적용하여 얻은 예측 확률값을 다음과 같은 형식의 CSV파일로 생성하시오(제출한 모델의 성능은 ROC-AUC 평가지표에 따라 채점)

ID, Reached.on.Time_Y.N
4733,0.6
2040,0.8
5114,0.45
2361,0.23
5996,0.43
[시험용 데이터셋 만들기] 코드는 예시문제와 동일한 형태의 X_train, y_train, X_test 데이터를 만들기 위함임

(유의사항)

성능이 우수한 예측모형을 구축하기 위해서는 적절한 데이터 전처리, 피처엔지니어링, 분류알고리즘, 하이퍼파라미터 튜닝, 모형 앙상블 등이 수반되어야 한다.
수험번호.csv파일이 만들어지도록 코드를 제출한다.
제출한 모델의 성능은 ROC-AUC형태로 읽어들인다.

In [1]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name
    
    if null_name != "":
        df[df == null_name] = np.nan
    
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)
    
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    
    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test 
    

In [2]:
df = pd.read_csv("../data/2_Train.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='Reached.on.Time_Y.N', id_name='ID')

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

X_train.info()
y_train.info()

(8799, 11) (2200, 11) (8799, 2) (2200, 2)
<class 'pandas.core.frame.DataFrame'>
Index: 8799 entries, 3999 to 9332
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   8799 non-null   int64 
 1   Warehouse_block      8799 non-null   object
 2   Mode_of_Shipment     8799 non-null   object
 3   Customer_care_calls  8799 non-null   int64 
 4   Customer_rating      8799 non-null   int64 
 5   Cost_of_the_Product  8799 non-null   int64 
 6   Prior_purchases      8799 non-null   int64 
 7   Product_importance   8799 non-null   object
 8   Gender               8799 non-null   object
 9   Discount_offered     8799 non-null   int64 
 10  Weight_in_gms        8799 non-null   int64 
dtypes: int64(7), object(4)
memory usage: 824.9+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 8799 entries, 3999 to 9332
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype
---  ------ 

In [4]:
df = pd.read_csv("../data/2_Train.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='Reached.on.Time_Y.N', id_name='ID')

# feature selection
X_train.head()

X_train = X_train.drop("ID", axis=1)
y_train = y_train.drop("ID", axis=1)
y_train = y_train.squeeze()

ordinal_col = ["Product_importance"]
s_col = X_train.select_dtypes(include="object").columns
n_col = X_train.select_dtypes(exclude="object").columns

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
scaler = MinMaxScaler()
encoder = OneHotEncoder()

X_train[n_col] = scaler.fit_transform(X_train[n_col])
X_test[n_col] = scaler.transform(X_test[n_col])

encoded = encoder.fit_transform(X_train[s_col]).toarray()
en_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(s_col), index=X_train.index)
X_train = pd.concat([X_train.drop(s_col, axis=1), en_df], axis=1)

encoded = encoder.transform(X_test[s_col]).toarray()
en_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(s_col), index=X_test.index)
X_test = pd.concat([X_test.drop(s_col, axis=1), en_df], axis=1)

# train validation set split
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

# modeling
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X_train, y_train)
y_val_pred = model.predict(X_val)

from sklearn.metrics import roc_auc_score, accuracy_score

roc_score = roc_auc_score(y_val, y_val_pred)
acc_score = accuracy_score(y_val, y_val_pred)

print(roc_score, acc_score)

y_test_proba = model.predict_proba(X_test.drop("ID", axis=1))[:,-1]

result = pd.DataFrame({"ID":X_test["ID"].reset_index(drop=True), "Reached.on.Time_Y.N":y_test_proba})
print(result.describe())

(7039, 19) (1760, 19) (7039,) (1760,)
0.678275180768612 0.6647727272727273
                 ID  Reached.on.Time_Y.N
count   2200.000000          2200.000000
mean    5617.505000             0.578527
std     3154.909089             0.256034
min        2.000000             0.130000
25%     2940.750000             0.390000
50%     5566.500000             0.490000
75%     8406.250000             0.820000
max    10993.000000             1.000000
