# 2회 기출

전자상거래 배송 데이터

제품 배송 시간에 맞춰 배송되었는지 예측모델 만들기

학습용 데이터 (X_train, y_train)을 이용하여 배송 예측 모형을 만든 후, 이를 평가용 데이터(X_test)에 적용하여 얻은 예측 확률값을 다음과 같은 형식의 CSV파일로 생성하시오(제출한 모델의 성능은 ROC-AUC 평가지표에 따라 채점)

ID, Reached.on.Time_Y.N
4733,0.6
2040,0.8
5114,0.45
2361,0.23
5996,0.43
[시험용 데이터셋 만들기] 코드는 예시문제와 동일한 형태의 X_train, y_train, X_test 데이터를 만들기 위함임

(유의사항)

성능이 우수한 예측모형을 구축하기 위해서는 적절한 데이터 전처리, 피처엔지니어링, 분류알고리즘, 하이퍼파라미터 튜닝, 모형 앙상블 등이 수반되어야 한다.
수험번호.csv파일이 만들어지도록 코드를 제출한다.
제출한 모델의 성능은 ROC-AUC형태로 읽어들인다.

In [1]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name
    
    if null_name != "":
        df[df == null_name] = np.nan
    
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)
    
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    
    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test 
    

In [2]:
df = pd.read_csv("../data/2_Train.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='Reached.on.Time_Y.N', id_name='ID')

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

X_train.info()
y_train.info()

(8799, 11) (2200, 11) (8799, 2) (2200, 2)
<class 'pandas.core.frame.DataFrame'>
Index: 8799 entries, 3999 to 9332
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   8799 non-null   int64 
 1   Warehouse_block      8799 non-null   object
 2   Mode_of_Shipment     8799 non-null   object
 3   Customer_care_calls  8799 non-null   int64 
 4   Customer_rating      8799 non-null   int64 
 5   Cost_of_the_Product  8799 non-null   int64 
 6   Prior_purchases      8799 non-null   int64 
 7   Product_importance   8799 non-null   object
 8   Gender               8799 non-null   object
 9   Discount_offered     8799 non-null   int64 
 10  Weight_in_gms        8799 non-null   int64 
dtypes: int64(7), object(4)
memory usage: 824.9+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 8799 entries, 3999 to 9332
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype
---  ------ 

In [3]:
df = pd.read_csv("../data/2_Train.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='Reached.on.Time_Y.N', id_name='ID')

# feature selection
X_train.head()

X_train = X_train.drop("ID", axis=1)
y_train = y_train.drop("ID", axis=1)
y_train = y_train.squeeze()

ordinal_col = ["Product_importance"]
s_col = X_train.select_dtypes(include="object").columns
n_col = X_train.select_dtypes(exclude="object").columns

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
scaler = MinMaxScaler()
encoder = OneHotEncoder()

X_train[n_col] = scaler.fit_transform(X_train[n_col])
X_test[n_col] = scaler.transform(X_test[n_col])

encoded = encoder.fit_transform(X_train[s_col]).toarray()
en_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(s_col), index=X_train.index)
X_train = pd.concat([X_train.drop(s_col, axis=1), en_df], axis=1)

encoded = encoder.transform(X_test[s_col]).toarray()
en_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(s_col), index=X_test.index)
X_test = pd.concat([X_test.drop(s_col, axis=1), en_df], axis=1)

# train validation set split
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

# modeling
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X_train, y_train)
y_val_pred = model.predict(X_val)

from sklearn.metrics import roc_auc_score, accuracy_score

roc_score = roc_auc_score(y_val, y_val_pred)
acc_score = accuracy_score(y_val, y_val_pred)

print(roc_score, acc_score)

y_test_proba = model.predict_proba(X_test.drop("ID", axis=1))[:,-1]

result = pd.DataFrame({"ID":X_test["ID"].reset_index(drop=True), "Reached.on.Time_Y.N":y_test_proba})
print(result.describe())

(7039, 19) (1760, 19) (7039,) (1760,)
0.6772348419675333 0.6744318181818182
                 ID  Reached.on.Time_Y.N
count   2200.000000          2200.000000
mean    5617.505000             0.586177
std     3154.909089             0.251641
min        2.000000             0.130000
25%     2940.750000             0.400000
50%     5566.500000             0.500000
75%     8406.250000             0.820000
max    10993.000000             1.000000


# 3회 작업형2 기출 유형(심화)

여행 보험 패키지 상품을 구매할 확률 값을 구하시오

예측할 값(y): TravelInsurance (여행보험 패지지를 구매 했는지 여부 0:구매안함, 1:구매)

평가: roc-auc 평가지표

data: t2-1-train.csv, t2-1-test.csv

제출 형식

id,TravelInsurance

0,0.3

1,0.48

2,0.3

3,0.83

In [4]:
import pandas as pd

train = pd.read_csv("../data/t2-1-train.csv")
test = pd.read_csv("../data/t2-1-test.csv")

train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   1490 non-null   int64  
 1   Age                  1490 non-null   int64  
 2   Employment Type      1490 non-null   object 
 3   GraduateOrNot        1490 non-null   object 
 4   AnnualIncome         1486 non-null   float64
 5   FamilyMembers        1490 non-null   int64  
 6   ChronicDiseases      1490 non-null   int64  
 7   FrequentFlyer        1490 non-null   object 
 8   EverTravelledAbroad  1490 non-null   object 
 9   TravelInsurance      1490 non-null   int64  
dtypes: float64(1), int64(5), object(4)
memory usage: 116.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497 entries, 0 to 496
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                  

In [5]:
train.head()

Unnamed: 0,id,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,10000,28,Private Sector/Self Employed,Yes,1250000.0,6,1,No,No,0
1,10001,31,Private Sector/Self Employed,Yes,1250000.0,7,1,No,No,0
2,10002,29,Private Sector/Self Employed,Yes,1200000.0,7,0,No,No,1
3,10003,33,Government Sector,Yes,650000.0,6,1,No,No,1
4,10004,28,Private Sector/Self Employed,Yes,800000.0,6,0,No,Yes,1


In [6]:
import pandas as pd
import numpy as np

train = pd.read_csv("../data/t2-1-train.csv")
test = pd.read_csv("../data/t2-1-test.csv")

train = train.drop("id", axis=1)

id = test["id"].reset_index(drop=True)
test = test.drop("id", axis=1)

# print(test["Employment Type"].unique())

train, y = train.drop("TravelInsurance", axis=1), train["TravelInsurance"]

n_col = train.select_dtypes(exclude="object").columns

# print(test[test["AnnualIncome"].isna()])

# 수입 결측치는 employment type의 중위값으로 대체
income_by_type = train.groupby(["Employment Type"])["AnnualIncome"].median().to_dict()
train["AnnualIncome"] = train["AnnualIncome"].fillna(train["Employment Type"].map(income_by_type))
test["AnnualIncome"] = test["AnnualIncome"].fillna(test["Employment Type"].map(income_by_type))

from sklearn.preprocessing import MinMaxScaler
# print(train["Age"].describe())
scaler = MinMaxScaler()

train[n_col] = scaler.fit_transform(train[n_col])
test[n_col] = scaler.transform(test[n_col])

c_col = train.select_dtypes(include="object").columns.tolist()
del c_col[0]

for col in c_col:
    train[col] = train[col].map({"Yes":1, "No":0})
    test[col] = test[col].map({"Yes":1, "No":0})

train = pd.get_dummies(train, columns=["Employment Type"])
test = pd.get_dummies(test, columns=["Employment Type"], drop_first=True)

from sklearn.model_selection import train_test_split, RandomizedSearchCV
train_X, val_X, train_y, val_y = train_test_split(train, y, test_size=0.15)
# print(train_X.shape, train_y.shape, val_X.shape, val_y.shape)

from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(random_state=42)

model.fit(train_X, train_y)
pred_y = model.predict(val_X)

from sklearn.metrics import roc_auc_score, accuracy_score
roc_score = roc_auc_score(val_y, model.predict_proba(val_X)[:, 1])
accuracy_score = accuracy_score(val_y, pred_y)
print(roc_score, accuracy_score)

prob_test_y = model.predict_proba(test)[:,-1]
result = pd.DataFrame({"id":id, "TravelInsurance":prob_test_y})
print(result.head())

0.7608467917939764 0.8214285714285714
   id  TravelInsurance
0   0         0.265491
1   1         0.173876
2   2         0.026807
3   3         0.960381
4   4         0.035124


# 4회
### [마케팅] 자동차 시장 세분화

자동차 회사는 새로운 전략을 수립하기 위해 4개의 시장으로 세분화했습니다.

기존 고객 분류 자료를 바탕으로 신규 고객이 어떤 분류에 속할지 예측해주세요!

예측할 값(y): "Segmentation" (1,2,3,4)

평가: Macro f1-score

data: train.csv, test.csv

제출 형식:

ID,Segmentation

458989,1

458994,2

459000,3

459003,4

답안 제출 참고

아래 코드 예측변수와 수험번호를 개인별로 변경하여 활용

pd.DataFrame({'ID': test.ID, 'Segmentation': pred}).to_csv('003000000.csv', index=False)

노트북 구분

basic: 수치형 데이터만 활용 -> 학습 및 test데이터 예측

intermediate: 범주형 데이터도 활용 -> 학습 및 test데이터 예측

advanced: 학습 및 교차 검증(모델 평가) -> 하이퍼파라미터 튜닝 -> test데이터 예측

학습을 위한 채점

최종 파일을 "수험번호.csv"가 아닌 "submission.csv" 작성 후 오른쪽 메뉴 아래 "submit" 버튼 클릭 -> 리더보드에 점수 및 등수 확인 가능함

pd.DataFrame({'ID': test.ID, 'Segmentation': pred}).to_csv('submission.csv', index=False)

In [7]:
import pandas as pd
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
train.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,4
1,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,2
2,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,2
3,461319,Male,Yes,56,No,Artist,0.0,Average,2.0,Cat_6,3
4,460156,Male,No,32,Yes,Healthcare,1.0,Low,3.0,Cat_6,3


In [8]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

y = train["Segmentation"]
train = train.drop(["ID", "Segmentation"], axis=1)
id = test["ID"].reset_index(drop=True)
test = test.drop("ID", axis=1)

c_col = train.select_dtypes(include="object").columns
# for col in c_col:
#     print(set(train[col].unique()) == set(test[col].unique()))
#     print(train[col].unique())

# print(test.isna().sum())

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
encoder = LabelEncoder()
scaler = MinMaxScaler()

for col in c_col:
    train[col] = encoder.fit_transform(train[col])
    test[col] = encoder.transform(test[col])
    
n_col = train.select_dtypes(exclude="object").columns

train[n_col] = scaler.fit_transform(train[n_col])
test[n_col] = scaler.transform(test[n_col])

from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(train, y, test_size=0.2, random_state=3)
# print(train_X.shape, val_X.shape, train_y.shape, val_y.shape)

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=32)
model.fit(train_X, train_y)

val_y_pred = model.predict(val_X)

from sklearn.metrics import f1_score
score = f1_score(val_y, val_y_pred, average="macro")
print(score)

test_pred = model.predict(test)
result = pd.DataFrame({"ID":id, "Segmentation":test_pred})
print(result)

0.49994003700664374
          ID  Segmentation
0     458989             2
1     458994             3
2     459000             3
3     459003             3
4     459005             1
...      ...           ...
2149  467950             1
2150  467954             4
2151  467958             1
2152  467961             3
2153  467968             4

[2154 rows x 2 columns]


#유형2 5회 기출

[가격 예측] 중고 자동차

자동차 가격을 예측해주세요!

예측할 값(y): price

평가: RMSE (Root Mean Squared Error)

data: train.csv, test.csv

[컴피티션 제출 양식] 리더보드 제출용

제출 형식: submission.csv파일을 아래와 같은 형식(수치형)으로 제출

(id는 test의 index임)

id,price

0,11000

1,20500

2,19610

...    

1616,11995

[실제 시험용 제출 양식] 참고

제출 형식: result.csv파일을 아래와 같은 형식(수치형)으로 제출

pred

11000

20500

19610

...    

11995

답안 제출 참고

pd.read_csv('result.csv') 로 제출 코드 확인

In [9]:
import pandas as pd

train = pd.read_csv("../data/5_train.csv")
test = pd.read_csv("../data/5_test.csv")

train_y = train["price"]
train_X = train.drop("price", axis=1)

print(train_X.head())

       model  year transmission  mileage fuelType  tax   mpg  engineSize
0   EcoSport  2017       Manual    25013   Petrol  150  53.3         1.0
1      Focus  2016       Manual    30970   Diesel    0  74.3         1.5
2      S-MAX  2017       Manual    60200   Diesel  150  56.5         2.0
3     Fiesta  2018       Manual    27380   Petrol  145  56.5         1.0
4     Fiesta  2018       Manual     7724   Petrol  145  65.7         1.0


In [10]:
train = pd.read_csv("../data/5_train.csv")
test = pd.read_csv("../data/5_test.csv")

# 이상치 drop
train_X.drop(train_X[train_X["year"] > 2024].index, inplace=True)

train_y = train["price"]
train_X = train.drop("price", axis=1)

c_col = train_X.select_dtypes(include="object").columns
# for col in c_col:
#     print(col, ":", len(train_X[col].unique()))
#     print(set(train_X[col].unique()))
#     print(set(train_X[col].unique()) == set(test[col].unique()))

train_X = pd.get_dummies(train_X, columns=c_col, drop_first=True)
test = pd.get_dummies(test, columns=c_col, drop_first=True)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
n_col = train_X.select_dtypes(exclude="object").columns

train_X[n_col] = scaler.fit_transform(train_X[n_col])
test[n_col] = scaler.transform(test[n_col])

from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, test_size=0.15, random_state=4)
# print(train_X.shape, val_X.shape, train_y.shape, val_y.shape)

from sklearn.ensemble import RandomForestRegressor

# model = RandomForestRegressor(random_state=42, n_estimators=200)

import lightgbm as lgb
model = lgb.LGBMRegressor(random_state=0)

model.fit(train_X, train_y)
pred_val_y = model.predict(val_X)

from sklearn.metrics import root_mean_squared_error

r_mse = root_mean_squared_error(pred_val_y, val_y)
print(r_mse)

pred_test = model.predict(test)
result = pd.DataFrame({"pred":pred_test})
result["pred"] = result["pred"].astype(int)

print(result)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000715 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 410
[LightGBM] [Info] Number of data points in the train set: 3195, number of used features: 20
[LightGBM] [Info] Start training from score 12301.849452
1324.6130926132332
       pred
0     15803
1     15753
2     14572
3     17458
4      6210
...     ...
1612  11037
1613  17649
1614   9477
1615  14112
1616   5816

[1617 rows x 1 columns]
