# Google Drive 연동

In [35]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 데이터 불러오기

In [36]:
import pandas as pd

DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/2024/한국SW기술진흥협회/Python기초통계/작업장_제3유형/기출문제/작업2유형/data/'

X_train = pd.read_csv(DATA_PATH + "titanic_reg_X_train.csv")
X_test = pd.read_csv(DATA_PATH + "titanic_reg_X_test.csv")
y_train = pd.read_csv(DATA_PATH + "titanic_reg_y_train.csv")

X_train.shape, X_test.shape, y_train.shape

((623, 11), (268, 11), (623, 1))

# 데이터 확인

In [37]:
X_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin,Embarked
0,446,1,1,"Dodge, Master. Washington",male,4.0,0,2,33638,A34,S
1,651,0,3,"Mitkoff, Mr. Mito",male,,0,0,349221,,S
2,173,1,3,"Johnson, Miss. Eleanor Ileen",female,1.0,1,1,347742,,S
3,451,0,2,"West, Mr. Edwy Arthur",male,36.0,1,2,C.A. 34651,,S
4,315,0,2,"Hart, Mr. Benjamin",male,43.0,1,1,F.C.C. 13529,,S


In [38]:
X_test.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin,Embarked
0,710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,,C
1,440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31.0,0,0,C.A. 18723,,S
2,841,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20.0,0,0,SOTON/O2 3101287,,S
3,721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6.0,0,1,248727,,S
4,40,1,3,"Nicola-Yarred, Miss. Jamila",female,14.0,1,0,2651,,C


In [39]:
y_train.head(1)

Unnamed: 0,Fare
0,81.8583


# 결측치 확인

In [40]:
X_train.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,124
SibSp,0
Parch,0
Ticket,0
Cabin,484


In [41]:
X_test.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,53
SibSp,0
Parch,0
Ticket,0
Cabin,203


# 불필요한 컬럼 제거
- 패턴이 발견되지 않을 것 같은 컬럼 삭제
  + ID 컬럼 삭제

In [42]:
import pandas as pd

# Create a DataFrame
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 30, 35],
    'city': ['New York', 'Los Angeles', 'Chicago']
})

df

Unnamed: 0,name,age,city
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago


## pop 메서드 활용

In [43]:
name_series = df.pop('name')
name_series.head(1)

Unnamed: 0,name
0,Alice


In [44]:
df.head()

Unnamed: 0,age,city
0,25,New York
1,30,Los Angeles
2,35,Chicago


In [45]:
X_train_ID = X_train.pop('PassengerId')
X_test_ID = X_test.pop('PassengerId')

In [46]:
X_train.head(1)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin,Embarked
0,1,1,"Dodge, Master. Washington",male,4.0,0,2,33638,A34,S


In [47]:
X_test.head(1)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin,Embarked
0,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,,C


# 컬럼분리

In [48]:
import numpy as np

cat_cols = X_train.select_dtypes(exclude = np.number).columns.tolist()
num_cols = X_train.select_dtypes(include = np.number).columns.tolist()

print(cat_cols, num_cols)

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'] ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch']


# 데이터셋 분리
- 훈련데이터와 검증데이터 분리

In [49]:
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train['Fare'],
    test_size=0.3,
    random_state=42
)

X_tr.shape, X_val.shape, y_tr.shape, y_val.shape

((436, 10), (187, 10), (436,), (187,))

# Pipeline 모델
- 결측치가 포함된 모델 훈련

In [50]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

# StratifiedKFold : 분류 작업 할 때
# KFold : 수치 작업 할 때
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, StratifiedKFold, KFold

import numpy as np
from scipy.stats import uniform, randint

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ])

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestRegressor(random_state=42))
])

param_distributions = {
    'classifier__min_impurity_decrease': uniform(0.0001, 0.001),
    'classifier__max_depth': randint(20, 50),
    'classifier__min_samples_split': randint(2, 25),
    'classifier__min_samples_leaf': randint(1, 25),
}

split_number = 5
# stratified_kfold, 분류모형 만들 때
# stratified_kfold = StratifiedKFold(n_splits=split_number, shuffle=True, random_state=42)

kfold = KFold(n_splits=split_number, shuffle=True, random_state=42)

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=50,
    cv=kfold, # stratified_kfold, 분류모형 만들 때
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


# 모형평가

In [51]:
from sklearn.metrics import mean_squared_error
import numpy as np

def get_score(model, X_tr, X_val, y_tr, y_val):
    tr_pred = model.predict(X_tr)
    val_pred = model.predict(X_val)
    tr_score = np.sqrt(mean_squared_error(y_tr, tr_pred))
    val_score = np.sqrt(mean_squared_error(y_val, val_pred))
    return f"train: {tr_score}, validation: {val_score}"

best_model = random_search.best_estimator_
get_score(best_model, X_tr, X_val, y_tr, y_val)

'train: 7.44245450554296, validation: 10.149731372210258'

In [52]:
final_preds = best_model.predict(X_test)
result = pd.DataFrame({
    "ID" : X_test_ID,
    "preds" : final_preds
})

result

Unnamed: 0,ID,preds
0,710,17.119833
1,440,12.898116
2,841,7.910236
3,721,26.188532
4,40,13.203950
...,...,...
263,822,7.919886
264,634,28.898874
265,457,29.543766
266,501,8.028290


# 데이터 불러오기

In [53]:
import pandas as pd

DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/2024/빅분기/작업2유형/data/'

train = pd.read_csv(DATA_PATH + "churn_train.csv")
test = pd.read_csv(DATA_PATH + "churn_test.csv").drop('TotalCharges', axis=1)
train.shape, test.shape

((699, 10), (301, 9))

## 모델 만들기

In [54]:
train_ID = train.pop('CustomerID')
test_ID = test.pop('CustomerID')

train.shape, test.shape

((699, 9), (301, 8))

In [55]:
y = train.pop('TotalCharges')
train.shape, y.shape

((699, 8), (699,))

In [56]:
import numpy as np

cat_cols = train.select_dtypes(exclude = np.number).columns.tolist()
num_cols = train.select_dtypes(include = np.number).columns.tolist()
print(cat_cols, num_cols)

['Gender', 'ContractType', 'InternetService', 'TechSupport', 'Churn'] ['Age', 'Tenure', 'MonthlyCharges']


In [57]:
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(
    train, y,
    test_size=0.3,
    random_state=42
)

X_tr.shape, X_val.shape, y_tr.shape, y_val.shape

((489, 8), (210, 8), (489,), (210,))

In [63]:
from lightgbm import LGBMRegressor

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [67]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor


# StratifiedKFold : 분류 작업 할 때
# KFold : 수치 작업 할 때
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, StratifiedKFold, KFold

import numpy as np
from scipy.stats import uniform, randint
from sklearn.preprocessing import MinMaxScaler

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ])

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LGBMRegressor(random_state=42))
])

param_distributions = {
    'classifier__learning_rate': uniform(0.01, 0.3),
    'classifier__max_depth': randint(3, 15),
    'classifier__num_leaves': randint(20, 50),
    'classifier__min_child_samples': randint(5, 30),
    'classifier__subsample': uniform(0.7, 0.3),
    'classifier__colsample_bytree': uniform(0.7, 0.3),
}

split_number = 5
# stratified_kfold, 분류모형 만들 때
# stratified_kfold = StratifiedKFold(n_splits=split_number, shuffle=True, random_state=42)

kfold = KFold(n_splits=split_number, shuffle=True, random_state=42)

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=50,
    cv=kfold, # stratified_kfold, 분류모형 만들 때
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_tr, y_tr)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000063 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 286
[LightGBM] [Info] Number of data points in the train set: 489, number of used features: 14
[LightGBM] [Info] Start training from score 1405.145763


## 모형평가

In [68]:
from sklearn.metrics import mean_absolute_error
import numpy as np

def get_score(model, X_tr, X_val, y_tr, y_val):
    tr_pred = model.predict(X_tr)
    val_pred = model.predict(X_val)
    tr_score = mean_absolute_error(y_tr, tr_pred)
    val_score = mean_absolute_error(y_val, val_pred)
    return f"train: {tr_score}, validation: {val_score}"

get_score(random_search, X_tr, X_val, y_tr, y_val)

'train: 17.41921532709209, validation: 72.75306713770637'

## 예측결과 코드

In [69]:
final_preds = random_search.predict(test)
result = pd.DataFrame({
    "CustomerID" : test_ID,
    "preds": final_preds
})

result

Unnamed: 0,CustomerID,preds
0,cust658,510.354815
1,cust776,1619.125634
2,cust325,2705.403889
3,cust487,2200.201452
4,cust366,3358.507037
...,...,...
296,cust122,-1.243881
297,cust312,334.126868
298,cust977,982.816593
299,cust495,1136.084333
