In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

In [3]:
%cd /content/drive/MyDrive/Colab Notebooks/머신러닝1/조별활동/와인품질분류/data

/content/drive/MyDrive/Colab Notebooks/머신러닝1/조별활동/와인품질분류/data


In [4]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# 전처리

## label_encoder

In [5]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(train['type'])
train['type'] = label_encoder.transform(train['type'])
test['type'] = label_encoder.transform(test['type'])

# 스케일링

In [6]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

In [7]:
minmax_col_list = ['fixed acidity', 'citric acid' ,'chlorides','free sulfur dioxide','density','pH','sulphates']
robust_col_list = ['volatile acidity' ,'total sulfur dioxide', 'residual sugar','alcohol']

In [8]:
min_max_scaler = MinMaxScaler()
scaled_train_data = min_max_scaler.fit_transform(train[minmax_col_list])
scaled_test_data = min_max_scaler.transform(test[minmax_col_list])

train[minmax_col_list] = scaled_train_data
test[minmax_col_list] = scaled_test_data

In [9]:
robust_scaler = RobustScaler()
scaled_train_data = robust_scaler.fit_transform(train[robust_col_list])
scaled_test_data = robust_scaler.transform(test[robust_col_list])

train[robust_col_list] = scaled_train_data
test[robust_col_list] = scaled_test_data

# upsampling

In [10]:
from imblearn.over_sampling import SMOTE

In [11]:
X = train.drop('quality', axis=1)  # 입력 변수
y = train['quality']  # 타겟 변수

smote = SMOTE(k_neighbors=4)
X_resampled, y_resampled = smote.fit_resample(X, y)


train_resampled = pd.concat([X_resampled, y_resampled], axis=1)
train_resampled

Unnamed: 0,index,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type,quality
0,0,0.148760,2.382353,0.036145,0.603175,0.054908,0.027778,-0.441558,0.139001,0.551181,0.123596,-0.055556,1,5
1,1,0.413223,1.882353,0.084337,-0.095238,0.096506,0.031250,-0.987013,0.188741,0.354331,0.207865,-0.444444,0,5
2,2,0.338843,-0.470588,0.234940,-0.158730,0.079867,0.069444,0.259740,0.089647,0.244094,0.168539,0.333333,1,5
3,3,0.264463,-0.470588,0.186747,0.476190,0.061564,0.097222,-0.129870,0.130904,0.409449,0.157303,0.277778,1,6
4,4,0.330579,0.647059,0.156627,1.031746,0.083195,0.107639,0.779221,0.161751,0.236220,0.117978,0.333333,1,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16907,2528,0.266693,0.232496,0.207879,0.101790,0.020468,0.167990,0.078952,0.055533,0.425197,0.095773,1.305115,1,9
16908,2832,0.247164,0.411765,0.193844,0.040103,0.016794,0.152694,-0.147902,0.050807,0.462494,0.129827,1.272606,1,9
16909,3006,0.296012,0.295836,0.219330,0.412929,0.021169,0.172395,0.027206,0.082256,0.411403,0.090954,1.053529,1,9
16910,3143,0.267625,0.378493,0.208984,0.254320,0.016753,0.188116,0.017068,0.060584,0.421238,0.082182,1.253030,1,9


# 모델링

In [12]:
y = train_resampled['quality'] #퀄리티 예측해야할거
X = train_resampled.drop('quality', axis=1) #예측해야할거빼고 나머지

In [17]:
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV

In [18]:
model = xgb.XGBClassifier(tree_method='gpu_hist')

stratified_split = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

n_iter = 0
for train_idx, test_idx in stratified_split.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

     # 매개변수 그리드 정의
    param = {
      'max_depth': [3, 4, 5],
      'random_state': [0, 42],
      'learning_rate': [0.1, 0.01, 0.001],
      'n_estimators': [100, 500, 1000],
      'subsample': [0.8, 0.9, 1.0],
      'colsample_bytree': [0.8, 0.9, 1.0]
    }
    # GridSearchCV 객체 생성
    grid_search = GridSearchCV(model, param_grid=param, cv=5, refit=True, n_jobs=-1, return_train_score=True, verbose=2, error_score='raise')
    # 훈련 데이터로 모델 학습 및 최적 매개변수 탐색
    grid_search.fit(X_train, y_train)
    # 최적 매개변수 출력
    print("최적 매개변수:", grid_search.best_params_)
    # 최적 매개변수로 모델 재학습
    best_model = grid_search.best_estimator_
    best_model.fit(X_train, y_train)

y_pred = best_model.predict(test)

Fitting 5 folds for each of 486 candidates, totalling 2430 fits


ValueError: ignored

# 제출파일

In [None]:
submission = pd.read_csv('sample_submission.csv')

In [None]:
submission

In [None]:
submission['quality'] = y_pred

In [None]:
submission

In [None]:
submission.to_csv('scale_upsampling_xboost.csv', index=False)