In [None]:
import pandas as pd
import numpy as np
import warnings
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score

# Ignore every UserWarning-category warning raised from here on
# (the filter is not restricted to any particular library).
warnings.filterwarnings("ignore", category=UserWarning)

# Load the training and test CSVs; every missing value is replaced with 0
train_df = pd.read_csv('train.csv').fillna(0)
test_df = pd.read_csv('testset.csv').fillna(0)


In [None]:

# Columns holding categorical values that are converted to integer codes
categorical_columns = ["Gender", "V17", "V19"]

# For each column, learn the label -> code mapping from the training data only,
# then apply that same mapping to both the training and the test frame.
# NOTE(review): transform() is expected to fail on a test label that never
# appears in training -- confirm the test set has no unseen categories.
for column in categorical_columns:
    label_encoder = LabelEncoder()
    label_encoder.fit(train_df[column])
    train_df[column] = label_encoder.transform(train_df[column])
    test_df[column] = label_encoder.transform(test_df[column])

In [None]:

# Target: the 'class' column with 'N' mapped to 0 and 'Y' mapped to 1
y = train_df['class'].replace({'N': 0, 'Y': 1})
# Features: every remaining column of the training frame
X = train_df.drop(columns=['class'])

# Hold out 20% of the rows; the split is seeded so it is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Feature subset used for the baseline comparison
baseline_features = [
    'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
    'V12', 'V13', 'V14', 'V15', 'V16', 'V19', 'V22',
]

# Candidate estimators, each seeded with the same random_state
rf_classifier = RandomForestClassifier(random_state=42)
cat_model = CatBoostClassifier(random_state=42)
xgb_cls = XGBClassifier(random_state=42)
gra_model = GradientBoostingClassifier(random_state=42)
logistic_model = LogisticRegression(random_state=42)
lightgbm = LGBMClassifier(random_state=42)

# Only these four are scored below; logistic_model and lightgbm are created
# above but are not part of this comparison.
models_list = [rf_classifier, cat_model, xgb_cls, gra_model]

# Mean 5-fold cross-validation score per model, in models_list order
result = []
for model in models_list:
    print(model)
    scores = cross_val_score(model, X_train[baseline_features], y_train, cv=5)
    result.append(scores.mean())

# Show the collected scores
print("Model results:")
print(result)
#[0.8903225806451612, 0.8833333333333334, 0.875268817204301, 0.8887096774193548]
#[0.8903225806451612, 0.8833333333333334, 0.875268817204301, 0.8890681003584231] standard

In [None]:
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

from sklearn.metrics import f1_score

# Selected feature columns: V5 through V16, plus V19 and V22
celect_col = ['V{}'.format(n) for n in list(range(5, 17)) + [19, 22]]
def objectiveCAT(trial: Trial, x_tr, y_tr, x_val, y_val):
    """Optuna objective for CatBoost: fit on (x_tr, y_tr), score on (x_val, y_val).

    Samples `iterations` (100-1000), `depth` (3-8) and `learning_rate`
    (0.001-0.1) from the trial, trains a CatBoostClassifier with
    random_state=42, and returns the binary F1 score of its predictions
    on the validation data.
    """
    # Hyperparameters for this trial ('class_weights' is not tuned here)
    search_space = dict(
        iterations=trial.suggest_int('iterations', 100, 1000),
        depth=trial.suggest_int('depth', 3, 8),
        learning_rate=trial.suggest_float('learning_rate', 0.001, 0.1),
        random_state=42,
    )

    # Build and train the model
    fitted = CatBoostClassifier(**search_space).fit(x_tr, y_tr)

    # Evaluate the model on the validation data
    return f1_score(y_val, fitted.predict(x_val), average="binary")

# Hyperparameter tuning
# NOTE(review): trials are scored on X_test / y_test, so that hold-out acts as
# the validation set for model selection -- confirm this is intended.
def _cat_objective(trial):
    """Score one trial on the selected feature columns."""
    return objectiveCAT(trial, X_train[celect_col], y_train, X_test[celect_col], y_test)


study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=0))
study.optimize(_cat_objective, n_trials=30)

best_trial = study.best_trial
print('Best trial : score {}, \nparams {}'.format(best_trial.value, best_trial.params))

# Unfitted CatBoost model configured with the best parameters found
cat_model = CatBoostClassifier(**best_trial.params, random_state=42)

In [None]:
from sklearn.ensemble import VotingClassifier

# Base learners for the ensemble; the CatBoost member is the existing cat_model
xgb_cls = XGBClassifier(random_state=42)
lightgbm = LGBMClassifier(random_state=42)

# (name, estimator) pairs handed to the ensemble
classifiers = [('XGBoost', xgb_cls), ('CatBoost', cat_model), ('LightGBM', lightgbm)]

# Soft voting: the ensemble is built with voting='soft'
voting_model = VotingClassifier(estimators=classifiers, voting='soft')

# Training rows restricted to the selected feature columns
X_train_selected = X_train[celect_col]

# Fit the ensemble on the training split
voting_model.fit(X_train_selected, y_train)

# 5-fold cross-validation for the ensemble and for CatBoost on its own
scores = cross_val_score(voting_model, X_train_selected, y_train, cv=5)
cat_scores = cross_val_score(cat_model, X_train_selected, y_train, cv=5)

print("Voting Model Results:")
print(scores.mean())

print("cat Model Results:")
print(cat_scores.mean())


In [None]:
# Final model: the CatBoost classifier held in cat_model
model = cat_model

# Refit on the full labelled dataset, using the selected feature columns
model.fit(X[celect_col], y)

# Predict the class of every row in the test data
pred = model.predict(test_df[celect_col])

# Map predictions back to labels (0 -> 'N', anything else -> 'Y')
# and write them as a single-column CSV without the index
result = np.where(pred == 0, 'N', 'Y')
output_csv = pd.DataFrame(result, columns=['class'])
output_csv.to_csv("32184801_하승원.csv", index=False)


In [None]:
#Your ranking: 1 , acc: 0.896851975887475 , total upload: 1
#Your ranking: 1 , acc: 0.912257200267917 , total upload: 2
#Your ranking: 1 , acc: 0.912257200267917 , total upload: 3
#Your ranking: 1 , acc: 0.916275954454119 , total upload: 4
#Your ranking: 2 , acc: 0.916275954454119 , total upload: 5
#Your ranking: 3 , acc: 0.916275954454119 , total upload: 8
#Your ranking: 3 , acc: 0.916275954454119 , total upload: 9

# Nothing particularly meaningful


In [None]:

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
#scaler = QuantileTransformer(output_distribution='normal')


# # Select the columns to be scaled
# columns_to_scale = ["V8", "V13", "V20", "V21", "V22"]
# train_df[columns_to_scale] = np.log1p(train_df[columns_to_scale])
# test_df[columns_to_scale] = np.log1p(test_df[columns_to_scale])
# # Initialize the StandardScaler

# #scaler = RobustScaler()
# scaler = QuantileTransformer(output_distribution='normal')
# # Fit and transform the selected columns in the training data
# train_df[columns_to_scale] = scaler.fit_transform(train_df[columns_to_scale])

# # Transform the corresponding columns in the test data
# test_df[columns_to_scale] = scaler.transform(test_df[columns_to_scale])

In [None]:
from sklearn.feature_selection import RFE
import pandas as pd
from sklearn.model_selection import cross_validate,cross_val_score

model = cat_model

# Run recursive feature elimination ONCE, removing one feature per round until
# a single feature is left. ranking_ then holds the full elimination order
# (1 = last survivor, 2 = removed last, ...), so the i-feature subset is the
# set of columns with ranking_ <= i. Because the estimator is seeded, this is
# the same subset a separate RFE(n_features_to_select=i) run would reach; the
# previous version refit RFE from scratch for every i, repeating the same
# eliminations each time.
ranking = RFE(model, n_features_to_select=1).fit(X, y).ranking_

rfe_result = []   # mean 5-fold cross-validation score per subset size
rfe_feature = []  # selected column names per subset size

# Sizes 1..len(X.columns) inclusive, so the all-features baseline is scored too
for i in range(1, len(X.columns) + 1):
    fs = X.columns[ranking <= i].tolist()   # selected features
    rfe_feature.append(fs)
    scores = cross_val_score(model, X[fs], y, cv=5)
    rfe_result.append(scores.mean())

print(rfe_result)
print(rfe_feature)
# from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# from sklearn.feature_selection import RFE
# import pandas as pd
# from sklearn.model_selection import cross_validate,cross_val_score

# #forward selection

# sfs1 = SFS(model, k_features=len(X.columns),          # number of features
#             verbose=2,scoring='accuracy',cv=5)
# sfs1 = sfs1.fit(X, y)
# print(sfs1.subsets_)             # selection process
# print(sfs1.k_feature_idx_)       # selected feature index  
# print(sfs1.k_feature_names_)
