In [8]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame as df

import sklearn
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, RobustScaler, MinMaxScaler, StandardScaler
from sklearn.model_selection import KFold, RepeatedStratifiedKFold, GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, VotingClassifier 

In [2]:
x_train_path = 'bda_q2/X_train.csv'
x_test_path = 'bda_q2/X_test.csv'
y_train_path = 'bda_q2/y_train.csv'

x_train = pd.read_csv(x_train_path, encoding='cp949')
y_train = pd.read_csv(y_train_path, encoding='cp949')
x_test = pd.read_csv(x_test_path, encoding='cp949')

def MissingColumn(df_):
    missing_heads = []
    for head in df_.keys():
        if df_[head].isnull().sum() != 0:
            missing_heads.append(head)
    return missing_heads

def OutlierColumn(df_):
    negative_cols = []
    outlier_cols = []
    for head in df_.keys():
        try:
            mini = df_[head].min()
            if mini < 0:
                negative_cols.append(head)
            Q1 = df_[head].quantile(.25)
            Q3 = df_[head].quantile(.75)
            IQR = Q3-Q1
            up_bound = Q3 + IQR*3/2
            dw_bound = Q1 - IQR*3/2
            count = 0
            for value in df_[head]:
                if value > up_bound:
                    count += 1
                elif value < dw_bound:
                    count += 1
            if count != 0:
                outlier_cols.append((head, count))
        except:
            pass
    
    return negative_cols, outlier_cols

x_train_missing_cols = MissingColumn(x_train)
x_test_missing_cols = MissingColumn(x_test)
y_train_missing_cols = MissingColumn(y_train)
# print(x_train_missing_cols, x_test_missing_cols, y_train_missing_cols)
# '환불금액' 결측치 존재

x_train_neg_cols, x_train_out_cols = OutlierColumn(x_train)
x_test_neg_cols, x_test_out_cols = OutlierColumn(x_test)
y_train_neg_cols, y_train_out_cols = OutlierColumn(y_train)
# print(x_train_neg_cols)
# print(x_train_out_cols)
# print(x_test_neg_cols)
# print(x_test_out_cols)
# print(y_train_neg_cols)
# print(y_train_out_cols)
# '총구매액', '최대구매액' 음수값 존재
# '내점일수', '내점당구매건수', '구매주기' 이상값 수정 필요 / '총구매액', '최대구매액', '환불금액'은 수정 X

x_train = x_train.fillna(0)
x_test = x_test.fillna(0)

for head in x_train_neg_cols:
    x_train[head] = x_train[head].abs()
    x_test[head] = x_test[head].abs()

out_cols = ['내점일수', '내점당구매건수', '구매주기']
def TransformOutlier(df_, out_cols):
    df__ = df_.copy()
    for head in out_cols:
        Q1 = df__[head].quantile(.25)
        Q3 = df__[head].quantile(.75)
        IQR = Q3-Q1
        up_bound = Q3+IQR*3/2
        dw_bound = Q1-IQR*3/2
        df__.loc[(df_[head] > up_bound), head] = up_bound
        if dw_bound < 0:
            df__.loc[(df_[head] < dw_bound), head] = 0
        else:
            df__.loc[(df_[head] < dw_bound), head] = dw_bound
    return df__
x_train = TransformOutlier(x_train, out_cols)
x_test = TransformOutlier(x_test, out_cols)

x_train_missing_cols = MissingColumn(x_train)
x_test_missing_cols = MissingColumn(x_test)
y_train_missing_cols = MissingColumn(y_train)
# print(x_train_missing_cols, x_test_missing_cols, y_train_missing_cols)
# 결측치 제거완료

x_train_neg_cols, x_train_out_cols = OutlierColumn(x_train)
x_test_neg_cols, x_test_out_cols = OutlierColumn(x_test)
y_train_neg_cols, y_train_out_cols = OutlierColumn(y_train)
# print(x_train_neg_cols)
# print(x_train_out_cols)
# print(x_test_neg_cols)
# print(x_test_out_cols)
# 이상치 제거 완료

# print(y_train.gender.value_counts()) # 여자 2184, 남자 1316, 0.624가 여성
# x_train.info() # '주구매상품', '주구매지점'
# x_test.info()

train_input = pd.merge(x_train, y_train)
train_input = train_input.drop(['cust_id'], axis=1)
x_test = x_test.drop(['cust_id'], axis=1)
x_test.head()

Unnamed: 0,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기
0,70900400,22000000,4050000.0,골프,부산본점,13.0,1.461538,0.789474,26.0
1,310533100,38558000,48034700.0,농산물,잠실점,63.875,2.433333,0.369863,3.0
2,305264140,14825000,30521000.0,가공식품,본 점,63.875,5.8125,0.083277,3.0
3,7594080,5225000,0.0,주방용품,부산본점,5.0,2.0,0.0,47.0
4,1795790,1411200,0.0,수산품,청량리점,3.0,2.666667,0.125,8.0


In [7]:
obj_cols = ['주구매상품', '주구매지점']
num_cols = [head for head in train_input.drop(['gender'], axis=1).keys() if head not in obj_cols]

# num_transformer = Pipeline(steps=[
#     ('scaler', RobustScaler())])
num_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])
obj_transformer = OneHotEncoder(handle_unknown='ignore')
#obj_transformer = LabelEncoder()

preprocessor = ColumnTransformer(transformers=[
    ('obj', obj_transformer, obj_cols),
    ('num', num_transformer, num_cols)])

seed = 1
K_val = 5; num_repeat=1; scoring='f1'#scoring='roc_auc' #scoring='f1'; 
cv = RepeatedStratifiedKFold(n_splits=K_val, n_repeats= num_repeat, random_state=seed)

train_x, val_x, train_y, val_y = train_test_split(train_input.drop(['gender'], axis=1),
                                                 train_input['gender'],
                                                 test_size = 0.1,
                                                 random_state = seed)

model_1 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('LR', LogisticRegression())])

model_2 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('SVM', SVC(probability=True))])

model_3 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('NN', MLPClassifier())])

model_4 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('KNN', KNeighborsClassifier())])

model_5 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('RF', RandomForestClassifier())])

models = [('LR', model_1), ('SVM', model_2), ('NN', model_3), ('KNN', model_4), ('RF', model_5)]
names, means, stds = [], [], []
for name, model in models:
    #result = cross_val_score(model, train_x, train_y, cv=cv, scoring=scoring)
    model.fit(train_x,train_y)
    output = model.predict_proba(val_x)
    result = roc_auc_score(val_y, output[:,1])
    names.append(name)
    means.append(result)
    #stds.append(result.std())

result_df = df({'name':names, 'mean':means}).sort_values(['mean'], ascending=False)
print(result_df)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  name      mean
0   LR  0.711430
1  SVM  0.707154
4   RF  0.700521
2   NN  0.688309
3  KNN  0.655599


In [4]:
help(f1_score)

Help on function f1_score in module sklearn.metrics._classification:

f1_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division='warn')
    Compute the F1 score, also known as balanced F-score or F-measure
    
    The F1 score can be interpreted as a weighted average of the precision and
    recall, where an F1 score reaches its best value at 1 and worst score at 0.
    The relative contribution of precision and recall to the F1 score are
    equal. The formula for the F1 score is::
    
        F1 = 2 * (precision * recall) / (precision + recall)
    
    In the multi-class and multi-label case, this is the average of
    the F1 score of each class with weighting depending on the ``average``
    parameter.
    
    Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`.
    
    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.
  

In [49]:
obj_cols = ['주구매상품', '주구매지점']
num_cols = [head for head in train_input.drop(['gender'], axis=1).keys() if head not in obj_cols]

# num_transformer = Pipeline(steps=[
#     ('scaler', RobustScaler())])
num_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])
obj_transformer = OneHotEncoder(handle_unknown='ignore')
#obj_transformer = LabelEncoder()

preprocessor = ColumnTransformer(transformers=[
    ('obj', obj_transformer, obj_cols),
    ('num', num_transformer, num_cols)])

seed = 1
K_val = 5; num_repeat=1; scoring='roc_auc' #scoring='f1'; 
cv = RepeatedStratifiedKFold(n_splits=K_val, n_repeats= num_repeat, random_state=seed)

train_x, val_x, train_y, val_y = train_test_split(train_input.drop(['gender'], axis=1),
                                                 train_input['gender'],
                                                 test_size = 0.1,
                                                 random_state = seed)

model_1 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('LR', LogisticRegression())])

model_2 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('SVM', SVC(probability=True))])

model_3 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('NN', MLPClassifier())])

model_4 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('KNN', KNeighborsClassifier())])

model_5 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('RF', RandomForestClassifier())])

models = [('LR', model_1), ('SVM', model_2), ('NN', model_3), ('KNN', model_4), ('RF', model_5)]
names, means, stds = [], [], []
for name, model in models:
    result = cross_val_score(model, train_x, train_y, cv=cv, scoring=scoring)
    names.append(name)
    means.append(result.mean())
    stds.append(result.std())

result_df = df({'name':names, 'mean':means, 'std':stds}).sort_values(['mean'], ascending=False)
print(result_df)



  name      mean       std
0   LR  0.664772  0.011883
4   RF  0.641944  0.014501
1  SVM  0.635478  0.019192
2   NN  0.611699  0.012421
3  KNN  0.590477  0.019312


In [50]:
LR = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('LR', LogisticRegression())])

LR_param_grid = {
    'LR__C':[1, 10],
    'LR__class_weight':['balanced'],
    'LR__random_state':[seed],
    'LR__solver':['lbfgs', 'liblinear']
}

LR_grid = GridSearchCV(LR, param_grid=LR_param_grid, scoring=scoring, cv=cv, n_jobs=4, verbose=1)
LR_grid.fit(train_x, train_y)
LR_best = LR_grid.best_estimator_
LR_grid.best_score_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:    2.9s finished


0.6645164680267793

In [51]:
RF = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('RF', RandomForestClassifier())])

RF_param_grid = {
    'RF__n_estimators':[100,300],
    'RF__criterion':["gini", "entropy"],
    'RF__random_state':[seed]}

RF_grid = GridSearchCV(RF, param_grid=RF_param_grid, scoring=scoring, cv=cv, n_jobs=4, verbose=1)
RF_grid.fit(train_x, train_y)
RF_best = RF_grid.best_estimator_
RF_grid.best_score_

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:   17.5s finished


0.6475876797764829

In [33]:
SVM = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('SVM', SVC(probability=True))])

SVM_param_grid={
    'SVM__C':[1, 10],
    'SVM__kernel':['linear', 'rbf', 'sigmoid'],
    'SVM__gamma':['auto'],
    'SVM__class_weight':['balanced'],
    'SVM__random_state':[seed]
}

SVM_grid = GridSearchCV(SVM, param_grid=SVM_param_grid, scoring=scoring, cv=cv, n_jobs=4, verbose=1)
SVM_grid.fit(train_x, train_y)
SVM_best = SVM_grid.best_estimator_
SVM_grid.best_score_

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   55.7s finished


0.6710593571663857

In [39]:
voting = VotingClassifier(
    estimators=[
        ('LR_best', LR_best),
        ('SVM_best', SVM_best)], voting='soft', n_jobs=4, verbose=1)
voting.fit(train_x,train_y)
prediction = voting.predict_proba(val_x)
print(roc_auc_score(val_y, prediction[:,1]))

0.716603322072072


In [52]:
voting = VotingClassifier(
    estimators=[
        ('LR_best', LR_best),
        ('RF_best', RF_best)], voting='soft', n_jobs=4, verbose=1)
voting.fit(train_x,train_y)
prediction = voting.predict_proba(val_x)
print(roc_auc_score(val_y, prediction[:,1]))

0.7201928490990991


In [58]:
voting = VotingClassifier(
    estimators=[
        ('LR_best', LR_best),
        ('RF_best', RF_best)], voting='soft', n_jobs=4, verbose=1)
voting.fit(train_input.drop(['gender'],axis=1),train_input.gender)
prediction = voting.predict_proba(x_test)

x_test = pd.read_csv(x_test_path, encoding='cp949')
y_test = df({'cust_id':x_test['cust_id'], 'gender':prediction[:,1]})
# y_test.head()
output_path = 'bda_q2/2020311427.csv'
y_test.to_csv(output_path, index=False)
confirm = pd.read_csv(output_path, encoding='cp949')
confirm

Unnamed: 0,cust_id,gender
0,3500,0.607919
1,3501,0.166602
2,3502,0.210615
3,3503,0.421947
4,3504,0.517857
...,...,...
2477,5977,0.620746
2478,5978,0.650755
2479,5979,0.688023
2480,5980,0.493529


In [63]:
prediction[:,1]

array([0.6079194 , 0.16660223, 0.21061459, ..., 0.6880231 , 0.49352888,
       0.55394634])