#파일불러오기

In [None]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
import warnings 
warnings.filterwarnings("ignore")

# Mount Google Drive so the competition CSV files can be read from Colab.
from google.colab import drive
drive.mount('/content/gdrive')

# Load the training set, the test set and the sample submission file.
train  = pd.read_csv('/content/gdrive/MyDrive/competition_data/train.csv')
test  = pd.read_csv('/content/gdrive/MyDrive/competition_data/test.csv')
# NOTE(review): `real` is not referenced again in the visible cells (the
# submission cell re-reads the same file) - confirm whether it is needed.
real = pd.read_csv('/content/gdrive/MyDrive/competition_data/sample_submission.csv')


Mounted at /content/gdrive


#전처리

In [None]:
# Label-encode the `country` column with integer codes (this is not one-hot
# encoding), e.g. 'USA' -> 0, 'NLD' -> 1. Codes follow the order in which
# each country first appears in train.
country_map = {c: i for i, c in enumerate(train['country'].unique())}

train['country'] = train['country'].map(country_map)
# A country that never occurs in train has no code and maps to NaN. Give it
# an explicit -1 here; otherwise the later fillna(0) would silently merge it
# with whichever train country received code 0.
test['country'] = test['country'].map(country_map).fillna(-1)


# Outlier handling for age, introelapse, testelapse, surveyelapse and
# familysize: implausibly large or small values are replaced with 0.
# Every cut-off is derived from train only and then reused for test, so the
# same raw value is treated identically in both sets and no statistic is
# estimated from the test data.
for frame in (train, test):
    frame.loc[frame['age'] > 80, 'age'] = 0
    frame.loc[frame['familysize'] > 50, 'familysize'] = 0

for col in ('introelapse', 'testelapse', 'surveyelapse'):
    # Values at or below the 2.5% train quantile become 0.
    lower = train[col].quantile(0.025)
    train.loc[train[col] <= lower, col] = 0

    # Values at or above the 97.5% train quantile become 0. The quantile is
    # taken after the lower cut, which keeps the train result unchanged.
    upper = train[col].quantile(0.975)
    train.loc[train[col] >= upper, col] = 0

    # Apply the same two train-derived cut-offs to test.
    test.loc[(test[col] <= lower) | (test[col] >= upper), col] = 0

# TIPI items that are flipped in place: each answer v is replaced by 8 - v.
flipping_columns2 = ['TIPI2', 'TIPI4', 'TIPI6', 'TIPI8', 'TIPI10']

# TIPI scores: each trait is the mean of two TIPI items (after flipping).
tipi_traits = {
    'Extraversion': ('TIPI1', 'TIPI6'),
    'Agreeableness': ('TIPI2', 'TIPI7'),
    'Conscientiousness': ('TIPI3', 'TIPI8'),
    'EmotionalStability': ('TIPI4', 'TIPI9'),
    'OpennesstoExperiences': ('TIPI5', 'TIPI10'),
}

for frame in (train, test):
    frame[flipping_columns2] = 8 - frame[flipping_columns2]
    # Traits are added in the same order for both frames so their column
    # layouts stay aligned.
    for trait, (first_item, second_item) in tipi_traits.items():
        frame[trait] = (frame[first_item] + frame[second_item]) / 2

# Drop the `index` column from both sets.
train = train.drop(columns=['index'])
test = test.drop(columns=['index'])

# Missing values: every empty cell is filled with 0.
train, test = train.fillna(0), test.fillna(0)

# Split train into the target (`nerdiness`) and the feature columns.
train_y = train['nerdiness']
train_x = train.drop(columns=['nerdiness'])

# Feature scaling - the columns have very different magnitudes, so they are
# standardised with StandardScaler (MinMaxScaler is imported as the 0-1
# alternative). Caution: the scaler is fitted on train only; fitting it on
# the test set would violate the competition rules.
from sklearn.preprocessing import MinMaxScaler, StandardScaler

scaler = StandardScaler()

# Fit on the train features and transform them in one step.
feature_columns = train_x.columns
scaled_train_x = scaler.fit_transform(train_x)
train_x = pd.DataFrame(scaled_train_x, columns=feature_columns)

# Reuse the train-fitted scaler for test.
scaled_test_x = scaler.transform(test)
test = pd.DataFrame(scaled_test_x, columns=feature_columns)

# `train` itself was not replaced above, so this describes unscaled values.
print(train['urban'].describe())

count    15000.000000
mean         2.163867
std          0.728274
min          0.000000
25%          2.000000
50%          2.000000
75%          3.000000
max          3.000000
Name: urban, dtype: float64


#교차검증

In [None]:
from sklearn.model_selection import StratifiedKFold  # cross-validation splitter
from xgboost import XGBClassifier  # model
from sklearn.model_selection import GridSearchCV  # for hyper-parameter tuning

# 5-fold stratified split; random_state is fixed so that score changes between
# runs come from the model/features rather than from a different split.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=11)

xgboost_accuracy = []

for train_index, valid_index in skf.split(train_x, train_y):
    fold_train_x, fold_train_y = train_x.iloc[train_index], train_y.iloc[train_index]
    fold_valid_x, fold_valid_y = train_x.iloc[valid_index], train_y.iloc[valid_index]

    # NOTE(review): max_depth is 10 here but 28 in the final model cell, so
    # this score may not describe the submitted model - confirm which value
    # is intended.
    xgboost_model = XGBClassifier(
        n_estimators=1000,
        random_state=42,
        n_jobs=1,
        max_depth=10,
    )

    # To search for parameters instead, wrap the model in
    # GridSearchCV(xgboost_model, {'max_depth': [...]}, scoring='accuracy'),
    # fit it on the fold's training rows and read grid.best_params_.

    xgboost_model.fit(fold_train_x, fold_train_y)  # train on this fold
    # Mean accuracy on the held-out rows. Deliberately not stored in `pred`,
    # the name the submission cell uses for predicted probabilities.
    fold_accuracy = xgboost_model.score(fold_valid_x, fold_valid_y)
    xgboost_accuracy.append(fold_accuracy)
    print(fold_accuracy)

print("\n")
print("mean:", np.mean(xgboost_accuracy))  # average accuracy over the folds


0.767
0.7933333333333333
0.798
0.7826666666666666
0.7926666666666666


mean: 0.7867333333333333


#모델돌리기

In [None]:
from xgboost import XGBClassifier  # model

# NOTE(review): max_depth is 28 here while the cross-validation cell uses 10
# - confirm which value is intended.
xgboost_model = XGBClassifier(n_estimators=1000, random_state=42, n_jobs=-1, max_depth=28)

# Train on the full training set.
xgboost_model.fit(train_x, train_y)

# Predict on test and keep column 1 of the probability matrix
# (presumably the probability of class 1 - verify against the label values).
class_probabilities = xgboost_model.predict_proba(test)
pred = class_probabilities[:, 1]

print(pred)
print(len(pred))

[0.0067703  0.982239   0.99939215 ... 0.99940586 0.00516763 0.67575353]
35452


#제출파일만들기

In [None]:
# Load the sample submission file; it is used as the template for the output.
submission = pd.read_csv('/content/gdrive/MyDrive/competition_data/sample_submission.csv')

# Display the template as the cell output.
submission

Unnamed: 0,index,nerdiness
0,0,-1
1,1,-1
2,2,-1
3,3,-1
4,4,-1
...,...,...
35447,35447,-1
35448,35448,-1
35449,35449,-1
35450,35450,-1


In [None]:
# Replace the `nerdiness` column of the template with the values in `pred`,
# then display the result as the cell output.
submission["nerdiness"] = pred
submission

Unnamed: 0,index,nerdiness
0,0,0.006770
1,1,0.982239
2,2,0.999392
3,3,0.937371
4,4,0.993740
...,...,...
35447,35447,0.994856
35448,35448,0.795475
35449,35449,0.999406
35450,35450,0.005168


In [None]:
# Write the submission to CSV without the DataFrame's row index.
submission.to_csv("baseline5555.csv", index = False)