## import

In [4]:
import re
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, scale
from sklearn.cross_validation import train_test_split, cross_val_score, cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import*
from imblearn.combine import SMOTEENN

## 데이터 불러오기

In [6]:
# Load the five input tables; every CSV is decoded with the cp949 codec.
# The generator reads the files in the listed order and the tuple target
# binds each resulting frame to its module-level name.
(
    score,
    score_count,
    score_count_stop,
    score_tfidf,
    score_tfidf_stop,
) = (
    pd.read_csv(_csv_name, encoding="cp949")
    for _csv_name in (
        "score.csv",
        "score_count.csv",
        "score_count_stop.csv",
        "js_w2v_tfidf.csv",
        "jaeseok_w2v_tfidf_stopwords.csv",
    )
)

In [7]:
# Remove the "Unnamed: 0" column from every frame (presumably a saved index
# column -- verify against the CSVs). Only the four derived frames also have
# their "title" column removed; `score` keeps everything else.
del score["Unnamed: 0"]
for _frame in (score_count, score_count_stop, score_tfidf, score_tfidf_stop):
    for _column in ("Unnamed: 0", "title"):
        del _frame[_column]

## 모델 1 

In [None]:
def Modeling1(score):
    """Grid-search five classifiers on ``score`` and evaluate them on a hold-out split.

    The frame is split 80/20 (``random_state=0``). Each classifier is tuned
    with a 10-fold ``GridSearchCV`` (scoring ``'f1'``) on the training part,
    then the refitted best model predicts the test part.

    Parameters
    ----------
    score : pandas.DataFrame
        Feature table containing a label column named ``"y"``; all other
        columns are used as features. The ``'f1'`` scorer assumes binary
        labels -- TODO confirm against the data.

    Returns
    -------
    parameters : list of dict
        ``best_params_`` of each grid search, in classifier order
        (KNN, L2 logistic, L1 logistic, decision tree, random forest).
    val_score : list of float
        ``best_score_`` (cross-validated f1) of each grid search.
    f1_test : list of str
        ``classification_report`` text for each model on the test split.
    acc_test : list of float
        Test-split accuracy of each model.
    """
    X = score.drop("y", axis=1)
    y = score["y"]

    # Split features and labels in one call so the rows are guaranteed to
    # stay aligned (two separate calls only agree because of the shared seed).
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)

    # Hyper-parameter grids, one per classifier, in the same order as `clfs`.
    param = [
        {"n_neighbors": [2, 3, 4, 5, 6], "p": [1, 2, 3],
         "weights": ["uniform", "distance"]},
        {"C": [0.01, 0.03, 0.1, 0.2, 0.3, 0.4, 0.6, 1, 3]},
        {"C": [0.01, 0.03, 0.1, 0.3, 1, 3, 6, 9]},
        {"criterion": ["gini", "entropy"], "max_depth": [4, 5, 6, 7, 8, 9, 10]},
        {"max_depth": [12, 13, 14, 15, 16],
         "min_samples_split": [4, 5, 6, 7, 8],
         "min_samples_leaf": [3, 4, 5, 6, 7],
         "max_features": [5, 6, 7, "sqrt"],
         "criterion": ["gini", "entropy"]},
    ]

    # Models.
    clfs = [
        KNeighborsClassifier(),
        LogisticRegression(penalty='l2', class_weight='balanced', random_state=0),
        # The L1 penalty needs a solver that supports it; liblinear was the
        # historical default, so naming it keeps old results and works on
        # releases whose default solver rejects penalty='l1'.
        LogisticRegression(penalty='l1', solver='liblinear',
                           class_weight='balanced', random_state=0),
        DecisionTreeClassifier(random_state=0, criterion='entropy'),
        RandomForestClassifier(random_state=0, class_weight="balanced"),
    ]

    # Display names aligned with `clfs`.
    clf_names = ['KNN', 'L2', 'L1', 'DT', 'RF']

    parameters = []
    val_score = []
    f1_test = []
    acc_test = []

    # 10-fold CV inside the training split (train/validation folds), for
    # every classifier with its own grid.
    for name, clf, grid in zip(clf_names, clfs, param):
        # GridSearchCV clones and fits the estimator itself, so no separate
        # pre-fit of `clf` is needed.
        model = GridSearchCV(clf, grid, n_jobs=5, cv=10, scoring='f1')  # accuracy is another option
        model.fit(X_train, Y_train)

        parameters.append(model.best_params_)
        val_score.append(model.best_score_)

        # Evaluate the refitted best estimator on the held-out test split.
        pred = model.predict(X_test)
        report = classification_report(Y_test, pred)
        print("finish :", name)
        print(report)
        f1_test.append(report)
        acc_test.append(np.mean(pred == Y_test))

    return parameters, val_score, f1_test, acc_test

In [4]:
# Run Modeling1 on the `score` frame. The four targets receive, in order,
# the best parameters, the CV scores, the test classification reports and
# the test accuracies.
param, score_train_scores, st,sa= Modeling1(score)

In [None]:
# Run Modeling1 on `score_count`; targets are (best params, CV scores,
# test classification reports, test accuracies).
score_count_param, score_count_train_scores, sst, ssa = Modeling1(score_count)

In [None]:
# Run Modeling1 on `score_count_stop`; targets are (best params, CV scores,
# test classification reports, test accuracies).
score_count_stop_param, score_count_stop_train_scores, ssst, sssa = Modeling1(score_count_stop)

In [None]:
# Run Modeling1 on `score_tfidf`; targets are (best params, CV scores,
# test classification reports, test accuracies).
score_tfidf_param, score_tfidf_train_scores, sssst,ssssa= Modeling1(score_tfidf)

In [None]:
# Run Modeling1 on `score_tfidf_stop`; targets are (best params, CV scores,
# test classification reports, test accuracies).
score_tfidf_stop_param, score_tfidf_stop_train_scores,ssssst,sssssa= Modeling1(score_tfidf_stop)

## 모델 2

In [None]:
def Modeling2(score):
    """Grid-search five classifiers on an upsampled training split of ``score``.

    The frame is split 80/20 (``random_state=0``). Within the training part,
    rows with ``y == 1`` are resampled with replacement until they match the
    number of ``y == 0`` rows. Each classifier is then tuned with a 10-fold
    ``GridSearchCV`` (scoring ``'f1'``) and evaluated on the untouched test
    part.

    Parameters
    ----------
    score : pandas.DataFrame
        Feature table with a label column ``"y"`` holding 0/1 values. The
        code treats ``y == 0`` as the majority class -- TODO confirm.

    Returns
    -------
    parameters : list of dict
        ``best_params_`` of each grid search, in classifier order
        (KNN, L2 logistic, L1 logistic, decision tree, random forest).
    scores : list of float
        ``best_score_`` (cross-validated f1) of each grid search.
    f1 : list of str
        ``classification_report`` text for each model on the test split.
    acc : list of float
        Test-split accuracy of each model. Returned so the result can be
        unpacked into four names, the same shape Modeling1 returns.
    """
    # `resample` is not provided by the file-level imports, so bring it into
    # scope here.
    from sklearn.utils import resample

    train, test = train_test_split(score, test_size=0.2, random_state=0)

    X_test = test.drop("y", axis=1)
    Y_test = test["y"]

    # Upsampling: duplicate minority rows (with replacement) until both
    # classes have the same number of training rows.
    df_majority = train[train.y == 0]
    df_minority = train[train.y == 1]
    df_minority_upsampled = resample(df_minority, replace=True,
                                     n_samples=len(df_majority), random_state=0)
    # Combine the majority class with the upsampled minority class.
    train = pd.concat([df_majority, df_minority_upsampled])

    X_train = train.drop("y", axis=1)
    Y_train = train["y"]

    # NOTE(review): upsampling happens before cross-validation, so copies of
    # the same minority row can land in both the fitting and validation
    # folds; the CV scores are therefore likely optimistic. The test split is
    # not affected.

    # Hyper-parameter grids, one per classifier, in the same order as `clfs`.
    param = [
        {"n_neighbors": [2, 3, 4, 5, 6], "p": [1, 2, 3],
         "weights": ["uniform", "distance"]},
        {"C": [0.01, 0.03, 0.1, 0.2, 0.3, 0.4, 0.6, 1, 3]},
        {"C": [0.01, 0.03, 0.1, 0.3, 1, 3, 6, 9]},
        {"criterion": ["gini", "entropy"], "max_depth": [4, 5, 6, 7, 8, 9, 10]},
        {"max_depth": [12, 13, 14, 15, 16],
         "min_samples_split": [4, 5, 6, 7, 8],
         "min_samples_leaf": [3, 4, 5, 6, 7],
         "max_features": [5, 6, 7, "sqrt"],
         "criterion": ["gini", "entropy"]},
    ]

    # Models.
    clfs = [
        KNeighborsClassifier(),
        LogisticRegression(penalty='l2', class_weight='balanced', random_state=0),
        # The L1 penalty needs a solver that supports it; liblinear was the
        # historical default, so naming it keeps old results and works on
        # releases whose default solver rejects penalty='l1'.
        LogisticRegression(penalty='l1', solver='liblinear',
                           class_weight='balanced', random_state=0),
        DecisionTreeClassifier(random_state=0, criterion='entropy'),
        RandomForestClassifier(random_state=0, class_weight="balanced"),
    ]

    # Display names aligned with `clfs`.
    clf_names = ['KNN', 'L2', 'L1', 'DT', 'RF']

    parameters = []
    scores = []
    f1 = []
    acc = []

    # 10-fold CV for every classifier with its own grid.
    for name, clf, grid in zip(clf_names, clfs, param):
        # GridSearchCV clones and fits the estimator itself, so no separate
        # pre-fit of `clf` is needed.
        model = GridSearchCV(clf, grid, n_jobs=5, cv=10, scoring='f1')
        model.fit(X_train, Y_train)

        parameters.append(model.best_params_)
        scores.append(model.best_score_)

        # Evaluate the refitted best estimator on the held-out test split.
        pred = model.predict(X_test)
        report = classification_report(Y_test, pred)
        print("finish :", name)
        print(report)
        f1.append(report)
        acc.append(np.mean(pred == Y_test))

    return parameters, scores, f1, acc

In [4]:
# Run Modeling2 (training split with the minority class upsampled) on
# `score`. This rebinds the names assigned by the Modeling1 run on `score`,
# and the four-target unpacking requires Modeling2 to return four values.
param, score_train_scores, st,sa= Modeling2(score)

In [None]:
# Run Modeling2 on `score_count`; rebinds the names from the matching
# Modeling1 run. The unpacking requires Modeling2 to return four values.
score_count_param, score_count_train_scores, sst, ssa = Modeling2(score_count)

In [None]:
# Run Modeling2 on `score_count_stop`; rebinds the names from the matching
# Modeling1 run. The unpacking requires Modeling2 to return four values.
score_count_stop_param, score_count_stop_train_scores, ssst, sssa = Modeling2(score_count_stop)

In [None]:
# Run Modeling2 on `score_tfidf`; rebinds the names from the matching
# Modeling1 run. The unpacking requires Modeling2 to return four values.
score_tfidf_param, score_tfidf_train_scores, sssst,ssssa= Modeling2(score_tfidf)

In [None]:
# Run Modeling2 on `score_tfidf_stop`; rebinds the names from the matching
# Modeling1 run. The unpacking requires Modeling2 to return four values.
score_tfidf_stop_param, score_tfidf_stop_train_scores,ssssst,sssssa= Modeling2(score_tfidf_stop)