In [35]:
import pandas as pd
import numpy as np
import re
from scipy.stats import skew
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelEncoder

In [36]:
df = pd.read_csv("score_dasol_final.csv", encoding="cp949")

In [37]:
def preprocessiong(score):
    # 참여인원 (int) 숫자만 남기기
    score["참여인원"] = [k.replace("명","") for k in score["참여인원"]]
    score["참여인원"] = [k.replace("\r","") for k in score["참여인원"]]
    score["참여인원"] = [k.replace("\n","") for k in score["참여인원"]]
    score["참여인원"] = [int(k) for k in score["참여인원"]]
    
    # 글쎄요/보고싶어요 0인 데이터 지우기
    score = score[score["글쎄요"] != 0]
    score = score[score["보고싶어요"] != 0]
    
    score["관심인원"] = score["글쎄요"] + score["보고싶어요"]
    score["기대지수"] = score["보고싶어요"] / score["관심인원"]
    
    # id 한글만 남기고 / id 길이 변수 만들기
    score["id"] = [re.sub("[2, , :, ,]","", id) for id in score["id"]]
    score["id_length"] = [len(i) for i in score["id"]]
    
    # 런타임 (int) 숫자만 남기기
    score["런타임"] = [k.replace("분","") for k in score["런타임"]]
    score["런타임"] = [int(k) for k in score["런타임"]]
    
    # 등급 
    score["등급"] = [k.replace("관람가","") for k in score["등급"]]
    score["등급"] = [k.replace(" ","") for k in score["등급"]]
    score["등급"] = LabelEncoder().fit_transform(score["등급"])
    num_cols = score["등급"].max()
    
    for i in range(num_cols):
        col_name = "등급" + '_' + str(i)
        score[col_name] = score["등급"].apply(lambda x: 1 if x == i else 0)
        
    # 개봉월 / 개봉년도 나누기
    score["개봉년도"] = [k.split(".")[0] for k in score["개봉일"]]
    score["개봉년도"] = [int(k) for k in score["개봉년도"]]
    score["개봉월"] = [k.split(".")[1] for k in score["개봉일"]]
    score["개봉월"] = [int(k) for k in score["개봉월"]]

    # 주연 수 만들기
    score["주연수"] = [len(k.split(",")) for k in score["주연"]]
    score["조연수"] = [len(k.split(",")) for k in score["조연"]]
    score["배우수"] = score["주연수"] + score["조연수"]


    # 장르 string 필요없는 것 제거
    k = [i.replace("[","") for i in score["장르"]]
    k = [i.replace("]","") for i in k]
    k = [i.replace("'","") for i in k]
    k = [i.replace(" ","") for i in k]
    k = [i.split(",") for i in k] 
    score["장르"] = k
    genre_unique = np.unique([b for a in score["장르"] for b in a])
    result = []
    for b in score["장르"]:
        d=[]
        for a in genre_unique:
            d.append(int(a in b))
        result.append(d)
    genre_dummy = pd.DataFrame(result, columns=genre_unique)
    
    score["보고싶어요-2"] = score["보고싶어요"]**2
    score["기대지수-2"] = score["기대지수"]**2
    #score["참여인원-2"] = score["참여인원"]**2
    score["id_length-2"] = score["id_length"]**2
    
    return score, genre_dummy

In [38]:
score, dummy = preprocessiong(df)
score.index = [i for i in range(111)]
score = score.join(dummy)

In [41]:
def delete(score):
    del score["Unnamed: 0"]
    del score["장르"]
    del score["등급"]
    del score["개봉일"]
    del score["주연"]
    del score["조연"]
    del score["id"]
    del score["감독"]
delete(score)

In [44]:
y = score["y"]
score = score.drop("y", axis=1)

In [45]:
from sklearn.cross_validation import train_test_split

train_X, test_X = train_test_split(score, test_size = 0.2, random_state=1) # 독립변수만
train_Y, test_Y = train_test_split(y, test_size = 0.2, random_state=1) # 종속변수만

In [46]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(train_X)
train_X_std = sc.transform(train_X)
test_X_std = sc.transform(test_X)

In [47]:
from sklearn.ensemble import RandomForestClassifier
rf1 = RandomForestClassifier(criterion="entropy", n_estimators = 100, n_jobs=2, random_state=0)
rf1.fit(train_X_std, train_Y)
pred_rf1 = rf1.predict(test_X_std) 
print('Accuracy: %.2f' % accuracy_score(test_Y, pred_rf1))

Accuracy: 0.74


In [48]:
coef = pd.Series(rf1.feature_importances_, index = train_X.columns).sort_values(ascending=False)
coef.head(5)

보고싶어요      0.106417
기대지수       0.100114
보고싶어요-2    0.083005
글쎄요        0.074157
참여인원       0.073080
dtype: float64

In [60]:
from sklearn.cross_validation import cross_val_score

scores = cross_val_score(estimator=rf1,X=train_X,y=train_Y,cv=7,n_jobs=1) 
print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

CV accuracy scores: [ 0.76923077  0.61538462  0.69230769  0.84615385  0.61538462  1.
  0.72727273]
CV accuracy: 0.752 +/- 0.127


In [62]:
import statsmodels.api as sm

def stepwise_selection(X, y, 
                       initial_list=[], 
                       threshold_in=0.01, 
                       threshold_out = 0.05, 
                       verbose=True):
    included = list(initial_list)
    while True:
        changed=False
        # forward step
        excluded = list(set(X.columns)-set(included))
        new_pval = pd.Series(index=excluded)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.argmin()
            included.append(best_feature)
            changed=True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max() # null if pvalues is empty
        if worst_pval > threshold_out:
            changed=True
            worst_feature = pvalues.argmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included

In [63]:
result = stepwise_selection(train_X, train_Y)

print('resulting features:')
print(result)

Add  기대지수-2                         with p-value 0.00119124
Add  평점                             with p-value 0.00177555
Add  id_length                      with p-value 0.00161589
Add  개봉년도                           with p-value 0.00379809
resulting features:
['기대지수-2', '평점', 'id_length', '개봉년도']


In [64]:
score.columns

Index(['글쎄요', '보고싶어요', '참여인원', '평점', '런타임', '관심인원', '기대지수', 'id_length',
       '등급_0', '등급_1', '등급_2', '개봉년도', '개봉월', '주연수', '조연수', '배우수', '보고싶어요-2',
       '기대지수-2', 'id_length-2', 'SF', '가족', '공포', '느와르', '다큐멘터리', '드라마', '로맨스',
       '멜로', '모험', '미스터리', '범죄', '서사', '서스펜스', '스릴러', '액션', '전쟁', '코미디',
       '판타지'],
      dtype='object')

In [65]:
score_feature_select = score.drop([ '기대지수-2','평점', 'id_length', '개봉년도'], axis=1)

In [66]:
from sklearn.cross_validation import train_test_split

train_X, test_X = train_test_split(score_feature_select, test_size = 0.3, random_state=1) # 독립변수만
train_Y, test_Y = train_test_split(y, test_size = 0.3, random_state=1) # 종속변수만

In [68]:
rf2 = RandomForestClassifier(criterion="entropy", n_estimators = 100, n_jobs=2, random_state=0)
scores = cross_val_score(estimator=rf2,X=train_X,y=train_Y,cv=7,n_jobs=1)
print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

CV accuracy scores: [ 0.5         0.66666667  0.63636364  0.72727273  0.72727273  0.7         0.7       ]
CV accuracy: 0.665 +/- 0.074


In [None]:

from sklearn.ensemble import RandomForestClassifier
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)