In [20]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

trainDf = pd.read_csv("./train.csv", encoding="utf-8", names=[i for i in range(40)])
targetDf = pd.read_csv("./trainLabels.csv", encoding="utf-8", names=["target"])
testDf = pd.read_csv("./test.csv", encoding="utf-8")
print("看一下train的形狀: ", trainDf.shape)
print("看一下train target的形狀: ", targetDf.shape)
print("看一下test的形狀: ", testDf.shape)

看一下train的形狀:  (1000, 40)
看一下train target的形狀:  (1000, 1)
看一下test的形狀:  (8999, 40)


In [14]:
df = pd.concat([trainDf, targetDf], axis=1)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,target
0,0.299403,-1.226624,1.498425,-1.17615,5.289853,0.208297,2.404498,1.594506,-0.051608,0.663234,...,-0.62299,-1.833057,0.293024,3.552681,0.717611,3.305972,-2.715559,-2.682409,0.10105,1
1,-1.174176,0.332157,0.949919,-1.285328,2.199061,-0.151268,-0.427039,2.619246,-0.765884,-0.09378,...,0.012037,2.038836,0.468579,-0.517657,0.422326,0.803699,1.213219,1.382932,-1.817761,0
2,1.192222,-0.414371,0.067054,-2.233568,3.658881,0.089007,0.203439,-4.219054,-1.184919,-1.24031,...,0.750054,-3.360521,0.856988,-2.751451,-1.582735,1.672246,0.656438,-0.932473,2.987436,0
3,1.57327,-0.580318,-0.866332,-0.603812,3.125716,0.870321,-0.161992,4.499666,1.038741,-1.092716,...,1.275598,-3.48011,-1.065252,2.153133,1.563539,2.767117,0.215748,0.619645,1.883397,1
4,-0.613071,-0.644204,1.112558,-0.032397,3.490142,-0.011935,1.443521,-4.290282,-1.761308,0.807652,...,-1.803473,0.518579,-0.205029,-4.744566,-1.520015,1.830651,0.870772,-1.894609,0.408332,0


In [15]:
x_train, x_test, y_train, y_test = train_test_split(trainDf, targetDf, test_size=0.25, random_state=1)

rfc = RandomForestClassifier(n_estimators=100, criterion="gini", max_depth=None)
rfc.fit(x_train, y_train)
pred = rfc.predict(x_test)
acc = accuracy_score(y_test, pred)
print("準確率: ", acc)

準確率:  0.884


In [18]:
params = {
    "n_estimators":[i+1 for i in range(0,100,5)],
    "max_depth":[i+1 for i in range(20)]
}

grid = GridSearchCV(rfc, params,cv=10, n_jobs=4, verbose=1)
gridResult = grid.fit(x_train, y_train)

print(f'''
看一下最佳分數: {gridResult.best_score_}
看一下最佳參數組合: {gridResult.best_params_}
''')

Fitting 10 folds for each of 400 candidates, totalling 4000 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 200 tasks      | elapsed:    2.9s
[Parallel(n_jobs=4)]: Done 1254 tasks      | elapsed:   25.5s
[Parallel(n_jobs=4)]: Done 2244 tasks      | elapsed:   54.2s
[Parallel(n_jobs=4)]: Done 3644 tasks      | elapsed:  1.6min
[Parallel(n_jobs=4)]: Done 3993 out of 4000 | elapsed:  1.8min remaining:    0.1s
[Parallel(n_jobs=4)]: Done 4000 out of 4000 | elapsed:  1.8min finished

看一下最佳分數: 0.868
看一下最佳參數組合: {'max_depth': 11, 'n_estimators': 96}



In [19]:
rfc = RandomForestClassifier(n_estimators=96, criterion="gini",max_depth=11)
rfc.fit(x_train, y_train)
pred = rfc.predict(x_test)
acc = accuracy_score(y_test, pred)
print("使用最佳參數後的準確率: ", acc)

使用最佳參數後的準確率:  0.888


In [23]:
'''

試試看另外一種演算法

'''

gbc = GradientBoostingClassifier(n_estimators=100, max_depth=3)
gbc.fit(x_train, y_train)
pred = gbc.predict(x_test)
acc = accuracy_score(y_test, pred)
print("使用GB演算法 準確率: ", acc)

使用GB演算法 準確率:  0.876


In [25]:
grid = GridSearchCV(gbc, params, n_jobs=4, verbose=1)
gridResult = grid.fit(x_train, y_train)

print(f'''
看一下最佳分數: {gridResult.best_score_}
看一下最佳參數組合: {gridResult.best_params_}
''')

Fitting 5 folds for each of 400 candidates, totalling 2000 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  80 tasks      | elapsed:    2.6s
[Parallel(n_jobs=4)]: Done 404 tasks      | elapsed:   21.0s
[Parallel(n_jobs=4)]: Done 654 tasks      | elapsed:   46.2s
[Parallel(n_jobs=4)]: Done 1004 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 1454 tasks      | elapsed:  2.5min
[Parallel(n_jobs=4)]: Done 2000 out of 2000 | elapsed:  3.7min finished

看一下最佳分數: 0.8733333333333333
看一下最佳參數組合: {'max_depth': 5, 'n_estimators': 86}



In [26]:
gbc = GradientBoostingClassifier(n_estimators=86, max_depth=5)
gbc.fit(x_train, y_train)
pred = gbc.predict(x_test)
acc = accuracy_score(y_test, pred)
print("使用GB演算法並使用最佳參數 準確率: ", acc)

使用GB演算法並使用最佳參數 準確率:  0.892


In [28]:
'''

採用 GB 、參數使用 n_estimators=86, max_depth=5 

'''

df = pd.read_csv("./test.csv", encoding="utf-8", names=[i+1 for i in range(40)])

pred = gbc.predict(df)
pred

array([1, 0, 1, ..., 1, 0, 1], dtype=int64)

In [30]:
dict_ = {
    "Id":[i for i in range(1,9001)],
    "Solution":pred
}

df = pd.DataFrame(dict_)
df.head()

Unnamed: 0,Id,Solution
0,1,1
1,2,0
2,3,1
3,4,0
4,5,0


In [32]:
df.to_csv("./predict.csv", encoding="utf-8", index=False)