In [54]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler

In [55]:
train = pd.read_csv('train-data.csv', index_col="Id")
test = pd.read_csv('test-data.csv', index_col='Id')

# Preprocessing

In [56]:
#train = train.drop(['Soil_Type40', 'Wilderness_Area4'], axis=1)
#test = test.drop(['Soil_Type40', 'Wilderness_Area4'], axis=1)

##### Drop dummies that are not explaining anything

In [57]:

train = train.drop(['Soil_Type15', 'Wilderness_Area4', 'Soil_Type7', 'Soil_Type8', 
                    'Soil_Type9', 'Soil_Type21', 'Soil_Type25', 'Soil_Type27', 
                    'Soil_Type28', 'Soil_Type36', 'Soil_Type19','Soil_Type26',
                   'Soil_Type34','Soil_Type37'], axis=1)
test = test.drop(['Soil_Type15', 'Wilderness_Area4', 'Soil_Type7', 'Soil_Type8', 
                    'Soil_Type9', 'Soil_Type21', 'Soil_Type25', 'Soil_Type27', 
                  'Soil_Type28', 'Soil_Type36','Soil_Type19','Soil_Type26',
                   'Soil_Type34','Soil_Type37'], axis=1)

In [58]:
# Convert to X_train, X_test, y_test
y_train = train['Cover_Type']
X_train = train.drop('Cover_Type', axis =1)
X_test = test

# Modeling

#### Simple KNN, K=1

In [59]:

neigh = KNeighborsClassifier(algorithm='ball_tree', leaf_size=2, metric='manhattan',
           metric_params=None, n_jobs=1, n_neighbors=1, p=1,
           weights='uniform')
neigh.fit(X_train, y_train)
y_predicted_knn = neigh.predict(X_test)
# score 0.68466

#### SVM Resulting from grid search

In [60]:


clf = SVC(C=6.25, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
clf.fit(X_train, y_train)
y_predicted_svm = clf.predict(X_test)
# 0.73666 still need to be tuned...

#### RandomForest from grid search

In [43]:
RF = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
RF.fit(X_train, y_train)
y_predicted_rf = RF.predict(X_test)
#0.60600

#### Gradient Boosting from grid search:

In [61]:
grad = GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=10,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
grad.fit(X_train, y_train)
y_predicted_grad = grad.predict(X_test)
#score 0.649

In [63]:
df_y = pd.DataFrame({'knn':y_predicted_knn, 'svm':y_predicted_svm, 'grad':y_predicted_grad})
df_y

Unnamed: 0,grad,knn,svm
0,7,7,1
1,1,7,7
2,1,7,7
3,1,7,7
4,1,1,1
5,1,1,1
6,1,1,1
7,1,1,1
8,1,1,1
9,1,1,1


#### Building a voting classifier

In [64]:
def vote_label(knn, svm, grad):
    labels = [1, 2, 3, 4, 5, 6, 7]
    count = 0
    votes = [0, 0, 0, 0, 0, 0, 0]
    for i in labels:
        if knn == i:
            votes[count] = votes[count] + 0.68466
        if svm == i:
            votes[count] = votes[count] + 0.73666
        #if rf == i:
            #votes[count] = votes[count] + 0.60600
        if grad ==i:
            votes[count] = votes[count] + 0.64999
        count = count+1
    max_index = votes.index(max(votes))
    return labels[max_index]
df_y['total'] = df_y.apply(lambda x: vote_label(x['knn'], x['svm'], x['grad']), axis=1) 
#0.68466 0.73666 0.60600 0.64999

In [65]:
df_y

Unnamed: 0,grad,knn,svm,total
0,7,7,1,7
1,1,7,7,7
2,1,7,7,7
3,1,7,7,7
4,1,1,1,1
5,1,1,1,1
6,1,1,1,1
7,1,1,1,1
8,1,1,1,1
9,1,1,1,1


In [None]:
from sklearn.ensemble import VotingClassifier
vote = VotingClassifier(estimators=[
        ('knn', neigh), ('svm', clf), ('rf', RF)], voting='hard', weights=[0.68466, 0.73666, 0.60600])
vote.fit(X_train, y_train)
y_predicted = vote.predict(X_test)

# Save to csv file

In [66]:
y_predicted = [int(i) for i in df_y['total'] ]
df_to_csv = pd.DataFrame({'Cover_Type':y_predicted}, index=test.index)
df_to_csv.head()

Unnamed: 0_level_0,Cover_Type
Id,Unnamed: 1_level_1
15121,7
15122,7
15123,7
15124,7
15125,1


In [67]:
df_to_csv.to_csv('answer.csv')

In [68]:
df_to_csv['Cover_Type'].value_counts()

2    10494
1     6279
6     1216
3      890
5      708
7      411
4        2
Name: Cover_Type, dtype: int64

## Tests and gridsearch:

In [None]:
xtrain, xtest,ytrain,ytest = train_test_split(X_train, y_train, test_size = 0.2)

In [None]:
scores = []
for k in range(1, len(xtest), 4):
    print(k)
    neigh = KNeighborsClassifier(algorithm='ball_tree', leaf_size=20, metric='manhattan',
           metric_params=None, n_jobs=-1, n_neighbors=k, p=1,
           weights='uniform')
    neigh.fit(xtrain, ytrain)
    y_predicted = neigh.predict(xtest)
    res = y_predicted - ytest
    res=list(res)
    scores.append(res.count(0)/len(ytest))

In [20]:
%%time
import os
# Grid search SVM
parameters = {'gamma':[0.0001, 0.00001], 'C':[5, 5.5, 6, 6.5, 7, 7.5, 8]}
svc = SVC(kernel='rbf')
clf = GridSearchCV(svc, parameters, n_jobs=-1)
clf.fit(X_train, y_train)
os.system('say "Gridsearch nullissime, ca ne score même pas 0.5"')

CPU times: user 7.61 s, sys: 169 ms, total: 7.78 s
Wall time: 4min 11s


In [21]:
clf.best_estimator_

SVC(C=8, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1e-05, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [None]:
%%time
parameters = {'n_neighbors':[1, 2, 3, 4], 'leaf_size':[1, 2, 3, 4]}
neigh = KNeighborsClassifier(algorithm='ball_tree', metric='manhattan',
           metric_params=None, p=1,
           weights='uniform')
neighcv = GridSearchCV(neigh, parameters, n_jobs=-1)
neighcv.fit(X_train, y_train)

In [None]:
neighcv.best_estimator_

In [None]:
RF = RandomForestClassifier(n_jobs=-1)
RF.get_params()

In [None]:
%%time
parameters = {'min_impurity_decrease':[0, 1, 10, 100], 'min_samples_leaf':[1, 10, 100, 1000], 
              'min_samples_split':[2, 10, 100], 'n_estimators':[1, 10, 100, 1000]}
RFcv = GridSearchCV(RF, parameters, n_jobs=-1)
RFcv.fit(X_train, y_train)

In [None]:
RFcv.best_estimator_

In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
%%time
clf = SGDClassifier(loss="hinge", penalty="l2")
parameters = {'alpha': [0.0001, 0.00001], 'power_t':[2, 3, 4, 5, 6, 7, 8, 9, 10]}
clfcv = GridSearchCV(clf, parameters, n_jobs=-1)
clfcv.fit(X_train, y_train)

In [None]:
clfcv.best_estimator_

In [None]:
from sklearn.kernel_approximation import RBFSampler
rbf_feature = RBFSampler(gamma=1, random_state=1)
parameters = {'alpha': [0.0001, 0.00001], 'power_t':[2, 3, 4, 5, 6, 7, 8, 9, 10]}
rbf_featurecv = GridSearchCV(clf, parameters, n_jobs=-1)
rbf_featurecv.fit(X_train, y_train)
#X_features = rbf_feature.fit_transform(X)

In [None]:
%%time
from sklearn.ensemble import BaggingClassifier
bagging = BaggingClassifier(SVC(C=6.25, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False), max_samples=0.5, max_features=0.5)
parameters = {'max_samples':[0.1, 0.25, 0.5, 0.75, 1], 'max_features':[0.1, 0.25, 0.5, 0.75, 1]}
baggingcv = GridSearchCV(bagging, parameters, n_jobs=-1)
baggingcv.fit(X_train, y_train)

In [None]:
baggingcv.best_estimator_

In [5]:
from sklearn.ensemble import GradientBoostingClassifier

In [14]:
grad = GradientBoostingClassifier()
grad.get_params()

{'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'presort': 'auto',
 'random_state': None,
 'subsample': 1.0,
 'verbose': 0,
 'warm_start': False}

In [24]:
%%time
parameters = {'max_depth': [10, 12.5, 15], 'n_estimators':[100]}
gradcv = GridSearchCV(grad, parameters, n_jobs=-1)
gradcv.fit(X_train, y_train)

CPU times: user 1min 27s, sys: 901 ms, total: 1min 28s
Wall time: 5min 50s


In [25]:
gradcv.best_estimator_

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=10,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [16]:
from sklearn.ensemble import AdaBoostClassifier

In [17]:
ada = AdaBoostClassifier()

In [19]:
ada.get_params()

{'algorithm': 'SAMME.R',
 'base_estimator': None,
 'learning_rate': 1.0,
 'n_estimators': 50,
 'random_state': None}

In [23]:
%%time
parameters = {'learning_rate': [0.1,0.25, 0.5, 0.75, 1], 'n_estimators':[50, 75, 100, 1000]}
adacv = GridSearchCV(ada, parameters, n_jobs=-1)
adacv.fit(X_train, y_train)

CPU times: user 1.44 s, sys: 92.1 ms, total: 1.53 s
Wall time: 1min 21s


In [24]:
adacv.best_estimator_

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
          n_estimators=50, random_state=None)

In [31]:
for i in X_train.shape[1]:
    print(summary(X_train[][i]))

SyntaxError: invalid syntax (<ipython-input-31-0a1e2aef5725>, line 2)

In [75]:
X_test.shape

(20000, 44)

In [35]:
for i in X_test.columns.values:
    print(i)
    print(X_train[i].value_counts())
    print('__________________')

Elevation
2290    25
2830    25
3371    24
3244    23
2820    23
2955    23
2795    23
2952    23
2962    22
2304    22
2809    22
2978    22
2413    22
2707    22
2850    22
2763    22
2289    21
2739    21
2827    21
2784    21
2807    21
2328    21
2311    20
3256    20
2751    20
2336    20
2340    20
2317    20
2264    20
3400    20
        ..
3529     1
3537     1
3722     1
3495     1
3491     1
3721     1
1932     1
3559     1
3753     1
3761     1
3591     1
3475     1
3499     1
3836     1
1997     1
3844     1
1901     1
1925     1
1965     1
1973     1
1989     1
3731     1
3523     1
3675     1
3643     1
3635     1
3603     1
3555     1
3497     1
3737     1
Name: Elevation, Length: 1665, dtype: int64
__________________
Aspect
45     117
0      110
90     109
63      89
76      87
27      82
315     81
75      80
108     79
117     78
34      77
72      77
121     77
135     75
80      75
57      75
53      74
62      73
124     71
86      71
61      71
111     70
18     

In [None]:
# Il faut drop Soil_Type 15 et Soil_Type7
# Il faut aussi drop Soil_Type8, Soil_Type9, Soil_Type21, 25, 27, 28, 36


In [48]:
X_train['Soil_Type1'].value_counts()

0    14765
1      355
Name: Soil_Type1, dtype: int64