In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from sklearn.ensemble import RandomForestClassifier
import datetime

In [2]:
# Columns earmarked for binning.
# NOTE(review): no cell visible in this notebook reads this list — confirm
# whether it is still needed.
columns_for_bins = [
    'Elevation',
    'Horizontal_Distance_To_Roadways',
]

**_Get data_**

In [3]:
# Load the training set; "Id" is dropped so that it is not used as a feature.
dataset = pd.read_csv('train.csv').drop(columns="Id")

**_Get test data_**

In [4]:
# Load the test set and keep its Ids aside: they are needed when the
# submission file is written, but must not be used as a feature.
test_data = pd.read_csv('test.csv')
id_column = test_data["Id"].copy(deep=True)
test_ids = test_data["Id"]
test_data.drop("Id", axis=1, inplace=True)

# Select exactly the training feature columns, in training order.
# This reads dataset's *current* columns, so the cell has to run before the
# feature-engineering cells add columns that test.csv does not contain.
feature_cols = dataset.columns.to_list()
feature_cols.remove("Cover_Type")

# Explicit copy: later cells add engineered columns to X_test, and assigning
# into a frame obtained by selecting from test_data raises
# SettingWithCopyWarning.
X_test = test_data[feature_cols].copy()

**_Feature engineering average distances_**

In [5]:
# New feature name -> the two source columns whose row-wise mean it holds.
average_features = {
    "Average_Elev_Vert_Hydr": ['Elevation', 'Vertical_Distance_To_Hydrology'],
    "Average_Hydro_Road": ['Horizontal_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways'],
    "Average_Hydro_Fire": ['Horizontal_Distance_To_Hydrology', 'Horizontal_Distance_To_Fire_Points'],
    "Average_Road_Fire": ['Horizontal_Distance_To_Roadways', 'Horizontal_Distance_To_Fire_Points'],
}

# Apply the same definitions to train and test so both frames gain the same
# columns in the same order.
for frame in (dataset, X_test):
    for feature_name, source_cols in average_features.items():
        frame[feature_name] = frame[source_cols].mean(axis=1)

**_Feature engineering average degrees_**

In [6]:
# Mean of the three hillshade readings (9am, noon, 3pm) for each row,
# added to both the training and the test frame.
hillshade_cols = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']
for frame in (dataset, X_test):
    frame["Average_Hillshade"] = frame[hillshade_cols].mean(axis=1)

**_Feature engineering Subtractions_**

In [7]:
def calc_diff(a, b):
    """Return the absolute difference between ``a`` and ``b``.

    Works on anything that supports subtraction and ``abs()``.
    """
    difference = b - a
    return abs(difference)
# Absolute pairwise differences between existing columns.
# New feature name -> (first column, second column).
diff_features = {
    'Substr_Hydro_Road': ('Horizontal_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways'),
    'Substr_Road_Fire': ('Horizontal_Distance_To_Roadways', 'Horizontal_Distance_To_Fire_Points'),
    'Substr_Aspect_Slope': ('Aspect', 'Slope'),
    'Substr_Elev_Vert': ('Elevation', 'Vertical_Distance_To_Hydrology'),
    'Substr_Hillshade_noon_3': ('Hillshade_Noon', 'Hillshade_3pm'),
    # NOTE(review): the name says "Road" but the second operand is the
    # fire-points distance. Left unchanged because switching the column
    # would change the model's inputs — confirm which one was intended.
    'Substr_Elev_Road': ('Elevation', 'Horizontal_Distance_To_Fire_Points'),
}

# calc_diff is applied to whole columns rather than through
# DataFrame.apply(axis=1): the values are the same, but a Python lambda is no
# longer called once per row (the test frame has several hundred thousand
# rows). The resulting dtype follows the source columns instead of the
# row-wise upcast.
for frame in (dataset, X_test):
    for feature_name, (first_col, second_col) in diff_features.items():
        frame[feature_name] = calc_diff(frame[first_col], frame[second_col])

def calc_diff_3(a, b, c):
    """Return ``|b - a - c|``: ``b`` minus both other arguments, as an absolute value."""
    remainder = b - a - c
    return abs(remainder)

# |roadways - hydrology - fire points| for each row. calc_diff_3 is applied to
# whole columns instead of through DataFrame.apply(axis=1), which avoids one
# Python call per row while producing the same values.
for frame in (dataset, X_test):
    frame['Substr_Horizontal_Dist'] = calc_diff_3(
        frame['Horizontal_Distance_To_Hydrology'],
        frame['Horizontal_Distance_To_Roadways'],
        frame['Horizontal_Distance_To_Fire_Points'],
    )

**_Feature Engineering Additions_**

In [8]:
def calc_add(a, b):
    """Return the sum of the two arguments, evaluated as ``b + a``."""
    total = b + a
    return total
# Pairwise sums of existing columns.
# New feature name -> (first column, second column).
add_features = {
    'Add_Hydro_Road': ('Horizontal_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways'),
    'Add_Hydro_Fire': ('Horizontal_Distance_To_Hydrology', 'Horizontal_Distance_To_Fire_Points'),
    'Add_Road_Fire': ('Horizontal_Distance_To_Roadways', 'Horizontal_Distance_To_Fire_Points'),
    # NOTE(review): the name says "road" but the second operand is the
    # fire-points distance. Left unchanged because switching the column
    # would change the model's inputs — confirm which one was intended.
    'Add_elev_road': ('Elevation', 'Horizontal_Distance_To_Fire_Points'),
}

# calc_add is applied to whole columns rather than through
# DataFrame.apply(axis=1): same values, without one Python call per row.
for frame in (dataset, X_test):
    for feature_name, (first_col, second_col) in add_features.items():
        frame[feature_name] = calc_add(frame[first_col], frame[second_col])

def calc_add_3(a, b, c):
    """Return the sum of the three arguments, evaluated as ``b + a + c``."""
    total = b + a + c
    return total

# Sum of the three horizontal distances (hydrology, roadways, fire points).
# calc_add_3 is applied to whole columns instead of through
# DataFrame.apply(axis=1), which avoids one Python call per row while
# producing the same values.
for frame in (dataset, X_test):
    frame['Add_Horizontal_Dist'] = calc_add_3(
        frame['Horizontal_Distance_To_Hydrology'],
        frame['Horizontal_Distance_To_Roadways'],
        frame['Horizontal_Distance_To_Fire_Points'],
    )

**_Euclidean distance_**

In [9]:
# Straight-line distance to hydrology: sqrt(horizontal^2 + vertical^2).
# The column name keeps its original spelling so that train and test match.
for frame in (dataset, X_test):
    horizontal_sq = frame['Horizontal_Distance_To_Hydrology'] ** 2
    vertical_sq = frame['Vertical_Distance_To_Hydrology'] ** 2
    frame['Distanse_to_Hydrolody'] = (horizontal_sq + vertical_sq) ** 0.5

**_Normalize data_**

In [10]:
columns_for_normalization = [
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
    'Horizontal_Distance_To_Roadways',
    'Add_Horizontal_Dist',
    'Add_elev_road',
]

# sklearn's normalize() is called with its defaults, which rescale each
# *row* (sample) to unit L2 norm across the listed columns — it is not a
# per-column scaler. It has no fitted state, so train and test are
# transformed independently in the same way.
for frame in (dataset, X_test):
    frame[columns_for_normalization] = normalize(frame[columns_for_normalization])

In [11]:
# Remove every training row flagged with Soil_Type7, Soil_Type8 or
# Soil_Type15. The three conditions are combined into one mask so the frame
# is modified in place only once.
soil_flags = dataset[['Soil_Type7', 'Soil_Type8', 'Soil_Type15']]
rows_to_remove = dataset.index[(soil_flags == 1).any(axis=1)]
dataset.drop(rows_to_remove, inplace=True)

**_Split training data by cover-type group_**

In [12]:
# Training subsets for the two specialist models: one restricted to cover
# types 1-2, one restricted to cover types 3, 4 and 6.
cover_type = dataset['Cover_Type']
df_train_1_2 = dataset[cover_type <= 2]
df_train_3_4_6 = dataset[cover_type.isin([3, 4, 6])]

# Full training set: features and target.
X = dataset.drop(columns="Cover_Type")
y = dataset['Cover_Type']

# Features and target for cover types 1-2.
X_train_1_2 = df_train_1_2.drop(columns="Cover_Type")
y_1_2 = df_train_1_2['Cover_Type']

# Features and target for cover types 3, 4 and 6.
X_train_3_4_6 = df_train_3_4_6.drop(columns="Cover_Type")
y_3_4_6 = df_train_3_4_6['Cover_Type']

In [13]:
# Sanity check: (rows, feature columns) of the full training matrix.
X.shape

(15119, 72)

In [14]:
# Sanity check: the target must have one entry per row of X.
y.shape

(15119,)

In [15]:
# Sanity check: the column count must equal X's, since models fitted on X
# are later asked to predict on X_test.
X_test.shape

(565892, 72)

In [16]:
from sklearn.ensemble import ExtraTreesClassifier
def two_largest_indices(inlist):
    """Return the positions of the largest and second-largest items.

    Parameters
    ----------
    inlist : sequence
        Indexable sequence of mutually comparable values (list, tuple,
        1-D array, ...).

    Returns
    -------
    (largest_index, second_largest_index) : tuple of int
        ``largest_index`` is the first position holding the maximum.
        ``second_largest_index`` is the position of the best remaining item
        (earliest position on ties), or ``-1`` when the input has fewer than
        two items. For an empty input the result is ``(0, -1)``, which is
        what this function has always returned in that case.

    Notes
    -----
    Candidates are compared with items of the sequence itself rather than
    with a running maximum seeded at 0, so zero and negative values are
    ranked correctly, a value tying the maximum can take second place, and a
    one-item input no longer reports the same position twice.
    """
    largest_index = 0
    second_largest_index = -1
    for i in range(1, len(inlist)):
        item = inlist[i]
        if item > inlist[largest_index]:
            # New maximum: the previous maximum becomes the runner-up.
            second_largest_index = largest_index
            largest_index = i
        elif second_largest_index == -1 or item > inlist[second_largest_index]:
            second_largest_index = i
    return largest_index, second_largest_index


# Hyper-parameters shared by all three ensembles; only the split criterion
# and the training subset differ between them.
shared_params = dict(
    n_estimators=500,
    random_state=42,
    max_depth=31,
    min_samples_split=2,
    max_features=12,
    n_jobs=-1,
)

# General model, trained on every cover type.
clf = ExtraTreesClassifier(criterion='entropy', **shared_params)
clf.fit(X, y)

# Specialist model, trained only on cover types 1 and 2.
clf_1_2 = ExtraTreesClassifier(criterion='gini', **shared_params)
clf_1_2.fit(X_train_1_2, y_1_2)

# Specialist model, trained only on cover types 3, 4 and 6.
clf_3_4_6 = ExtraTreesClassifier(criterion='gini', **shared_params)
clf_3_4_6.fit(X_train_3_4_6, y_3_4_6)


# Cache each model's test-set output, keyed by the row's position in X_test.

# Class probabilities from the cover-type 1-2 specialist.
vals_1_2 = dict(enumerate(clf_1_2.predict_proba(X_test)))

# Class probabilities from the cover-type 3/4/6 specialist.
vals_3_4_6 = dict(enumerate(clf_3_4_6.predict_proba(X_test)))

# Hard class predictions from the general model.
# NOTE(review): nothing in the visible cells reads `vals` — confirm whether
# this extra prediction pass is still needed.
vals = dict(enumerate(clf.predict(X_test)))
    

# Blend the general model's probabilities with down-weighted probabilities
# from the two specialist models, then write the winning class per test row.
with open("submission.csv", "w") as outfile:
    outfile.write("Id,Cover_Type\n")
    for row_number, blended in enumerate(clf.predict_proba(X_test)):
        probs_1_2 = vals_1_2[row_number]
        probs_3_4_6 = vals_3_4_6[row_number]

        # Positions 0 and 1 receive the 1-2 specialist's two outputs.
        blended[0] += probs_1_2[0]/1.3
        blended[1] += probs_1_2[1]/1.1
        # Positions 2, 3 and 5 receive the 3/4/6 specialist's three outputs.
        blended[2] += probs_3_4_6[0]/3.4
        blended[3] += probs_3_4_6[1]/4.0
        blended[5] += probs_3_4_6[2]/3.6

        # Only the top position is used; the runner-up is discarded.
        best_position, _ = two_largest_indices(blended)
        # assumes position k of the probability row is cover type k + 1 — TODO confirm against clf.classes_
        predicted_cover_type = best_position + 1
        outfile.write("%s,%s\n"%(test_ids[row_number], predicted_cover_type))

Combinations tried:
    - standard scaler

1 - Spruce/Fir

2 - Lodgepole Pine

3 - Ponderosa Pine

4 - Cottonwood/Willow

5 - Aspen

6 - Douglas-fir

7 - Krummholz