In [None]:
import pandas as pd
import numpy as np

"../input/"
# Load the training and test tables; the first CSV column becomes the row index.
X_full = pd.read_csv("../input/learn-together/train.csv", index_col=0)
test = pd.read_csv("../input/learn-together/test.csv", index_col=0)

# Name of the label column.
TARGET = 'Cover_Type'

# Labels come from the raw frame; X is an independent working copy that
# still contains the TARGET column at this point.
y = X_full[TARGET]
X = X_full.copy()

In [None]:
print(X.shape)
X.head()

In [None]:
y.value_counts()

There are 54 columns or features and 15120 observations.
The dataset is balanced: the label (Cover_Type) takes only 7 values, and each type has the same sample size (2160 rows).

It is a multi classification problem.

In [None]:
X.dtypes

Excellent!! all features are numeric
`Soil_TypeX` and `Wilderness_AreaX` are OHE features of categorical variables. Hence they are binary columns.

In [None]:
X.describe()

In [None]:
X.isna().sum().sum()

There are no NaNs!!!
Let's train our first model.


## Distances analysis

Distances features are: 
- Vertical_Distance_To_Hydrology
- Horizontal_Distance_To_Hydrology
- Horizontal_Distance_To_Roadways
- Horizontal_Distance_To_Fire_Points

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Combine the vertical and horizontal distances to hydrology into one
# Euclidean distance, stored as a new column of X.
# NOTE(review): euclidean() further down recomputes this same column.
X['Euclidean_distance_to_hydro'] = (X.Vertical_Distance_To_Hydrology**2 + X.Horizontal_Distance_To_Hydrology**2)**.5

# One row of three panels that share both axes.
f, axes = plt.subplots(1, 3, figsize=(15, 5), sharex=True, sharey=True)

# Left: horizontal distance, middle: vertical distance, right: Euclidean distance.
sns.distplot(X.Horizontal_Distance_To_Hydrology, color="b", ax=axes[0])
sns.distplot(X.Vertical_Distance_To_Hydrology, color="b", ax=axes[1])
sns.distplot(X['Euclidean_distance_to_hydro'], color="g", ax=axes[2])


### Interpretation
The first plot (horizontal distance to hydrology)
As expected, vegetation seems to be more abundant near hydrology.

The second plot (vertical distance): it seems that the negative values could be vegetation above the hydrology and the positive ones vegetation below it. Most of the vegetation is downhill from hydrology, with a huge amount concentrated near 0 (high kurtosis), which means that much of the vegetation is at almost the same level as the water.

When calculating the Euclidean distance to hydrology as a heuristic measure, we see that our third graph looks like the first. This is because the horizontal distance has a wider distribution than the vertical distance, where almost all values are close to zero. However, this Euclidean distance is also better suited to the line, which will improve our model.

In [None]:
def euclidean(df):
    """Add the ``Euclidean_distance_to_hydro`` column to ``df``.

    The value is the square root of the sum of the squared vertical and
    horizontal distances to hydrology. ``df`` is modified in place and
    returned so the call can be used in an assignment.
    """
    vertical = df['Vertical_Distance_To_Hydrology']
    horizontal = df['Horizontal_Distance_To_Hydrology']
    squared_distance = vertical ** 2 + horizontal ** 2
    df['Euclidean_distance_to_hydro'] = squared_distance ** .5
    return df

X = euclidean(X)
test = euclidean(test)


In [None]:
from itertools import combinations

def distances(df):
    """Add combined distance features to ``df`` (in place) and return it.

    From the three horizontal distance columns (roadways, fire points,
    hydrology) this adds the overall mean and sum, the pairwise means,
    the pairwise sums and the pairwise differences. From ``Elevation`` and
    ``Vertical_Distance_To_Hydrology`` it adds their difference and sum.
    Columns are appended in a fixed order.
    """
    road = 'Horizontal_Distance_To_Roadways'
    fire = 'Horizontal_Distance_To_Fire_Points'
    hydro = 'Horizontal_Distance_To_Hydrology'
    horizontal = [road, fire, hydro]

    # (name suffix, columns) for every pair of horizontal distances.
    pairs = [
        ('road_fire', [road, fire]),
        ('hydro_fire', [fire, hydro]),
        ('road_hydro', [road, hydro]),
    ]

    df['distance_mean'] = df[horizontal].mean(axis=1)
    df['distance_sum'] = df[horizontal].sum(axis=1)

    for suffix, pair in pairs:
        df['distance_' + suffix] = df[pair].mean(axis=1)

    for suffix, pair in pairs:
        df['distance_sum_' + suffix] = df[pair].sum(axis=1)

    # (new column, minuend, subtrahend)
    differences = [
        ('distance_dif_road_fire', road, fire),
        ('distance_dif_hydro_road', hydro, road),
        ('distance_dif_hydro_fire', hydro, fire),
    ]
    for name, left, right in differences:
        df[name] = df[left] - df[right]

    # Vertical measures.
    elevation_col = 'Elevation'
    vertical_col = 'Vertical_Distance_To_Hydrology'
    df['Vertical_dif'] = df[elevation_col] - df[vertical_col]
    df['Vertical_sum'] = df[[elevation_col, vertical_col]].sum(axis=1)

    return df

X = distances(X)
test = distances(test)

## Shade analysis

In [None]:
# One row of three panels that share both axes, one per hillshade column.
f, axes = plt.subplots(1, 3, figsize=(15, 5), sharex=True, sharey=True)

# Left: 9am, middle: noon, right: 3pm.
sns.distplot(X['Hillshade_9am'], color="y", ax=axes[0])
sns.distplot(X['Hillshade_Noon'], color="b", ax=axes[1])
sns.distplot(X['Hillshade_3pm'], color="g", ax=axes[2])


In [None]:
X[['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']].kurt()

### Interpretation

Here we can see the variation in the amount of sunlight among three different day hours.
Between 9 am and noon, we see how the sunlight is increasing with a high positive kurtosis (>1) a huge peak in approx 225, that is almost the max value measurable (254). 

By 3 pm, there is a significant reduction of the light in some zones (maybe caused by some hill); now the kurtosis is close to 0.


In [None]:
def shade(df):
    """Add hillshade difference, sum and mean columns to ``df`` and return it.

    ``shade_noon_diff`` is 9am minus noon, ``shade_3pm_diff`` is noon minus
    3pm and ``shade_all_diff`` is 9am minus 3pm. ``shade_sum`` and
    ``shade_mean`` aggregate the three hillshade columns row-wise.
    The frame is modified in place.
    """
    morning = df['Hillshade_9am']
    noon = df['Hillshade_Noon']
    afternoon = df['Hillshade_3pm']

    df['shade_noon_diff'] = morning - noon
    df['shade_3pm_diff'] = noon - afternoon
    df['shade_all_diff'] = morning - afternoon

    all_three = df[['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']]
    df['shade_sum'] = all_three.sum(axis=1)
    df['shade_mean'] = all_three.mean(axis=1)
    return df

X = shade(X)
test = shade(test)

## Soil analysis

0=unknown or complex, 1=rubbly, 2=stony, 3=very stony, 4=extremely stony or bouldery


In [None]:
# Map every soil type id (1..40) to a rockiness level.
# The level is the position of the group in `soils`:
# 0=unknown or complex, 1=rubbly, 2=stony,
# 3=very stony, 4=extremely stony or bouldery
soils = [
    [7, 15, 8, 14, 16, 17, 19, 20, 21, 23],   # unknown and complex
    [3, 4, 5, 10, 11, 13],                    # rubbly
    [6, 12],                                  # stony
    [2, 9, 18, 26],                           # very stony
    [1, 24, 25, 27, 28, 29, 30, 31, 32, 33,
     34, 36, 37, 38, 39, 40, 22, 35],         # extremely stony and bouldery
]

# soil type id -> rockiness level
soil_dict = {
    soil_id: level
    for level, group in enumerate(soils)
    for soil_id in group
}
        
        
def soil(df, soil_dict=soil_dict):
    """Add a ``Rocky`` column holding the rockiness level of each row.

    The one-hot columns ``Soil_Type1`` .. ``Soil_Type40`` are collapsed into a
    single soil type id (each column weighted by its number and summed), and
    that id is translated through ``soil_dict``. Ids missing from
    ``soil_dict`` become NaN. ``df`` is modified in place and returned.
    """
    soil_id = sum(df['Soil_Type' + str(number)] * number for number in range(1, 41))
    df['Rocky'] = soil_id.map(soil_dict)

    return df

X = soil(X)
test = soil(test)

In [None]:
sns.violinplot(x=TARGET, y='Rocky', data=X)


f, axes = plt.subplots(1, 1, figsize=(15, 15), 
                       sharex=True, sharey=True)
sns.scatterplot(x=TARGET, y='Elevation', 
                hue='Rocky', data=X)

# Rows of cover type 7 whose soil is NOT in the rockiest group (Rocky == 4).
c_7 = (X.Cover_Type == 7)
r_4 = X.Rocky != 4

# Decode the one-hot soil columns into a standalone Series instead of adding a
# 'Soil_Type' column to X: X later becomes the training matrix, `test` never
# receives such a column, and selecting the same columns from `test` would
# then fail with a KeyError.
soil_type = sum(X['Soil_Type' + str(i)] * i for i in range(1, 41)).rename('Soil_Type')

# A single combined mask avoids the chained X[c_7][r_4] indexing, which makes
# pandas re-align the second mask and emit a warning.
print(soil_type[c_7 & r_4].value_counts())

# Cover type distribution for a few individual soil types. Each result is
# printed explicitly; only the last bare expression of a cell is displayed.
for soil_id in (35, 23, 4, 21):
    print(X.loc[soil_type == soil_id, 'Cover_Type'].value_counts())

## Elevation analysis

Elevation is the most important feature (see feature importance section). Hence we would compare this with other features (bivariate analysis), and make some transformations creating new features that help our tree algos to make better splits.

In [None]:
sns.violinplot(x=TARGET, y='Elevation', data=X)

In [None]:
f, axes = plt.subplots(1, 1, figsize=(15, 15), sharex=True, sharey=True)
sns.scatterplot(x='Hillshade_9am', y='Elevation', 
                hue=TARGET, data=X, y_jitter=True)

In [None]:
f, axes = plt.subplots(1, 1, figsize=(15, 15), sharex=True, sharey=True)
sns.scatterplot(x='Hillshade_Noon', y='Elevation', 
                hue=TARGET, data=X, y_jitter=True)

In [None]:
f, axes = plt.subplots(1, 1, figsize=(15, 15), sharex=True, sharey=True)
sns.scatterplot(x='Hillshade_3pm', y='Elevation', 
                hue=TARGET, data=X, y_jitter=True)

In [None]:

f, axes = plt.subplots(1, 1, figsize=(15, 15), sharex=True, sharey=True)
sns.scatterplot(x='Euclidean_distance_to_hydro', y='Elevation', 
                hue=TARGET, data=X)

In [None]:
def elevation(df, factor=0.25):
    """Add ``ElevationHydro``: elevation reduced by a share of the hydro distance.

    The new column is ``Elevation - factor * Euclidean_distance_to_hydro``.
    ``df`` is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Elevation`` and ``Euclidean_distance_to_hydro``.
    factor : float, default 0.25
        Weight applied to the Euclidean distance. The default keeps the value
        that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new column.
    """
    df['ElevationHydro'] = df['Elevation'] - factor * df['Euclidean_distance_to_hydro']
    return df

X = elevation(X)
test = elevation(test)
f, axes = plt.subplots(1, 1, figsize=(15, 15), sharex=True, sharey=True)
sns.scatterplot(x='Euclidean_distance_to_hydro', y='ElevationHydro', 
                hue=TARGET, data=X)

In [None]:
f, axes = plt.subplots(1, 1, figsize=(15, 15), sharex=True, sharey=True)
sns.scatterplot(x='Vertical_Distance_To_Hydrology', y='Elevation', 
                hue=TARGET, data=X)

In [None]:
def elevationV(df):
    """Add ``ElevationV``: elevation minus the vertical distance to hydrology.

    ``df`` is modified in place and returned.
    NOTE(review): this is the same formula as the ``Vertical_dif`` column
    built in ``distances`` - confirm whether both features are wanted.
    """
    elevation_col = df['Elevation']
    vertical_col = df['Vertical_Distance_To_Hydrology']
    df['ElevationV'] = elevation_col - vertical_col
    return df

X = elevationV(X)
test = elevationV(test)
f, axes = plt.subplots(1, 1, figsize=(15, 15), sharex=True, sharey=True)
sns.scatterplot(x='Vertical_Distance_To_Hydrology', y='ElevationV', 
                hue=TARGET, data=X)

In [None]:
f, axes = plt.subplots(1, 1, figsize=(15, 15), sharex=True, sharey=True)
sns.scatterplot(x='Horizontal_Distance_To_Hydrology', y='Elevation', 
                hue=TARGET, data=X)

In [None]:
def elevationH(df, factor=0.19):
    """Add ``ElevationH``: elevation reduced by a share of the horizontal hydro distance.

    The new column is ``Elevation - factor * Horizontal_Distance_To_Hydrology``.
    ``df`` is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Elevation`` and ``Horizontal_Distance_To_Hydrology``.
    factor : float, default 0.19
        Weight applied to the horizontal distance. The default keeps the value
        that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new column.
    """
    df['ElevationH'] = df['Elevation'] - factor * df['Horizontal_Distance_To_Hydrology']
    return df

X = elevationH(X)
test = elevationH(test)
f, axes = plt.subplots(1, 1, figsize=(15, 15), sharex=True, sharey=True)
sns.scatterplot(x='Horizontal_Distance_To_Hydrology', y='ElevationH', 
                hue=TARGET, data=X)

In [None]:
f, axes = plt.subplots(1, 1, figsize=(15, 15), sharex=True, sharey=True)
sns.scatterplot(x='Horizontal_Distance_To_Roadways', y='Elevation', 
                hue=TARGET, data=X)

In [None]:
f, axes = plt.subplots(1, 1, figsize=(15, 15), sharex=True, sharey=True)
sns.scatterplot(x='Horizontal_Distance_To_Fire_Points', y='Elevation', 
                hue=TARGET, data=X)

In [None]:
f, axes = plt.subplots(1, 1, figsize=(15, 15), sharex=True, sharey=True)
sns.scatterplot(x='distance_road_fire', y='Elevation', 
                hue=TARGET, data=X)

In [None]:
def kernel_features(df):
    """Add non-linear transforms of ``Elevation`` to ``df`` and return it.

    ``Elevation2`` is the squared elevation and ``ElevationLog`` is
    ``log(1 + Elevation)``. The frame is modified in place.
    """
    height = df['Elevation']
    df['Elevation2'] = height.pow(2)
    df['ElevationLog'] = np.log1p(height)
    return df

X = kernel_features(X)
test = kernel_features(test)

In [None]:
f, axes = plt.subplots(1, 1, figsize=(15, 15), sharex=True, sharey=True)
sns.scatterplot(x='Aspect', y='Elevation', 
                hue=TARGET, data=X)

In [None]:
f, axes = plt.subplots(1, 1, figsize=(15, 15), sharex=True, sharey=True)
sns.scatterplot(x='Slope', y='Elevation', 
                hue=TARGET, data=X)

## Slope and Aspect analysis
Slope and Aspect are degrees features, from 0 to 360 degrees

First the slope, the min slope is 0 and the max slope is 52, is vertical inclination respect to the horizon. 

Now, Aspect, this feature is in degrees from 0 to 360, I guess refers to cardinal direction of the slope (this is a particular guess). 
I guess 0 refer to North direction for slope, and 180 to the south and so on.

Generally, the sun moves from east to west (from our point of view), hence on a mountain in the morning the east side is illuminated and the west is still in shade, at noon maybe both sides are illuminated, and by afternoon the west side receives more light than the east side.

North and South sides of the mountain, are sides that maximize the sunlight received, and have similar behavior.

In [None]:
X[['Slope', 'Aspect']].describe()

In [None]:

f, axes = plt.subplots(3, 1, figsize=(15, 15), sharex=True, sharey=True)
sns.scatterplot(y='Slope', x='Hillshade_9am', 
                hue='Aspect', data=X, ax=axes[0])
sns.scatterplot(y='Slope', x='Hillshade_Noon', 
                hue='Aspect', data=X, ax=axes[1])
sns.scatterplot(y='Slope', x='Hillshade_3pm', 
                hue='Aspect', data=X, ax=axes[2])


Here we observe the relationship between slope, aspect, shadow and time.

At 9 am, all the flat land (slope near zero) receives the same amount of sunlight (approximately 220),
At noon it goes up to 240 and at 3 pm it goes down to 155.

In addition, we observe how the shadow increases the variability in the same way that the slope increases.
At 9 am, the aspect between 0 and 150 receives more light, and at 3 pm the relationship has changed. 

In [None]:
def degree(df):
    """Add trigonometric encodings of ``Aspect`` (given in degrees) to ``df``.

    ``Aspect_cos`` and ``Aspect_sin`` are the cosine and sine of the aspect
    angle after conversion to radians; ``Aspectcos_Slope`` is ``Slope``
    multiplied by ``Aspect_cos``. The frame is modified in place and returned.
    """
    aspect_rad = np.radians(df['Aspect'])
    df['Aspect_cos'] = np.cos(aspect_rad)
    df['Aspect_sin'] = np.sin(aspect_rad)
    df['Aspectcos_Slope'] = df['Slope'] * df['Aspect_cos']

    return df

X = degree(X)
test = degree(test)



In [None]:

f, axes = plt.subplots(3, 1, figsize=(15, 15), sharex=True, sharey=True)
sns.scatterplot(y='Slope', x='Hillshade_9am', 
                hue='Aspect_sin', data=X, ax=axes[0])
sns.scatterplot(y='Slope', x='Hillshade_Noon', 
                hue='Aspect_sin', data=X, ax=axes[1])
sns.scatterplot(y='Slope', x='Hillshade_3pm', 
                hue='Aspect_sin', data=X, ax=axes[2])


Beautiful!!

This graphic is revealing.

At 9am, the sunlight is more intense on the slopes facing east, (darker points) and clearly, the larger the slope, the greater the difference in light between the east and west sides (light points).

  The north and south points have intermediate light values and their variability continues to rise in relation to the slope.

At noon, all points seem to intermingle, this is because the sun is more distributed on all sides of the mountain.

By 3pm the difference is marked, this time you can visualize beautifully in this graph as the east side has much less light than the west side.

## Aspect Binning
 

In [None]:
from bisect import bisect

# Sector boundaries in degrees: 45, 135, 225, 315.
cardinals = list(range(45, 361, 90))

# Sector labels; bisect() returns 0..4 and the modulo folds 4 back onto 'N',
# so aspects below 45 and from 315 upwards both map to 'N'.
points = ['N', 'E', 'S', 'W']


def cardinal(df):
    """Add a ``Cardinal`` column with the compass sector of each ``Aspect``.

    Each aspect (degrees) is placed into one of four 90-degree sectors
    labelled 'N', 'E', 'S' or 'W'. ``df`` is modified in place and returned.
    """
    def to_point(aspect):
        return points[bisect(cardinals, aspect) % 4]

    df['Cardinal'] = df['Aspect'].map(to_point)
    return df

X = cardinal(X)
test = cardinal(test)



In [None]:
f, axes = plt.subplots(5, 1, figsize=(15, 25), sharex=True, sharey=True)
sns.scatterplot(x='Slope', y='Hillshade_3pm', 
                hue=TARGET, data=X[X.Cardinal=='E'], ax=axes[0])
sns.scatterplot(x='Slope', y='Hillshade_3pm', 
                hue=TARGET, data=X[X.Cardinal=='W'], ax=axes[1])
sns.scatterplot(x='Slope', y='Hillshade_3pm', 
                hue=TARGET, data=X[X.Cardinal=='N'], ax=axes[2])
sns.scatterplot(x='Slope', y='Hillshade_3pm', 
                hue=TARGET, data=X[X.Cardinal=='S'], ax=axes[3])
sns.scatterplot(x='Slope', y='Hillshade_3pm', 
                hue=TARGET, data=X, ax=axes[4])

In [None]:
def cardinal_num(df):
    """Replace the ``Cardinal`` labels by numbers: N -> 0, E -> 1, S -> 0, W -> -1.

    The function is idempotent: values that are already one of the numeric
    codes are left unchanged, so re-running the cell on an already encoded
    frame no longer fails. ``df`` is modified in place and returned.

    Raises
    ------
    KeyError
        If a value is neither a known label nor one of the numeric codes.
    """
    codes = {'N': 0, 'E': 1, 'S': 0, 'W': -1}
    already_encoded = set(codes.values())

    def encode(value):
        if value in codes:
            return codes[value]
        if value in already_encoded:
            # Column was encoded by a previous call; keep the value as is.
            return value
        raise KeyError(value)

    df['Cardinal'] = df['Cardinal'].apply(encode)
    return df

X = cardinal_num(X)
test = cardinal_num(test)

## Wilderness Area

# One scatter panel per wilderness area (2x2 grid, shared axes), showing
# ElevationH against the horizontal distance to hydrology.
x1, y1 = 'Horizontal_Distance_To_Hydrology', 'ElevationH'
HUE = TARGET  # 'Euclidean_distance_to_hydro'
f, axes = plt.subplots(2, 2, figsize=(25, 25), sharex=True, sharey=True)

# axes.flat walks the grid row by row, i.e. areas 1 and 2 on the top row and
# areas 3 and 4 on the bottom row. x and y are passed by keyword because
# recent seaborn releases no longer accept them positionally.
for area, ax in enumerate(axes.flat, start=1):
    in_area = X['Wilderness_Area' + str(area)] == 1
    sns.scatterplot(x=x1, y=y1, hue=HUE, data=X[in_area], ax=ax)


In [None]:
# Collapse the one-hot Wilderness_Area1..4 columns into a temporary id column 'w'.
X['w'] = sum(X['Wilderness_Area' + str(area)] * area for area in range(1, 5))
test['w'] = sum(test['Wilderness_Area' + str(area)] * area for area in range(1, 5))

# Features to summarise per wilderness area, and the short prefixes used for
# the resulting columns (same order).
cols = [
    'ElevationH', 'Vertical_dif', 'Euclidean_distance_to_hydro',
    'Aspectcos_Slope', 'distance_dif_hydro_road', 'Hillshade_9am'
]
n = ['Elev', 'Vert_d', 'Eucli', 'AspSlo', 'dist_hr', 'hillshade']

# Summary statistics are computed on the training frame only and then applied
# to both frames.
stats = X.groupby('w')[cols].describe()

for prefix, col in zip(n, cols):
    name = prefix + '_mean'
    # NOTE: despite the '_mean' suffix, the stored value is the '50%' row of
    # describe(), i.e. the per-area median.
    d = {area: stats[col]['50%'][area] for area in range(1, 5)}
    X[name] = X['w'].apply(d.__getitem__)
    test[name] = test['w'].apply(d.__getitem__)

# The helper id column is not a feature; remove it again.
X.drop(columns='w', inplace=True)
test.drop(columns='w', inplace=True)


In [None]:
# Remove the label column from the feature matrix; a missing column is
# ignored, so the cell can be re-run safely.
X.drop(columns=[TARGET], errors='ignore', inplace=True)


### Make a model

In [None]:
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    AdaBoostClassifier,
)
from lightgbm import LGBMClassifier
from mlxtend.classifier import StackingCVClassifier
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

# Single seed shared by every estimator and by the CV splitter.
SEED = 2007

# Candidate classifiers, keyed by the name used in reports and look-ups.
models = {
    'LGBM': LGBMClassifier(n_estimators=370,
                           metric='multi_logloss',
                           num_leaves=100,
                           verbosity=0,
                           random_state=SEED,
                           n_jobs=-1),
    'Random Forest': RandomForestClassifier(n_estimators=500,
                                            n_jobs=-1,
                                            random_state=SEED),
    # warm_start is left at its default (False). With warm_start=True a second
    # fit() on the same instance with an unchanged n_estimators keeps the
    # previously grown trees instead of training on the new data.
    'Extra Tree': ExtraTreesClassifier(
           max_depth=400,
           n_estimators=450, n_jobs=-1,
           oob_score=False, random_state=SEED),
}


## Feature importances

In [None]:
clf = models['Random Forest']

def feature_importances(clf, X, y, figsize=(18, 6)):
    """Fit ``clf`` and show its feature importances as a bar chart.

    ``clf`` is fitted on ``X`` and ``y`` (the passed estimator itself is
    fitted), and its ``feature_importances_`` are paired with the column
    names of ``X``.

    Returns a DataFrame with columns ``Features`` and ``Importances`` sorted
    by decreasing importance; the original row labels are kept.
    """
    fitted = clf.fit(X, y)

    table = pd.DataFrame({
        'Features': X.columns,
        'Importances': fitted.feature_importances_,
    })
    table = table.sort_values(by='Importances', ascending=False)

    plt.figure(figsize=figsize)
    sns.barplot(x='Features', y='Importances', data=table)
    plt.xticks(rotation='vertical')
    plt.show()
    return table
    
importances = feature_importances(clf, X, y)    

In [None]:
def select(importances, edge):
    """Return the names of the features whose importance is at least ``edge``.

    ``importances`` is a DataFrame with ``Features`` and ``Importances``
    columns; the result is a NumPy array of feature names in the row order
    of ``importances``.
    """
    keep = importances['Importances'] >= edge
    return importances.loc[keep, 'Features'].values

col = select(importances, 0.0003)
X = X[col]
test = test[col]    

Let's validate our model(s)

In [None]:
# cross validation
from sklearn.model_selection import KFold, cross_val_score

# model selection functions

cv = KFold(n_splits=5, shuffle=True, random_state=SEED)

def cross_val(models, X=X, y=y):
    """Cross-validate every model and report its accuracy.

    ``models`` maps a display name to an estimator. Each estimator is scored
    with ``cross_val_score`` using the module-level ``cv`` splitter and
    accuracy as metric; the mean and standard deviation are printed.

    Returns a dict mapping each name to its array of per-fold scores.
    Note that the ``X`` and ``y`` defaults are bound when this function is
    defined, not when it is called.
    """
    scores_by_model = {}
    for label in models:
        fold_scores = cross_val_score(models[label], X, y,
                                      cv=cv,
                                      scoring='accuracy')
        scores_by_model[label] = fold_scores
        summary = 'Accuracy Mean {0:.4f}, Std {1:.4f}'.format(
            fold_scores.mean(), fold_scores.std())
        print(label, summary)
    return scores_by_model
    
def choose_best(results):
    """Return the name of the model with the highest mean score.

    Parameters
    ----------
    results : dict
        Maps a model name to an array-like of scores exposing ``.mean()``
        (for example the output of ``cross_val``).

    Returns
    -------
    The key of ``results`` with the largest mean score. On a tie, the first
    such key in the dict's iteration order wins.

    Raises
    ------
    ValueError
        If ``results`` is empty.
    """
    if not results:
        raise ValueError('results is empty: no model to choose from')

    # Compute each mean once, then take the arg-max in a single pass.
    mean_scores = {name: scores.mean() for name, scores in results.items()}
    return max(mean_scores, key=mean_scores.get)

In [None]:
results = cross_val(models)


In [None]:
best_model_name = choose_best(results)


model = models[best_model_name]

In [None]:
def predict(model, filename, X=X, y=y, test=test):
    """Fit ``model``, predict the test set and write a submission CSV.

    The model is fitted on ``X`` and ``y`` and used to predict ``test``.
    The predictions are written to ``filename + '.csv'`` with two columns,
    ``ID`` (the index of ``test``) and the target column, and no index column.

    Returns the array of predictions. The ``X``, ``y`` and ``test`` defaults
    are bound when this function is defined.

    NOTE(review): the id column is written as 'ID' - confirm this matches the
    header the submission format expects.
    """
    model.fit(X, y)
    predicts = model.predict(test)

    submission = pd.DataFrame({'ID': test.index, TARGET: predicts})
    submission.to_csv(filename + '.csv', index=False)
    return predicts


### Stacked model

In [None]:
# Every candidate model becomes a first-level classifier of the stack.
estimators = list(models.values())

# `model` (the best single model from cross validation) is used as the
# meta-classifier, with the same CV splitter and seed as the rest of the
# notebook.
# NOTE(review): use_probas / use_features_in_secondary are expected to feed
# class probabilities plus the original features to the meta-classifier -
# confirm against the installed mlxtend version.
stack = StackingCVClassifier(
    classifiers=estimators,
    meta_classifier=model,
    cv=cv,
    use_probas=True,
    use_features_in_secondary=True,
    verbose=1,
    random_state=SEED,
    n_jobs=-1,
)

# Fit the stack, predict the test set and write 'stacked.csv'.
predict_stack = predict(stack, 'stacked')
print('Ready!')