# Forest Cover Type Prediction
### 2. Data Optimization and Modeling
![forest-cover-type-coverimage](https://kaggle2.blob.core.windows.net/competitions/kaggle/3936/logos/front_page.png)
<center>https://www.kaggle.com/c/forest-cover-type-prediction</center>


In [58]:
import random
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from mlxtend.classifier import EnsembleVoteClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.learning_curve import learning_curve
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle

import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns

In [25]:
from mlxtend.feature_selection import SequentialFeatureSelector
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import TruncatedSVD

In [26]:
# Load the feature matrices for the train and test splits.
# NOTE(review): presumably already scaled upstream, per the file names — confirm.
X_scaled_train = pd.read_csv('data/X_scaled_train.csv')
X_scaled_test = pd.read_csv('data/X_scaled_test.csv')

In [27]:
# Read only the target column from the raw training file; the result is a
# one-column DataFrame, not a Series.
y_train = pd.read_csv('data/train.csv', usecols=['Cover_Type'])

## Optimize data
**Memory Optimization**
* for faster processing like feature selection, model learning, etc.

**Structure Optimization**
* for better performance using `Tree-based model`

In [28]:
# Summarise dtypes and deep memory usage of the training features as loaded.
X_scaled_train.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Columns: 326 entries, Aspect to Radians_To_Hydrology_cube
dtypes: float64(326)
memory usage: 37.6 MB


In [29]:
# Summarise dtypes and deep memory usage of the test features as loaded.
X_scaled_test.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 565892 entries, 0 to 565891
Columns: 326 entries, Aspect to Radians_To_Hydrology_cube
dtypes: float64(326)
memory usage: 1.4 GB


In [48]:
def _downcast_column(series):
    """Return `series` converted to the smallest numeric dtype that fits it.

    A column with at most two distinct values whose values are all whole
    numbers is treated as a binary flag and stored as the smallest integer
    type. Every other column is downcast to the smallest float type.

    The whole-number check keeps the integer cast lossless: without it a
    two-valued column holding non-integer values would be silently truncated
    by ``astype(int)``, and a column containing NaN would raise.
    """
    few_values = series.nunique(dropna=False) <= 2
    # NaN != NaN, so a column with missing values never takes the int path.
    if few_values and (series == series.round()).all():
        return pd.to_numeric(series.astype(int), downcast='integer')
    return pd.to_numeric(series, downcast='float')


# Apply the same rule to both splits; the column list comes from the train
# frame, so the test frame is expected to contain the same columns.
for frame in (X_scaled_train, X_scaled_test):
    for col in X_scaled_train.columns:
        frame[col] = _downcast_column(frame[col])

In [49]:
# Re-check dtypes and deep memory usage of the training features after downcasting.
X_scaled_train.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Columns: 326 entries, Aspect to Radians_To_Hydrology_cube
dtypes: float32(229), int8(97)
memory usage: 14.6 MB


In [50]:
# Re-check dtypes and deep memory usage of the test features after downcasting.
X_scaled_test.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 565892 entries, 0 to 565891
Columns: 326 entries, Aspect to Radians_To_Hydrology_cube
dtypes: float32(229), int8(97)
memory usage: 546.7 MB


## `Soil_Type7` and `Soil_Type15`
* Train set has only one (`0`) class in the `Soil_Type7` and `Soil_Type15` columns
* Test set has two classes (`0`, `1`) in the `Soil_Type7` and `Soil_Type15` columns
* Because both columns are constant (`0`) in the train set, a model cannot learn anything from them, so we drop these two columns from both sets.

In [51]:
# List every test-set column together with its number of distinct values.
for col in X_scaled_test:
    distinct = X_scaled_test[col].unique()
    print(col, distinct.size)

Aspect 361
Aspect_log 361
Aspect_log_cdf_1 361
Aspect_log_cdf_2 361
Aspect_log_cdf_3 361
Aspect_log_cdf_4 361
Aspect_log_cdf_5 361
Aspect_log_cdf_6 361
Aspect_log_cdf_7 361
Aspect_log_pdf_1 361
Aspect_log_pdf_2 361
Aspect_log_pdf_3 361
Aspect_log_pdf_4 361
Aspect_log_pdf_5 361
Aspect_log_pdf_6 361
Aspect_log_pdf_7 361
Aspect_log_wild_cdf_11 361
Aspect_log_wild_cdf_21 361
Aspect_log_wild_cdf_31 361
Aspect_log_wild_cdf_41 361
Aspect_log_wild_cdf_51 361
Aspect_log_wild_cdf_61 361
Aspect_log_wild_cdf_71 361
Aspect_log_wild_pdf_11 361
Aspect_log_wild_pdf_21 361
Aspect_log_wild_pdf_31 361
Aspect_log_wild_pdf_41 361
Aspect_log_wild_pdf_51 361
Aspect_log_wild_pdf_61 361
Aspect_log_wild_pdf_71 361
Elevation 1974
Elevation_log 1974
Elevation_log_cdf_1 1426
Elevation_log_cdf_2 1732
Elevation_log_cdf_3 1713
Elevation_log_cdf_4 970
Elevation_log_cdf_5 967
Elevation_log_cdf_6 1710
Elevation_log_cdf_7 1013
Elevation_log_pdf_1 1488
Elevation_log_pdf_2 1801
Elevation_log_pdf_3 1855
Elevation_log_pdf_4 

In [42]:
# Distinct values of Soil_Type7 in the raw (unscaled) training file.
pd.read_csv('data/train.csv', usecols=['Soil_Type7', 'Soil_Type15'])['Soil_Type7'].unique()

array([0], dtype=int64)

In [52]:
# Distinct values of Soil_Type15 in the raw (unscaled) training file.
pd.read_csv('data/train.csv', usecols=['Soil_Type7', 'Soil_Type15'])['Soil_Type15'].unique()

array([0], dtype=int64)

In [55]:
# Show the distinct values of the two soil-type flags, train split first and
# then test split, to compare which classes each split contains.
for frame in (X_scaled_train, X_scaled_test):
    for soil_col in ('Soil_Type7', 'Soil_Type15'):
        print(frame[soil_col].unique())

[0]
[0]
[0 1]
[0 1]


In [56]:
# Remove the two soil-type flags from both splits so that train and test
# keep an identical set of feature columns.
dropped_soil_cols = ['Soil_Type7', 'Soil_Type15']
X_scaled_train, X_scaled_test = (
    X_scaled_train.drop(dropped_soil_cols, axis=1),
    X_scaled_test.drop(dropped_soil_cols, axis=1),
)

## Additional Feature Generation
**Concatenation with binary columns**
* We can also generate new combined columns by concatenating several binary columns, much like combining their conditions into one.

#### Example (combined binary column = `pclass_sex`)
pclass|sex|**pclass_sex**
---|---|---
3|male|**3male**
1|female|**1female**
3|female|**3female**
1|female|**1female**


In [None]:
# Collect the labels of all integer-typed columns in the test features.
# NOTE(review): this equals "binary columns" only if the earlier downcast
# step stored exactly the binary flags as integers — confirm.
binary_columns = X_scaled_test.select_dtypes(include=['integer']).columns


# Extract Feature Ranking using `sklearn.feature_selection.RFECV`
* LogisticRegression
* RandomForest

In [None]:
# Recursive feature elimination with 5-fold cross-validation, using a
# default-configured logistic regression as the ranking estimator.
lr = LogisticRegression()
selector_lr = RFECV(lr, cv=5)
# y_train is a one-column DataFrame, so flatten it to a 1-D label array.
selector_lr.fit(X_scaled_train, y_train.values.ravel())

In [64]:
# Same recursive feature elimination (5-fold CV), ranked by a random forest
# with a fixed seed for repeatable results.
rf = RandomForestClassifier(random_state=0)
selector_rf = RFECV(rf, cv=5)
# y_train is a one-column DataFrame, so flatten it to a 1-D label array.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt,
# so selector_rf may not be fitted when later cells run.
selector_rf.fit(X_scaled_train, y_train.values.ravel())

KeyboardInterrupt: 

In [None]:
# Names of the features kept by the random-forest RFECV run (`selector_rf`).
# `support_` is a boolean mask with one entry per feature, so it has to index
# the column labels; applying it to the DataFrame with `[]` would be treated
# as a row filter and fail on the length mismatch.
# NOTE(review): swap in `selector_lr` to inspect the logistic-regression
# selection instead.
optimized_columns = X_scaled_train.columns[selector_rf.support_]
print(optimized_columns)

### Using frequency encoding

example
```python
encoding = titanic.groupby('Embarked').size() / len(titanic)
titanic['Embarked_encoded'] = titanic['Embarked'].map(encoding)
```