In [1]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline


# Single seed reused by the train/validation split, the per-class sampler
# (np.random.RandomState) and the random forest further down.
RANDOM_STATE = 42

In [2]:
# Labelled training table; the `Id` column is discarded right after loading.
df = pd.read_csv('train.csv').drop(columns=['Id'])

In [3]:
# Keep the raw test table (with `Id`) in `df_test_`; `df_test` is the same
# table without the `Id` column.
df_test_ = pd.read_csv('test.csv')
df_test = df_test_.drop(columns=['Id'])

In [4]:
# Split the training table into the label column and everything else.
target = df["Cover_Type"]
features = df.drop(columns="Cover_Type")

In [5]:
def ein_df(df):
    """Return an engineered copy of ``df``; the input frame is not modified.

    Transformations, in order:

    * ``Aspect`` is shifted by 180 (``+180``, or ``-180`` when the sum would
      exceed 360), then ``Aspect`` and ``Slope`` are scaled by ``pi/180``
      (presumably degrees -> radians; confirm the source units).
    * Zeros in ``Hillshade_3pm`` are treated as missing and filled by a
      ``KNNImputer`` fitted on ``features['Hillshade_3pm']``.
    * ``Soil_Type1..40`` are collapsed into a numeric code ``i / 10``
      (types 9 and 10 contribute nothing), ``Wilderness_Area1..4`` likewise.
      The codes are only used for the interaction features below.
    * ``Neg_VDTH`` flags rows with negative ``Vertical_Distance_To_Hydrology``.
    * ``Mult_Ele_Soil = Elevation * soil_code`` and
      ``Mult_Ele_Wild = Elevation * wilderness_code * 2``.
    * All ``Soil_Type*`` / ``Wilderness_Area*`` indicator columns are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns named above.

    Returns
    -------
    pandas.DataFrame
        New frame with the remaining original columns followed by
        ``Neg_VDTH``, ``Mult_Ele_Soil`` and ``Mult_Ele_Wild``.

    Raises
    ------
    KeyError
        If any expected column is absent from ``df``.

    Notes
    -----
    Relies on the module-level ``features`` frame to fit the imputer, so
    ``features`` must be defined before this function is called.
    """
    df_cic = df.copy()

    # Vectorised form of the per-row rule: add 180 unless that would exceed
    # 360, otherwise subtract 180 (an input of exactly 180 therefore maps to 360).
    aspect = df_cic['Aspect']
    df_cic['Aspect'] = np.where(aspect + 180 > 360, aspect - 180, aspect + 180)

    # Scale both angle columns by pi/180.
    df_cic['Slope'] *= (np.pi/180)
    df_cic['Aspect'] *= (np.pi/180)

    # Zeros are declared as the missing marker; the imputer is always fitted
    # on the global training `features`, then applied to this frame.
    # NOTE(review): the imputer only ever sees this single column, so there are
    # no other features to measure neighbour distance with -- confirm this
    # imputes what is intended.
    hillshade_imputer = KNNImputer(n_neighbors=2, weights='distance', missing_values=0.)
    hillshade_imputer.fit(features['Hillshade_3pm'].values.reshape(-1, 1))
    df_cic['Hillshade_3pm'] = hillshade_imputer.transform(
        df_cic['Hillshade_3pm'].values.reshape(-1, 1)
    )

    soil_ids = range(1, 41)
    wild_ids = range(1, 5)
    ignored_soil = range(9, 11)  # soil types 9 and 10 are left out of the code

    # Collapse the indicator columns into one numeric code each (i / 10 for the
    # active indicator). Summation order matches a left-to-right accumulation.
    soil_code = sum(
        df_cic[f'Soil_Type{i}'] * i / 10 for i in soil_ids if i not in ignored_soil
    )
    wild_code = sum(df_cic[f'Wilderness_Area{i}'] * i / 10 for i in wild_ids)

    df_cic['Neg_VDTH'] = (df_cic['Vertical_Distance_To_Hydrology'] < 0).astype(int)
    df_cic['Mult_Ele_Soil'] = df_cic['Elevation'] * soil_code
    df_cic['Mult_Ele_Wild'] = df_cic['Elevation'] * wild_code * 2

    # Drop every indicator column in one call instead of one in-place drop each.
    indicator_cols = [f'Soil_Type{i}' for i in soil_ids]
    indicator_cols += [f'Wilderness_Area{i}' for i in wild_ids]
    return df_cic.drop(columns=indicator_cols)

# Engineered version of the training features.
f_cic = ein_df(features)

# Hold out a 0.0001 fraction of the rows. f_val / y_val are not referenced
# again in the cells shown here, so nearly all rows end up in f_train / y_train.
f_train, f_val, y_train, y_val = train_test_split(
    f_cic, target, test_size=0.0001, random_state=RANDOM_STATE
)

# Seeded generator shared by every split_rnd call, so draws depend on call order.
rnd = np.random.RandomState(seed=RANDOM_STATE)
def split_rnd(label, n):
    """Draw a random sample, without replacement, of training rows with ``label``.

    The sample size is ``int(len(y_train) / 7 * n)``, i.e. ``n`` times one
    seventh of the training set. Reads the module-level ``f_train``,
    ``y_train`` and ``rnd``.

    Returns the selected feature rows and their labels as a
    ``(DataFrame, Series)`` pair sharing the same index.
    """
    candidates = y_train.loc[y_train == label].index
    sample_size = int(y_train.shape[0]/7*n)
    idx = rnd.choice(candidates, sample_size, replace=False)
    picked_features = f_train.loc[idx]
    picked_labels = y_train.loc[idx]
    return picked_features, picked_labels

# spl_sizes[i] is the size multiplier passed to split_rnd for label i + 1.
spl_sizes = [0.97, 0.99, 0.51, 0.2, 0.23, 0.5, 0.2]

# Gather the per-class samples and concatenate once. DataFrame.append and
# Series.append were removed in pandas 2.0, and appending inside a loop
# re-copies the accumulated data on every iteration.
f_parts, y_parts = [], []
for i in range(7):
    f_trn, y_trn = split_rnd(i + 1, spl_sizes[i])
    f_parts.append(f_trn)
    y_parts.append(y_trn)

f_spl = pd.concat(f_parts)
y_spl = pd.concat(y_parts)
    
# The resampled set was assembled label by label; running it through
# train_test_split shuffles it. The 0.0001 "test" fraction is thrown away.
f_spl, _, y_spl, _ = train_test_split(
    f_spl, y_spl, test_size=0.0001, random_state=RANDOM_STATE
)

# Random forest grown on the full sample (bootstrap disabled), with class
# weights balanced and the shared seed.
clf = RandomForestClassifier(
    n_estimators=300,
    criterion='entropy',
    max_depth=None,
    bootstrap=False,
    class_weight='balanced',
    n_jobs=-1,
    random_state=RANDOM_STATE,
)

# Pipeline: drop features with variance <= 0.01 -> degree-2 polynomial expansion
# -> PCA to 29 components -> random forest.
# PCA is seeded as well: its 'auto' solver may choose the randomized SVD, which
# would otherwise make the fitted pipeline differ from run to run.
mod = make_pipeline(
    VarianceThreshold(0.01),
    PolynomialFeatures(2),
    PCA(29, random_state=RANDOM_STATE),
    clf,
)

In [6]:
# Apply the same feature engineering to the test table (without `Id`).
f_test = ein_df(df_test)

In [7]:
# Fit the whole pipeline on the resampled, shuffled training data.
mod.fit(f_spl, y_spl)

Pipeline(steps=[('variancethreshold', VarianceThreshold(threshold=0.01)),
                ('polynomialfeatures', PolynomialFeatures()),
                ('pca', PCA(n_components=29)),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=False,
                                        class_weight='balanced',
                                        criterion='entropy', n_estimators=300,
                                        n_jobs=-1, random_state=42))])

In [8]:
# Predict the test set and label each prediction with the `Id` kept in the
# raw test table `df_test_`.
pred = mod.predict(f_test)
pred_ = pd.Series(pred, index=df_test_['Id'], name='Cover_Type')