# Feature Engineering & Prediction

## 0. Setup & Data Import

In [368]:
# import all the necessary libraries here
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [369]:
# load the train and test data
# !! use a relative path so that we don't need to change it each time we run on a different computer

# load the training data
train_path = '../data/train.csv'
train_data = pd.read_csv(train_path)

# load the testing data
test_path = '../data/test-full.csv'
test_data = pd.read_csv(test_path)

In [350]:
# quickly visualise the datasets
print(train_data.shape)
print(test_data.shape)


(15120, 56)
(581012, 55)


In [351]:
train_data.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,242642,2881,130,22,210,54,1020,250,221,88,...,0,0,0,0,0,0,0,0,0,1
1,309891,3005,351,14,242,-16,1371,194,215,159,...,0,0,0,0,0,0,0,0,0,1
2,287847,3226,63,14,618,2,1092,232,210,107,...,0,0,0,0,0,0,0,0,0,1
3,516307,3298,317,8,661,60,752,198,233,174,...,0,0,0,0,0,0,0,0,0,1
4,124860,3080,35,6,175,26,3705,219,227,144,...,0,0,0,0,0,0,0,0,0,1


In [320]:
test_data.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,0
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,0
2,3,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,0,0,0,0,0
3,4,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,0,0,0,0,0
4,5,2595,45,2,153,-1,391,220,234,150,...,0,0,0,0,0,0,0,0,0,0


## 1. Feature Engineering & Selection

implemented so far (!!keep adding if you do stuff)

**Feature engineering**
++


**Feature Selection**
++


### Interaction Features

In [321]:
def create_interaction_features(df):
    """
    Create interaction features by combining existing features.

    Args:
        df (pd.DataFrame): DataFrame to enhance with interaction features.

    Returns:
        pd.DataFrame: DataFrame with new interaction features.
    """
    # Example interaction: Elevation and Hydrology features
    df['Elevation_plus_Vertical_Hydrology'] = df['Elevation'] + df['Vertical_Distance_To_Hydrology']
    df['Elevation_times_Horizontal_Hydrology'] = df['Elevation'] * df['Horizontal_Distance_To_Hydrology']

    # You can add more interactions based on domain knowledge and exploratory data analysis insights

    return df


### Polynomial Features

In [322]:
# ++ this fct does not seem to work when applied to train_data -> look into this
from sklearn.preprocessing import PolynomialFeatures

def add_polynomial_features(df, feature_cols, degree=2):
    """
    Adds polynomial features to the DataFrame.

    Args:
        df (pd.DataFrame): DataFrame to enhance with polynomial features.
        feature_cols (list): List of column names to which polynomial features will be applied.
        degree (int): The degree of the polynomial features.

    Returns:
        pd.DataFrame: DataFrame with polynomial features added.
    """
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    poly_features = poly.fit_transform(df[feature_cols])
    poly_feature_names = poly.get_feature_names(feature_cols)
    df_poly = pd.DataFrame(poly_features, columns=poly_feature_names, index=df.index)
    
    # Drop the original features to avoid multicollinearity
    df.drop(feature_cols, axis=1, inplace=True)
    
    # Concatenate the original DataFrame with the new polynomial features
    df = pd.concat([df, df_poly], axis=1)
    
    return df


### Aggregate Features

In [323]:
def create_aggregate_features(df):
    """
    Create aggregate features that summarize information across multiple features.

    Args:
        df (pd.DataFrame): DataFrame to enhance with aggregate features.

    Returns:
        pd.DataFrame: DataFrame with new aggregate features.
    """
    # Example aggregate feature: Mean Hillshade
    df['Mean_Hillshade'] = df[['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']].mean(axis=1)

    # More aggregates can be added based on exploratory data analysis and domain knowledge
    # ++

    return df


### Scale numerical columns

In [324]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

def scale_numerical_columns(df):
    # ++ add a definition of what this does
    
    df_scaled = df.copy()
    
    # List of columns not to be normalized
    non_scaled_columns = ['Id', 'Cover_Type'] + [f'Soil_Type{i}' for i in range(1, 41)] + [f'Wilderness_Area{i}' for i in range(1, 5)]
    
    # List of columns to be normalized
    scaled_columns = [col for col in df_scaled.columns if col not in non_scaled_columns]
    
 
    scaler = StandardScaler()
    
    df_scaled[scaled_columns] = scaler.fit_transform(df_scaled[scaled_columns])
    
    return df_scaled

### Integrating the Feature Engineering Functions

In [None]:
# train data
train_data = create_interaction_features(train_data)
train_data = create_aggregate_features(train_data)
train_data = scale_numerical_columns(train_data)

# ++ this one does not work when applied to train_data (error)
# train_data = add_polynomial_features(train_data, ['Elevation', 'Aspect', 'Slope'], degree=2)

# test data
test_data = create_interaction_features(test_data)
test_data = create_aggregate_features(test_data)
test_data = scale_numerical_columns(test_data)

### Create euclidian distance features

In [370]:
# Calculate new features based on mean distances
# train
train_data['Mean_Elevation_Vertical_Distance_Hydrology'] = (train_data['Elevation'] + train_data['Vertical_Distance_To_Hydrology']) / 2
train_data['Mean_Distance_Hydrology_Firepoints'] = (train_data['Horizontal_Distance_To_Hydrology'] + train_data['Horizontal_Distance_To_Fire_Points']) / 2
train_data['Mean_Distance_Hydrology_Roadways'] = (train_data['Horizontal_Distance_To_Hydrology'] + train_data['Horizontal_Distance_To_Roadways']) / 2
train_data['Mean_Distance_Firepoints_Roadways'] = (train_data['Horizontal_Distance_To_Fire_Points'] + train_data['Horizontal_Distance_To_Roadways']) / 2

print(f"Shape of train after adding new features: {train_data.shape}")

# test
test_data['Mean_Elevation_Vertical_Distance_Hydrology'] = (test_data['Elevation'] + test_data['Vertical_Distance_To_Hydrology']) / 2
test_data['Mean_Distance_Hydrology_Firepoints'] = (test_data['Horizontal_Distance_To_Hydrology'] + test_data['Horizontal_Distance_To_Fire_Points']) / 2
test_data['Mean_Distance_Hydrology_Roadways'] = (test_data['Horizontal_Distance_To_Hydrology'] + test_data['Horizontal_Distance_To_Roadways']) / 2
test_data['Mean_Distance_Firepoints_Roadways'] = (test_data['Horizontal_Distance_To_Fire_Points'] + test_data['Horizontal_Distance_To_Roadways']) / 2

print(f"Shape of test after adding new features: {test_data.shape}")


Shape of train after adding new features: (15120, 60)
Shape of test after adding new features: (581012, 59)


### Remove low correlation coefficients

In [371]:
correlation_matrix = train_data.corr()  # Compute the correlation matrix

# Calculate the correlation of each feature with 'Cover_Type'
feature_correlation = correlation_matrix['Cover_Type'].drop(['Cover_Type', 'Id'])  # Exclude self-correlation and Id

# Decide on a threshold for low correlation (example: below 0.02 in absolute value)
low_correlation_features = feature_correlation[abs(feature_correlation) < 0.05].index.tolist()
print(f"Removed features with low correlation to 'Cover_Type': {low_correlation_features}")


# Drop these low correlation features from your dataset
# train
train = train_data.drop(low_correlation_features, axis=1)

print(f"Shape of train before removal: {train_data.shape}")
print(f"Shape of train after removal: {train.shape}")

# test
test = test_data.drop(low_correlation_features, axis=1)

print(f"Shape of test before removal: {test_data.shape}")
print(f"Shape of test after removal: {test.shape}")

Removed features with low correlation to 'Cover_Type': ['Elevation', 'Aspect', 'Horizontal_Distance_To_Hydrology', 'Hillshade_9am', 'Hillshade_3pm', 'Wilderness_Area2', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type11', 'Soil_Type13', 'Soil_Type14', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28', 'Soil_Type30', 'Soil_Type34', 'Soil_Type36', 'Mean_Elevation_Vertical_Distance_Hydrology']
Shape of train before removal: (15120, 60)
Shape of train after removal: (15120, 28)
Shape of test before removal: (581012, 59)
Shape of test after removal: (581012, 27)


### Merging Soil Types

In [361]:
# Extract soil type columns
soil_columns = [col for col in train.columns if col.startswith('Soil_Type')]

# Combine the soil type columns into a single feature by identifying the active soil type
# train
train['Soil_Type_Combined'] = train[soil_columns].idxmax(axis=1).str.extract('(\d+)').astype(int)
train = train.drop(soil_columns, axis=1)

# test
test['Soil_Type_Combined'] = test[soil_columns].idxmax(axis=1).str.extract('(\d+)').astype(int)
test = test.drop(soil_columns, axis=1)


# 2. Prediction

In [362]:
# check what the train and test shapes looks like with feature engineering implemented
# train should have one more column (the Cover_Type) compared to test
print(train.shape) 
print(test.shape)


(15120, 14)
(581012, 13)


### Train the model(s)

Split the datasets for validation

In [372]:
# Prepare the data for the prediction
X = train.drop(['Id', 'Cover_Type'], axis=1)
y = train['Cover_Type']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


Random Forest Classifier

In [373]:
# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on the validation set
y_pred = rf_model.predict(X_val)

# Evaluate the model
accuracy_rf = accuracy_score(y_val, y_pred)
print(f'Random Forest Accuracy: {accuracy_rf}')


Random Forest Accuracy: 0.826058201058201


Gradient Boosting Classifier

In [374]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Initialize the Gradient Boosting Classifier
gb_model = GradientBoostingClassifier(random_state=42)

# Train the model
gb_model.fit(X_train, y_train)

# Predict on the validation set
y_pred_gb = gb_model.predict(X_val)

# Evaluate the model
accuracy_gb = accuracy_score(y_val, y_pred_gb)
print(f'Gradient Boosting Classifier Accuracy: {accuracy_gb}')


Gradient Boosting Classifier Accuracy: 0.7040343915343915


XGBoost

In [332]:
# ++

SVM

In [333]:
# ++

KNN

In [334]:
# ++

### Create the CSV file

Your submission should be a CSV file with 581012 rows and a header. 

In [375]:
# Prepare the test data by removing the 'Id' column
X_test = test.drop(['Id'], axis=1)

# Predict on the test data
# !! change the model name according to what you are using
test_predictions = gb_model.predict(X_test)

# Create a DataFrame with 'Id' and 'Cover_Type' columns
prediction_df = pd.DataFrame({
    'Id': test['Id'],
    'Cover_Type': test_predictions
})

# Save the DataFrame to a CSV file
prediction_filename = 'test_predictions.csv'
prediction_df.to_csv(prediction_filename, index=False)

In [336]:
# run this for debugging: it should output (581012, 2). Otherwise the format is wrong.
prediction_df.shape

(581012, 2)

## Scorekeeping

To keep track of our Kaggle scores, every time you submit a new csv to kaggle, add the score + a short description of what we changed/added

Use Format: Score obtained on kaggle / Feature Engineering Steps Used / Model Used
!! keep in order of biggest to smallest scores

**Random Forest**
- 0.68 / Euclidian Dist, Low Corr(0.05), Merge Soil Types / Random Forest
- 0.50 / Interaction Fct, Aggregate Fct, Euclidian Dist, Low Corr(0.05), Merge Soil Types / Random Forest


**Gradient Boosting Classifier**
- 0.65 / Euclidian Dist, Low Corr(0.02) / GBC
- 0.56 / Euclidian Dist, Low Corr(0.05), Merge Soil Types / GBC
- 0.55 / Euclidian Dist, Low Corr(0.05) / GBC





# 3. Hyperparameter Tuning

In [337]:
# ++ to do for the following models
# Random Forest, XGBoost, Logistic Regression ...

# 4. Archives

Drag code here that might be useful later but that currently does not work/ cannot be used

In [338]:
## ++ gives very bad accuracy scores (around 0.3) for all the algos, so sth is wrong

# test model


from ast import literal_eval
from catboost import CatBoostClassifier
#from lightgbm import LGBMClassifier
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score



def evaluate_models(X, y):
    models = {
        "LogReg": LogisticRegression(),
        "KNN": KNeighborsClassifier(),
        "SVM": SVC(),
        "DT": DecisionTreeClassifier(),
        "RF": RandomForestClassifier(),
        "ExtraTrees": ExtraTreesClassifier(),
        "XGB": XGBClassifier(),
        "Catboost": CatBoostClassifier(verbose=0),
        #"LightGBM": LGBMClassifier(),
    }
    
    results = {}
    
    for name, model in models.items():
        scores = cross_val_score(model, X, y, cv=5, scoring='accuracy') 
        results[name] = scores
        print(f"{name}: Accuracy = {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")
    
    return results