In [1]:
"""
Title: Wildfire Ignition Point Prediction

Team: 4-member school course project (Chang Hi Lee, Edward (Yu Sung) Kang, Frank Moon, Won Young Kang)

Function: A wildfire ignition point prediction model, built with GradientBoostingRegressor, that predicts the distance from each measured point to the nearest wildfire ignition point.

Preprocessing: One-hot encoding for categorical features and normalization of Gaussian features.

Metric: Submissions are evaluated on the Root-Mean-Square Error (RMSE) between the predicted and observed distances.

Description: Natural resource managers responsible for developing ecosystem management strategies require basic descriptive information 
including inventory data for forested lands to support their decision-making processes.
However, managers generally do not have this type of data for inholdings or neighboring lands that are outside their immediate jurisdiction. 
One method of obtaining this information is through the use of predictive models.
"""

'\nTitle: Wildfire Ignition Point Prediction\n\nTeam: 4-member school course project (Chang Hi Lee, Edward (Yu Sung) Kang, Frank Moon, Won Young Kang)\n\nFunction: A wildfire ignition point prediction model using the GradientBoostingRegressor method, that predicts the distance to the nearest ignited wildfire for each measured point.\n\nPreprocessing: One-hot encoding for categorical features and normalization of Gaussian features.\n\nMetric: Submissions are evaluated on Root-Mean-Square-Error(RMSE) between the predicted value and the observed distances.\n\nDescription: Natural resource managers responsible for developing ecosystem management strategies require basic descriptive information \nincluding inventory data for forested lands to support their decision-making processes.\nHowever, managers generally do not have this type of data for inholdings or neighboring lands that are outside their immediate jurisdiction. \nOne method of obtaining this information is through the use of pred

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import svm
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))
        
# Load the training and test tables from the competition's input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/wustl-cse517a-sp21-milestone1/train.csv',
        '../input/wustl-cse517a-sp21-milestone1/test.csv',
    )
)

# Regression target: the fire-point distance column of the training table.
train_y = train['Horizontal_Distance_To_Fire_Points']

# Preprocessing the data / Decoding Soil_Type
# The first digit of the Soil_Type code becomes Climatic_Zone and the second
# digit becomes Geologic. The digits are sliced with vectorized string
# operations on the column instead of a row-wise DataFrame.apply(axis=1),
# which is much slower and was repeated four times.
for frame in (train, test):
    soil_code = frame['Soil_Type'].astype(str)
    frame['Climatic_Zone'] = soil_code.str[0].astype(int)  # first digit
    frame['Geologic'] = soil_code.str[1].astype(int)  # second digit

# normalize data
# The scaler is fitted on the training set only, and the same mean/std are
# then applied to the test set. Fitting a second scaler on the test set would
# put the two sets on different scales, so the split thresholds the model
# learns on the training features would not line up with the test features.
cols_to_normalize = ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
                     'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am',
                     'Hillshade_Noon', 'Hillshade_3pm']
scaler = StandardScaler()
train[cols_to_normalize] = scaler.fit_transform(train[cols_to_normalize])
test[cols_to_normalize] = scaler.transform(test[cols_to_normalize])

# One-hot encoding Climatic_Zone and Geologic
# The training set defines the encoding: its first category is dropped to
# prevent multicollinearity, and the remaining dummy columns become the schema.
Climatic_Zone_train = pd.get_dummies(train.Climatic_Zone, drop_first=True, prefix='CZ')
Geologic_train = pd.get_dummies(train.Geologic, drop_first=True, prefix='GEO')

# The test set is encoded in full and then aligned to the training columns.
# Encoding it independently with drop_first=True could drop a different
# baseline category or produce a different column set whenever the categories
# present in test differ from those in train. Categories missing from test are
# filled with 0; categories unseen in training are discarded.
Climatic_Zone_test = pd.get_dummies(test.Climatic_Zone, prefix='CZ').reindex(
    columns=Climatic_Zone_train.columns, fill_value=0)
Geologic_test = pd.get_dummies(test.Geologic, prefix='GEO').reindex(
    columns=Geologic_train.columns, fill_value=0)

hot_train = pd.concat([train, Climatic_Zone_train, Geologic_train], axis=1)
hot_test = pd.concat([test, Climatic_Zone_test, Geologic_test], axis=1)

# Remove the raw columns that the dummy columns now replace.
hot_train = hot_train.drop(columns=['Soil_Type', 'Climatic_Zone', 'Geologic'])
hot_test = hot_test.drop(columns=['Soil_Type', 'Climatic_Zone', 'Geologic'])

# Assemble the feature matrix: every column except the target and the row identifier.
excluded_cols = ('Horizontal_Distance_To_Fire_Points', 'ID')
predictor_cols = [col for col in hot_train.columns if col not in excluded_cols]
train_X = hot_train[predictor_cols]

# Train the model using gradient boosting
# The loss is left at its default, which is squared error. The alias 'ls' used
# previously was deprecated in scikit-learn 1.0 and removed in 1.2, where
# passing it raises an error; relying on the default works on every version.
reg = GradientBoostingRegressor(random_state=0, n_estimators=300)
reg.fit(train_X, train_y)

# Predict on the test set, selecting the same feature columns used for training.
test_X = hot_test[predictor_cols]
predicted_distances = reg.predict(test_X)

# Write the submission file: one row per test ID with its predicted distance.
submission_columns = {
    'Id': test['ID'],
    'Horizontal_Distance_To_Fire_Points': predicted_distances,
}
my_submission = pd.DataFrame(submission_columns)
my_submission.to_csv('submission.csv', index=False)

# Calculate in-sample error
# This is the RMSE on the training data itself, so it is not an estimate of
# the error on unseen data. The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error is deprecated and has been
# removed in recent scikit-learn releases. Arguments follow the documented
# (y_true, y_pred) order.
RMSE2 = float(np.sqrt(mean_squared_error(train_y, reg.predict(train_X))))
print('RMSE ', RMSE2)

/kaggle/input/wustl-cse517a-sp21-milestone1/sample_submission.csv
/kaggle/input/wustl-cse517a-sp21-milestone1/train.csv
/kaggle/input/wustl-cse517a-sp21-milestone1/test.csv
RMSE  564.6284733509643
