In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Load only the first 50,000 rows of the training CSV.
data = pd.read_csv('data/nyc_train.csv', nrows=50000)

# Keep only trips whose pickup and dropoff both lie strictly inside the
# 40.7-40.8 latitude / -74 to -73.9 longitude box and whose fare is
# strictly positive (zero fares are dropped as well as negative ones).
row_filters = [
    'pickup_latitude > 40.7',
    'pickup_latitude < 40.8',
    'dropoff_latitude > 40.7',
    'dropoff_latitude < 40.8',
    'pickup_longitude > -74',
    'pickup_longitude < -73.9',
    'dropoff_longitude > -74',
    'dropoff_longitude < -73.9',
    'fare_amount > 0',
]
data = data.query(' and '.join(row_filters))

# Target: the fare column of the filtered frame.
y = data.fare_amount

# Predictors for the baseline model: raw coordinates plus passenger count.
base_features = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
]
X = data[base_features]

# Seeded split (default train/validation proportions) so reruns match.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Baseline: a 50-tree random forest fitted on the training portion.
# fit() returns the estimator itself, so first_model is the fitted forest.
baseline_forest = RandomForestRegressor(n_estimators=50, random_state=1)
first_model = baseline_forest.fit(train_X, train_y)


In [2]:
# Summary statistics for each training feature column.
train_X.describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,23466.0,23466.0,23466.0,23466.0,23466.0
mean,-73.976827,40.756931,-73.975359,40.757434,1.66232
std,0.014625,0.018206,0.01593,0.018659,1.290729
min,-73.999999,40.700013,-73.999999,40.70002,0.0
25%,-73.987964,40.744901,-73.987143,40.745756,1.0
50%,-73.979629,40.758076,-73.978588,40.758542,1.0
75%,-73.967797,40.769602,-73.966459,40.770406,2.0
max,-73.900062,40.799952,-73.900062,40.799999,6.0


In [3]:
# Summary statistics for the training target (fare_amount).
train_y.describe()

count    23466.000000
mean         8.472539
std          4.609747
min          0.010000
25%          5.500000
50%          7.500000
75%         10.100000
max        165.000000
Name: fare_amount, dtype: float64

In [4]:
# Column dtypes, non-null counts and memory usage of the training features.
train_X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23466 entries, 37925 to 47506
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   pickup_longitude   23466 non-null  float64
 1   pickup_latitude    23466 non-null  float64
 2   dropoff_longitude  23466 non-null  float64
 3   dropoff_latitude   23466 non-null  float64
 4   passenger_count    23466 non-null  int64  
dtypes: float64(4), int64(1)
memory usage: 1.1 MB


In [5]:
from sklearn.inspection import permutation_importance

# Permutation importance of the baseline model on the validation split:
# each feature is shuffled 30 times with a fixed seed.
perm_importance = permutation_importance(
    estimator=first_model,
    X=val_X,
    y=val_y,
    n_repeats=30,
    random_state=1,
)

In [6]:
import pandas as pd

# Pair every validation feature with its mean permutation importance,
# then rank the features from most to least important.
importance_df = pd.DataFrame(
    {'feature': val_X.columns, 'importance': perm_importance.importances_mean}
)
importance_df = importance_df.sort_values(by='importance', ascending=False)

print(importance_df)

             feature  importance
1    pickup_latitude    0.850480
3   dropoff_latitude    0.847474
0   pickup_longitude    0.611157
2  dropoff_longitude    0.528452
4    passenger_count   -0.000532


In [7]:
# Engineer two extra features: the absolute change in longitude and in
# latitude between pickup and dropoff.
data = data.assign(
    abs_lon_change=(data.dropoff_longitude - data.pickup_longitude).abs(),
    abs_lat_change=(data.dropoff_latitude - data.pickup_latitude).abs(),
)

# Second feature set: the four raw coordinates plus the two engineered
# columns (passenger_count is not included).
features_2 = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'abs_lat_change',
    'abs_lon_change',
]
X = data[features_2]

# Same seed as the first split; y is the target defined in the first cell.
new_train_X, new_val_X, new_train_y, new_val_y = train_test_split(X, y, random_state=1)

# Second model: a 30-tree random forest on the extended feature set.
# fit() returns the estimator itself, so second_model is the fitted forest.
extended_forest = RandomForestRegressor(n_estimators=30, random_state=1)
second_model = extended_forest.fit(new_train_X, new_train_y)


In [8]:
# Permutation importance of the second model on its own validation split
# (30 shuffles per feature, fixed seed).
# NOTE: this rebinds `perm_importance` and `importance_df` from the earlier
# cells, so those names now describe second_model.
perm_importance = permutation_importance(second_model, new_val_X, new_val_y, n_repeats=30, random_state=1)

# pandas is already imported as `pd` in the notebook's first cell, so the
# redundant mid-cell re-import has been dropped.
# Rank the features by mean permutation importance, highest first.
importance_df = pd.DataFrame({
    'feature': new_val_X.columns,
    'importance': perm_importance.importances_mean
}).sort_values(by='importance', ascending=False)

print(importance_df)


             feature  importance
4     abs_lat_change    0.577830
5     abs_lon_change    0.452734
1    pickup_latitude    0.087284
0   pickup_longitude    0.074984
3   dropoff_latitude    0.071412
2  dropoff_longitude    0.068985
