# Cab Fare Prediction

#### Problem Statement
You are a cab rental start-up company. You have successfully run the pilot project and
now want to launch your cab service across the country. You have collected the
historical data from your pilot project and now have a requirement to apply analytics for
fare prediction. You need to design a system that predicts the fare amount for a cab ride
in the city.

#### Number of attributes:
- pickup_datetime - timestamp value indicating when the cab ride started.
- pickup_longitude - float for longitude coordinate of where the cab ride started.
- pickup_latitude - float for latitude coordinate of where the cab ride started.
- dropoff_longitude - float for longitude coordinate of where the cab ride ended.
- dropoff_latitude - float for latitude coordinate of where the cab ride ended.
- passenger_count - an integer indicating the number of passengers in the cab
  ride.
  
#### Solution
Predicting the fare amount is a regression problem, so we build regression models after performing the required preprocessing steps on the given dataset.

In [1]:
#Load libraries
import os
import pandas as pd
import pandas_profiling
import numpy as np
from fancyimpute import KNN   
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
import seaborn as sns
from random import randrange, uniform

Using TensorFlow backend.


In [3]:
# Set the working directory.
# The path is written as a raw string so the backslashes are taken literally:
# "\G" and "\P" are not valid escape sequences and make a plain string literal
# emit a DeprecationWarning / SyntaxWarning on recent Python versions.
os.chdir(r"D:\Google Cloud Storage\Python Work Bench")

# Show up to 30 columns when a pandas DataFrame is displayed
pd.set_option('display.max_columns', 30)

# Print numpy arrays in full instead of summarising them with "..."
np.set_printoptions(threshold = np.inf)

In [156]:
# Load the raw training data; the path is relative to the current working directory
df = pd.read_csv('train_cab.csv')
# df.profile_report()
# Preview the first 10 rows
df.head(10)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1.0
1,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1.0
2,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2.0
3,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1.0
4,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1.0
5,12.1,2011-01-06 09:50:45 UTC,-74.000964,40.73163,-73.972892,40.758233,1.0
6,7.5,2012-11-20 20:35:00 UTC,-73.980002,40.751662,-73.973802,40.764842,1.0
7,16.5,2012-01-04 17:22:00 UTC,-73.9513,40.774138,-73.990095,40.751048,1.0
8,,2012-12-03 13:10:00 UTC,-74.006462,40.726713,-73.993078,40.731628,1.0
9,8.9,2009-09-02 01:11:00 UTC,-73.980658,40.733873,-73.99154,40.758138,2.0


In [157]:
# print(df.dtypes)
# print(df.describe())
# print(df.describe(include = ['O']))
# print(df.isnull().sum())

# # method 1: Displaying observations where latitude and longitude equal 0
# # f1 = df['pickup_latitude'] == 0
# # f2 = df['pickup_longitude'] == 0
# # f3 = df['dropoff_longitude'] == 0
# # f4 = df['dropoff_latitude'] == 0
# # df.where(f1 | f2 | f3 | f4)

# # method 2: Displaying observations where latitude and longitude equal 0
# f1 = df['pickup_latitude'] == 0
# f2 = df['pickup_longitude'] == 0
# f3 = df['dropoff_longitude'] == 0
# f4 = df['dropoff_latitude'] == 0
# df1 = df.loc[f1 | f2 | f3 | f4]
# print(df.shape)
# print(df1.shape)
# print((326/16067) * 100)

In [158]:
df.dtypes

fare_amount           object
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count      float64
dtype: object

# Exploratory Data Analysis

In [159]:
# Removing the trailing 'UTC' marker from the timestamp strings.
# A suffix-anchored pattern is used rather than str.rstrip(' UTC') because
# rstrip treats its argument as a set of characters to strip, not as a suffix.
# The vectorised .str accessor also passes missing values through instead of
# raising AttributeError the way a per-element lambda would.
df['pickup_datetime'] = df['pickup_datetime'].str.replace(r'\s*UTC$', '', regex = True)

# Converting data type of 'pickup_datetime' from object to date time;
# values that cannot be parsed become NaT (errors = 'coerce')
df['date_time'] = pd.to_datetime(df['pickup_datetime'], utc = True, errors = 'coerce')

# Dropping the 'pickup_datetime' column as new column 'date_time' has been added
df.drop('pickup_datetime', axis = 1, inplace = True)

# Converting data type of 'fare_amount' to numeric; unparseable entries become NaN
df['fare_amount'] = pd.to_numeric(df['fare_amount'], errors = 'coerce')

# Calculating percentage of missing values in the dependent variable
print((df['fare_amount'].isnull().sum() / len(df) * 100))

# Dropping the rows with a missing dependent variable since their share is negligible
df.dropna(subset = ['fare_amount'], inplace = True)

# Dropping the observations where the fare amount is less than or equal to zero
# as such fares do not make sense
df.drop(df[df['fare_amount'] <= 0].index, inplace = True)

# Re-number the rows 0..n-1 after the removals
df.reset_index(inplace = True, drop = True)

df.head(10)

0.15559843156780978


Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,date_time
0,4.5,-73.844311,40.721319,-73.84161,40.712278,1.0,2009-06-15 17:26:21+00:00
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1.0,2010-01-05 16:52:16+00:00
2,5.7,-73.982738,40.76127,-73.991242,40.750562,2.0,2011-08-18 00:35:00+00:00
3,7.7,-73.98713,40.733143,-73.991567,40.758092,1.0,2012-04-21 04:30:42+00:00
4,5.3,-73.968095,40.768008,-73.956655,40.783762,1.0,2010-03-09 07:51:00+00:00
5,12.1,-74.000964,40.73163,-73.972892,40.758233,1.0,2011-01-06 09:50:45+00:00
6,7.5,-73.980002,40.751662,-73.973802,40.764842,1.0,2012-11-20 20:35:00+00:00
7,16.5,-73.9513,40.774138,-73.990095,40.751048,1.0,2012-01-04 17:22:00+00:00
8,8.9,-73.980658,40.733873,-73.99154,40.758138,2.0,2009-09-02 01:11:00+00:00
9,5.3,-73.996335,40.737142,-73.980721,40.733559,1.0,2012-04-08 07:30:50+00:00


In [160]:
df.dtypes

fare_amount                      float64
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                  float64
date_time            datetime64[ns, UTC]
dtype: object

# Feature Engineering

In [161]:
# Extracting day of week and hour of day features from date time.
# Series.dt.day_name() is used because the .dt.weekday_name attribute was
# deprecated in pandas 0.23 and removed in pandas 1.0; both give English day
# names such as "Monday".
df['weekday'] = df.date_time.dt.day_name()
df['hour_of_day'] = df.date_time.dt.hour

# Changing hour_of_day to object type (it is listed with the categorical columns later on)
df['hour_of_day'] = df['hour_of_day'].astype('O')

# Dropping the date_time feature now that its parts have been extracted
df.drop('date_time', axis = 1, inplace = True)

df.dtypes

fare_amount          float64
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count      float64
weekday               object
hour_of_day           object
dtype: object

In [162]:
# Derive distance feature from latitude & longitude pairs

# vectorized haversine function
def haversine(lat1, lon1, lat2, lon2, to_radians=True, earth_radius=6371):
    """
    Great-circle distance between two sets of points (haversine formula).

    Adapted from http://stackoverflow.com/a/29546836/2901002

    Parameters
    ----------
    lat1, lon1 : scalar or array-like
        Latitude and longitude of the first point(s).
    lat2, lon2 : scalar or array-like
        Latitude and longitude of the second point(s).
    to_radians : bool, default True
        When True the coordinates are taken as decimal degrees and converted
        to radians first; when False they are used as given.
    earth_radius : number, default 6371
        Radius of the sphere. The result is expressed in the same unit
        (6371 is the mean Earth radius in kilometres).

    Returns
    -------
    Distance(s) along the sphere, one per coordinate pair.

    All coordinate arguments must be numeric and of equal length.
    """
    if to_radians:
        lat1, lon1, lat2, lon2 = np.radians([lat1, lon1, lat2, lon2])

    # Half-differences of latitude and longitude
    half_dlat = (lat2 - lat1) / 2.0
    half_dlon = (lon2 - lon1) / 2.0

    # Haversine of the central angle
    lat_term = np.sin(half_dlat) ** 2
    lon_term = np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon) ** 2
    hav = lat_term + lon_term

    return earth_radius * 2 * np.arcsin(np.sqrt(hav))

# Haversine distance between pick-up and drop-off for every ride
coord_cols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
df['dist'] = haversine(*[df[name] for name in coord_cols])

# The raw coordinates are no longer needed once the distance has been derived
df.drop(coord_cols, axis = 1, inplace = True)

# Put the predictors first and the target (fare_amount) last
df = df[['dist', 'passenger_count', 'weekday', 'hour_of_day', 'fare_amount']]
df.dtypes

dist               float64
passenger_count    float64
weekday             object
hour_of_day         object
fare_amount        float64
dtype: object

In [164]:
# A passenger count has to be a whole number, so round it
df['passenger_count'] = df['passenger_count'].round()

# Rides with more than 6 passengers are treated as bad data and removed
too_many = df['passenger_count'] > 6
df.drop(df.index[too_many], inplace = True)

# Re-number the rows after the removal
df.reset_index(drop = True, inplace = True)

df.dtypes

dist               float64
passenger_count    float64
weekday             object
hour_of_day         object
fare_amount        float64
dtype: object

In [190]:
df['passenger_count'].unique()

array([ 1.,  2.,  3., nan,  6.,  5.,  4.])

In [166]:
# Remove rides where both the distance and the passenger count are zero:
# with both values unknown, imputing them would give inaccurate results
no_passengers = df['passenger_count'] < 1
no_distance = df['dist'] == 0
df.drop(df.index[no_passengers & no_distance], inplace = True)
df.reset_index(drop = True, inplace = True)
df.dtypes

dist               float64
passenger_count    float64
weekday             object
hour_of_day         object
fare_amount        float64
dtype: object

In [33]:
# Converting dtype of passenger count to object
# df['passenger_count'] = df['passenger_count'].astype('O')

In [189]:
# Replacing all remaining zeroes in dist and passenger_count with NaN; they are imputed later.
# The result is assigned back to the column instead of calling
# replace(..., inplace = True) on df[col]: an in-place call on a column
# selection is chained assignment, which may modify a temporary copy and leave
# df unchanged (always the case under pandas copy-on-write).
df['dist'] = df['dist'].replace(0, np.nan)
df['passenger_count'] = df['passenger_count'].replace(0, np.nan)

# Missing values per column after the replacement
df.isnull().sum()

dist               454
passenger_count    112
weekday              1
hour_of_day          1
fare_amount          0
dtype: int64

In [191]:
df.dtypes

dist               float64
passenger_count    float64
weekday             object
hour_of_day         object
fare_amount        float64
dtype: object

In [15]:
# print(df.dtypes)
# print(df.describe())
# print(df.describe(include = ['O']))
# df['weekday'].value_counts()
# print(df.isnull().sum())

# Visualization

In [16]:
# # Visualise distribution of data in features

# x = df[df.fare_amount<100]
# for c in x.iloc[:,0:5]:
#     df.hist(c)

# Univariate Analysis - Outliers Detection

In [95]:
# # Make copy of data
# df_cpy = df.copy()
# df = df_cpy.copy()

In [96]:
# # Plot boxplot to visualise outliers
# %matplotlib inline
# plt.boxplot(df['dist'])

In [192]:
# Defining numerical and object columns
cols_num = ['fare_amount', 'dist']
cols_obj = ['weekday', 'hour_of_day', 'passenger_count']
df.dtypes

dist               float64
passenger_count    float64
weekday             object
hour_of_day         object
fare_amount        float64
dtype: object

In [38]:
# df_cpy = df.copy()
# df = df_cpy.copy()
# df.head(10)

Unnamed: 0,dist,passenger_count,weekday,hour_of_day,fare_amount
0,1.030764,1.0,Monday,17,4.5
1,8.450134,1.0,Tuesday,16,16.9
2,1.389525,2.0,Thursday,0,5.7
3,2.79927,1.0,Saturday,4,7.7
4,1.999157,1.0,Tuesday,7,5.3
5,3.787239,1.0,Thursday,9,12.1
6,1.555807,1.0,Tuesday,20,7.5
7,4.155444,1.0,Wednesday,17,16.5
8,2.849627,2.0,Wednesday,1,8.9
9,1.374577,1.0,Sunday,7,5.3


In [402]:
df.dtypes

dist               float64
passenger_count    float64
weekday             object
hour_of_day         object
fare_amount        float64
dtype: object

In [193]:
# Flag IQR outliers in every numeric column and replace them with NaN
for col in cols_num:
    # First and third quartile, ignoring missing values
    q1, q3 = np.nanpercentile(df[col], [25, 75])

    # Inter-quartile range and the fences 1.5 * IQR outside the quartiles
    spread = q3 - q1
    lower_fence = q1 - (spread * 1.5)
    upper_fence = q3 + (spread * 1.5)

    # Anything outside the fences is marked as missing
    outside = (df[col] < lower_fence) | (df[col] > upper_fence)
    df.loc[outside, col] = np.nan

df.head()

Unnamed: 0,dist,passenger_count,weekday,hour_of_day,fare_amount
0,1.030764,1.0,Monday,17,4.5
1,,1.0,Tuesday,16,16.9
2,1.389525,2.0,Thursday,0,5.7
3,2.79927,1.0,Saturday,4,7.7
4,1.999157,1.0,Tuesday,7,5.3


In [194]:
# Dropping all observations where fare amount is nan after outlier detection
# (the target is removed rather than imputed; feature NaNs are kept for imputation)
df.dropna(subset = ['fare_amount'], inplace = True)
# Re-number the rows 0..n-1 after the removal
df.reset_index(inplace = True, drop = True)

# Missing values remaining per column
df.isnull().sum()

dist               680
passenger_count    106
weekday              1
hour_of_day          1
fare_amount          0
dtype: int64

# Dummify

In [195]:
# df_cpy1 = df.copy()
# df = df_cpy1.copy()
# df.head(10)

Unnamed: 0,dist,passenger_count,weekday,hour_of_day,fare_amount
0,1.030764,1.0,Monday,17,4.5
1,,1.0,Tuesday,16,16.9
2,1.389525,2.0,Thursday,0,5.7
3,2.79927,1.0,Saturday,4,7.7
4,1.999157,1.0,Tuesday,7,5.3
5,3.787239,1.0,Thursday,9,12.1
6,1.555807,1.0,Tuesday,20,7.5
7,4.155444,1.0,Wednesday,17,16.5
8,2.849627,2.0,Wednesday,1,8.9
9,1.374577,1.0,Sunday,7,5.3


In [196]:
# Get dummy variables for categorical variables
# drop_first = True omits the first level, leaving six indicator columns.
# NOTE(review): a row whose weekday is NaN gets zeros in every indicator, which
# is indistinguishable from the omitted baseline level — confirm this is acceptable.
df_dummy = pd.get_dummies(df['weekday'], drop_first = True)
df_dummy.head()

Unnamed: 0,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,1,0,0,0,0,0
1,0,0,0,0,1,0
2,0,0,0,1,0,0
3,0,1,0,0,0,0
4,0,0,0,0,1,0


In [197]:
# Concatenate the two data sets
# Column-wise (axis=1), aligned on the row index, with the indicator columns placed first
df = pd.concat([df_dummy, df], axis=1)
df.head()

Unnamed: 0,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,dist,passenger_count,weekday,hour_of_day,fare_amount
0,1,0,0,0,0,0,1.030764,1.0,Monday,17,4.5
1,0,0,0,0,1,0,,1.0,Tuesday,16,16.9
2,0,0,0,1,0,0,1.389525,2.0,Thursday,0,5.7
3,0,1,0,0,0,0,2.79927,1.0,Saturday,4,7.7
4,0,0,0,0,1,0,1.999157,1.0,Tuesday,7,5.3


In [198]:
# Drop the original weekday column (now represented by the indicator columns)
# and the hour_of_day column.
# NOTE(review): hour_of_day is dropped without being encoded, so it never
# reaches the models — confirm this is intentional.
df.drop('weekday', axis = 1, inplace = True)
df.drop('hour_of_day', axis = 1, inplace = True)
df.head()

Unnamed: 0,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,dist,passenger_count,fare_amount
0,1,0,0,0,0,0,1.030764,1.0,4.5
1,0,0,0,0,1,0,,1.0,16.9
2,0,0,0,1,0,0,1.389525,2.0,5.7
3,0,1,0,0,0,0,2.79927,1.0,7.7
4,0,0,0,0,1,0,1.999157,1.0,5.3


# Feature Scaling

In [199]:
# df_cpy1 = df.copy()
# df = df_cpy1.copy()
# df.head(10)

Unnamed: 0,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,dist,passenger_count,fare_amount
0,1,0,0,0,0,0,1.030764,1.0,4.5
1,0,0,0,0,1,0,,1.0,16.9
2,0,0,0,1,0,0,1.389525,2.0,5.7
3,0,1,0,0,0,0,2.79927,1.0,7.7
4,0,0,0,0,1,0,1.999157,1.0,5.3
5,0,0,0,1,0,0,3.787239,1.0,12.1
6,0,0,0,0,1,0,1.555807,1.0,7.5
7,0,0,0,0,0,1,4.155444,1.0,16.5
8,0,0,0,0,0,1,2.849627,2.0,8.9
9,0,0,1,0,0,0,1.374577,1.0,5.3


In [200]:
df.dtypes

Monday               uint8
Saturday             uint8
Sunday               uint8
Thursday             uint8
Tuesday              uint8
Wednesday            uint8
dist               float64
passenger_count    float64
fare_amount        float64
dtype: object

In [286]:
# #Normality check
# %matplotlib inline
# #plt.hist(market['custAge'], bins = 'auto')
# # plt.hist(market['campaign'], bins = 'auto')

In [201]:
# Normalization
# Min-max scale every column to [0, 1] ahead of the KNN imputation step.
# NaNs stay NaN in the scaled output.
# The scaled frame is built directly from the transformed array rather than by
# assigning floats into a copy of df through .iloc[:, :], which relies on
# implicit upcasting of the uint8 indicator columns.
# NOTE(review): the target (fare_amount) is part of the scaled matrix and so
# takes part in the neighbour search of the imputation — confirm this is intended.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled = scaler.fit_transform(df)
X = pd.DataFrame(scaled, columns = df.columns, index = df.index)
# unscaled = scaler.inverse_transform(scaled)

print(X.isnull().sum())
X.head()

Monday               0
Saturday             0
Sunday               0
Thursday             0
Tuesday              0
Wednesday            0
dist               680
passenger_count    106
fare_amount          0
dtype: int64


Unnamed: 0,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,dist,passenger_count,fare_amount
0,1.0,0.0,0.0,0.0,0.0,0.0,0.129637,0.0,0.203259
1,0.0,0.0,0.0,0.0,1.0,0.0,,0.0,0.764599
2,0.0,0.0,0.0,1.0,0.0,0.0,0.174762,0.2,0.257583
3,0.0,1.0,0.0,0.0,0.0,0.0,0.352082,0.0,0.348121
4,0.0,0.0,0.0,0.0,1.0,0.0,0.251442,0.0,0.239475


In [202]:
X.dtypes

Monday             float64
Saturday           float64
Sunday             float64
Thursday           float64
Tuesday            float64
Wednesday          float64
dist               float64
passenger_count    float64
fare_amount        float64
dtype: object

In [181]:
# #KNN imputation
# #Assigning levels to the categories
# lis = []
# for i in range(X.shape[1]):
#     #print(i)
#     if(X.iloc[:,i].dtypes == 'object'):
#         X.iloc[:,i] = pd.Categorical(X.iloc[:,i])
# #         print('1',X.isnull().sum())
#         #print(market.iloc[[1]])--check its use
#         X.iloc[:,i] = X.iloc[:,i].cat.codes
#         print('2',X.isnull().sum())
#         X.iloc[:,i] = X.iloc[:,i].astype('object')
# #         print('3',X.isnull().sum())
       
#         lis.append(X.columns[i])
        
# print(X.isnull().sum())
# X.head()

In [203]:
#Apply KNN imputation algorithm
# Fill the remaining NaNs with fancyimpute's KNN (k = 3) and wrap the returned
# array in a DataFrame carrying X's column names; the trailing ';' suppresses
# the cell's output value.
X = pd.DataFrame(KNN(k = 3).fit_transform(X), columns = X.columns);

Imputing row 1/14621 with 0 missing, elapsed time: 27.227
Imputing row 101/14621 with 0 missing, elapsed time: 27.228
Imputing row 201/14621 with 0 missing, elapsed time: 27.229
Imputing row 301/14621 with 0 missing, elapsed time: 27.230
Imputing row 401/14621 with 0 missing, elapsed time: 27.230
Imputing row 501/14621 with 1 missing, elapsed time: 27.231
Imputing row 601/14621 with 0 missing, elapsed time: 27.232
Imputing row 701/14621 with 0 missing, elapsed time: 27.233
Imputing row 801/14621 with 0 missing, elapsed time: 27.234
Imputing row 901/14621 with 0 missing, elapsed time: 27.235
Imputing row 1001/14621 with 0 missing, elapsed time: 27.235
Imputing row 1101/14621 with 0 missing, elapsed time: 27.236
Imputing row 1201/14621 with 1 missing, elapsed time: 27.236
Imputing row 1301/14621 with 0 missing, elapsed time: 27.237
Imputing row 1401/14621 with 0 missing, elapsed time: 27.238
Imputing row 1501/14621 with 1 missing, elapsed time: 27.239
Imputing row 1601/14621 with 0 missi

In [204]:
X.isnull().sum()

Monday             0
Saturday           0
Sunday             0
Thursday           0
Tuesday            0
Wednesday          0
dist               0
passenger_count    0
fare_amount        0
dtype: int64

In [205]:
df.isnull().sum()

Monday               0
Saturday             0
Sunday               0
Thursday             0
Tuesday              0
Wednesday            0
dist               680
passenger_count    106
fare_amount          0
dtype: int64

In [206]:
# Unscaling the dataset
# Map the imputed, min-max-scaled values back to their original units with the
# scaler fitted before imputation, then write them into X in place.
scaled = X.iloc[:,:]
unscaled = scaler.inverse_transform(scaled)
X.iloc[:,:] = unscaled

In [54]:
# # To check if X equals df
# sum = 0
# for i in range(len(df)):
#     if (X.loc[i,'fare_amount'] != df.loc[i,'fare_amount']):
#         print(i)
#         print(X.loc[i,'fare_amount'])
#         print(df.loc[i,'fare_amount'])
# print('exited')

In [207]:
# Copy every column except the last one (fare_amount) from the imputed frame X
# back into df; the target column is left as it was.
# NOTE(review): presumably KNN-imputed passenger_count values can be fractional
# and are not rounded here — TODO confirm.
df.iloc[:,:-1] = X.iloc[:,:-1].copy()
# Not the last expression of the cell, so this null count is not displayed
df.isnull().sum()

#Save result
# Persist the cleaned data without writing the row index
df.to_csv('CabFare_noMiss.csv', index = False)

# Sampling

In [23]:
df = pd.read_csv('CabFare_noMiss.csv')

In [7]:
# df[df['passenger_count'] == 0]
# df.dtypes

In [24]:
# Split the train and test data
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import MinMaxScaler

# Select the features and the target by column name instead of by position, so
# both selections stay consistent if the column layout changes.
X = df.drop(columns = 'fare_amount').values
Y = df['fare_amount'].values

# Hold out 20% of the rows; random_state makes the split reproducible
X_train, X_test, Y_train, Y_test = tts(X, Y, test_size = 0.2, random_state = 0)

# Scale the features.
# The scaler is fitted on the training rows only and then applied to the test
# rows, so no information from the held-out data leaks into the preprocessing.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Keep a scaled version of the full feature matrix for inspection
X = scaler.transform(X)

In [25]:
pd.DataFrame(X).describe()

Unnamed: 0,0,1,2,3,4,5,6,7
count,14621.0,14621.0,14621.0,14621.0,14621.0,14621.0,14621.0,14621.0
mean,0.130976,0.153888,0.132686,0.144313,0.142945,0.146638,0.308461,0.129135
std,0.337386,0.360854,0.339247,0.351419,0.350029,0.353757,0.206351,0.252776
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.152605,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.250922,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.412615,0.2
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of a linear regression on the training split, using
# every feature column of X_train. With scoring='neg_mean_squared_error' the
# scores are negated MSE values, which is why they print as negative numbers.
lm = LinearRegression()
scores = cross_val_score(lm, X_train, Y_train, cv=10, scoring='neg_mean_squared_error')
print(scores)

[-4.93019213 -5.80829571 -5.16201658 -5.76050892 -4.92554502 -5.19538718
 -4.8784798  -5.07406115 -6.04362322 -5.03696471]


In [None]:
# # create a Python list of three feature names
# feature_cols = ['TV', 'Radio', 'Newspaper']

# # use the list to select a subset of the DataFrame (X)
# X = data[feature_cols]

# # select the Sales column as the response (y)
# y = data.Sales

In [27]:
# fix the sign of MSE scores
mse_scores = -scores
print(mse_scores)

[4.93019213 5.80829571 5.16201658 5.76050892 4.92554502 5.19538718
 4.8784798  5.07406115 6.04362322 5.03696471]


In [28]:
# convert from MSE to RMSE
rmse_scores = np.sqrt(mse_scores)
print(rmse_scores)

[2.2204036  2.4100406  2.27200717 2.40010602 2.21935689 2.2793392
 2.2087281  2.25256768 2.45837817 2.24431832]


In [29]:
# calculate the average RMSE
print(rmse_scores.mean())

2.2965245746339695


In [31]:
from sklearn.metrics import mean_squared_error

# Fit the linear model on the whole training split and predict the held-out rows
lm.fit(X_train, Y_train)
Y_pred = lm.predict(X_test)

# Test-set error: RMSE is the square root of the mean squared error
mse = mean_squared_error(Y_test, Y_pred)
rmse = np.sqrt(mse)
rmse

2.349224596148728

In [None]:
# # 10-fold cross-validation with two features (excluding Newspaper)
# feature_cols = ['TV', 'Radio']
# X = data[feature_cols]
# print(np.sqrt(-cross_val_score(lm, X, y, cv=10, scoring='neg_mean_squared_error')).mean())

In [32]:
# Random forest
# 10-fold cross-validation of a 100-tree random forest on the training split;
# random_state fixes the seed so the scores are reproducible. The scores are
# negated MSE values.
from sklearn.ensemble import RandomForestRegressor
RF = RandomForestRegressor(n_estimators = 100, random_state = 0)
scores = cross_val_score(RF, X_train, Y_train, cv = 10, scoring = 'neg_mean_squared_error')
print(scores)

[-6.02851785 -6.82801786 -6.12441946 -7.06041804 -6.3829852  -6.40703049
 -6.16950498 -6.22594124 -7.48030395 -6.16554458]


In [34]:
# fix the sign of MSE scores
mse_scores = -scores
print(mse_scores)

# convert from MSE to RMSE
rmse_scores = np.sqrt(mse_scores)
print(rmse_scores)

# calculate the average RMSE
print(rmse_scores.mean())

[6.02851785 6.82801786 6.12441946 7.06041804 6.3829852  6.40703049
 6.16950498 6.22594124 7.48030395 6.16554458]
[2.45530402 2.61304762 2.47475644 2.65714472 2.52645705 2.53121127
 2.48384882 2.49518361 2.73501443 2.48305147]
2.5455019447873957


In [35]:
from sklearn.metrics import mean_squared_error

# Fit the random forest on the whole training split and predict the held-out rows
RF.fit(X_train, Y_train)
Y_pred = RF.predict(X_test)

# Test-set error: RMSE is the square root of the mean squared error
mse = mean_squared_error(Y_test, Y_pred)
rmse = np.sqrt(mse)
rmse

2.5413840073193015

In [36]:
# Boosting
# 10-fold cross-validation of an XGBoost regressor created with its default
# hyper-parameters; the scores are negated MSE values.
import xgboost as xgb
xg_reg = xgb.XGBRegressor()
scores = cross_val_score(xg_reg, X_train, Y_train, cv = 10, scoring = 'neg_mean_squared_error')
print(scores)

[-4.85134998 -5.63242057 -5.06145249 -5.71886136 -5.04381639 -5.26250932
 -4.81349648 -5.11038739 -6.00524756 -5.03180985]


In [37]:
# fix the sign of MSE scores
mse_scores = -scores
print(mse_scores)

# convert from MSE to RMSE
rmse_scores = np.sqrt(mse_scores)
print(rmse_scores)

# calculate the average RMSE
print(rmse_scores.mean())

[4.85134998 5.63242057 5.06145249 5.71886136 5.04381639 5.26250932
 4.81349648 5.11038739 6.00524756 5.03180985]
[2.20257803 2.37327212 2.24976721 2.39141409 2.24584425 2.29401598
 2.1939682  2.26061659 2.45056066 2.2431696 ]
2.290520674357049


In [38]:
from sklearn.metrics import mean_squared_error

# Fit the boosted model on the whole training split and predict the held-out rows
xg_reg.fit(X_train, Y_train)
Y_pred = xg_reg.predict(X_test)

# Test-set error: RMSE is the square root of the mean squared error
mse = mean_squared_error(Y_test, Y_pred)
rmse = np.sqrt(mse)
rmse



2.3541871486767088