In [72]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Modules

In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore')

from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score


#supervised learning models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.svm import LinearSVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


# Reading data

In [74]:
# Load the 311 DSNY CSV from the Kaggle input directory.
DATA_PATH = '/kaggle/input/dsny-20152017/311-DSNY-20151017.csv'
df = pd.read_csv(DATA_PATH)

In [75]:
# Inspect column dtypes and non-null counts of the freshly loaded frame.
df.info()

# Data Cleaning

Keep only the records whose status is Closed or Pending and whose closed date is not null, since these are the records that have a closed date recorded.

In [76]:
# Keep requests whose status is Closed or Pending, then discard any that
# still lack a closed date.
has_final_status = df['Status'].isin(['Closed', 'Pending'])
df = df[has_final_status].dropna(subset=['Closed Date'])


In [77]:
# Row/column count after the status and closed-date filtering above.
df.shape

 Convert the data types to correct ones

In [78]:
# Parse both timestamp columns; unparseable entries become NaT (errors='coerce').
for date_col in ('Created Date', 'Closed Date'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')


This copy of the dataframe is used to create data visualizations on the whole dataset, before any further preprocessing is applied.

In [79]:
# Visualization copy of the data, enriched with calendar parts of both
# timestamps and the resolution time in (fractional) days.
created_ts = df['Created Date']
closed_ts = df['Closed Date']
df2 = df.copy().assign(**{
    'Created Month': created_ts.dt.month,
    'Created Year': created_ts.dt.year,
    'Created Day': created_ts.dt.day,
    'Closed Month': closed_ts.dt.month,
    'Closed Day': closed_ts.dt.day,
    'Closed Year': closed_ts.dt.year,
    'Days': (closed_ts - created_ts) / pd.Timedelta(days=1),
})

## Handling missing values

In [80]:
# Number of missing values per column.
df.isnull().sum()

Remove columns with all missing values

In [81]:
# Columns identified as entirely missing; they carry no information.
empty_columns = [
    'Intersection Street 1', 'Intersection Street 2', 'Due Date', 'Landmark',
    'Vehicle Type', 'Taxi Company Borough', 'Taxi Pick Up Location',
    'Bridge Highway Name', 'Bridge Highway Direction', 'Road Ramp',
    'Bridge Highway Segment',
]
df = df.drop(columns=empty_columns)

Removing columns which are not considered in the analysis due to the following reasons:
1. only few unique values
2. most of the values are distinct
3. duplicate information

In [82]:
# Columns excluded from the analysis (few unique values, almost all values
# distinct, or information duplicated by another column).
unused_columns = [
    'Incident Address', 'Street Name', 'Cross Street 1', 'Cross Street 2',
    'Resolution Description', 'Incident Zip', 'X Coordinate (State Plane)',
    'Y Coordinate (State Plane)', 'Latitude', 'Longitude', 'Park Facility Name',
    'Agency', 'Descriptor', 'Facility Type', 'Resolution Action Updated Date',
    'Location', 'City', 'Unique Key', 'Park Borough', 'Agency Name',
    'Community Board', 'BBL', 'Status',
]
df = df.drop(columns=unused_columns)

Dropping the rows with null values in these features, as such records make up less than 10% of the total data.

In [83]:
# Every remaining record must have these three categorical fields populated.
required_columns = ['Borough', 'Address Type', 'Location Type']
df = df.dropna(subset=required_columns)


In [84]:
# Re-check dtypes and non-null counts after the column drops and null removal.
df.info()

## Convert Categorical variables to numerical using dummy columns

In [85]:
# Frequency of each complaint type, most common first.
df['Complaint Type'].value_counts()

This shows that Sanitation Condition, Dirty Conditions and Missed Collection (All Materials) are the most common complaint types in New York.

In [86]:
# Horizontal bar chart of the ten most frequent complaint types.
top_complaint_counts = df['Complaint Type'].value_counts().head(10)
top_complaint_counts.plot(kind='barh', figsize=(18, 10))
plt.show()

This graph shows that sidewalk and street are the most common location types for the complaints.

In [87]:
# Horizontal bar chart of how often each location type occurs.
location_type_counts = df['Location Type'].value_counts()
location_type_counts.plot(kind='barh', figsize=(18, 10))
plt.show()

Open Data Channel Type data has high number of 'phone' records and very few records have OTHER type

In [88]:
# Frequency of each intake channel.
df['Open Data Channel Type'].value_counts()

Address Type feature has 'Address' as the most frequent value

In [89]:
# Frequency of each address type.
df['Address Type'].value_counts()

Most of the complaints come from the Brooklyn borough, with comparatively fewer complaints from Manhattan.

In [90]:
# Discard records whose borough is the placeholder 'Unspecified', then show
# the per-borough complaint counts.
df = df[df['Borough'].ne('Unspecified')]
df['Borough'].value_counts()

Finally, creating dummy columns for these variables - Location Type, Address Type, Borough, Open Data Channel Type, Complaint Type

In [91]:
# One-hot encode the categorical predictors.
categorical_columns = [
    'Location Type', 'Address Type', 'Borough',
    'Open Data Channel Type', 'Complaint Type',
]
df = pd.get_dummies(df, columns=categorical_columns)

In [92]:
# List the columns produced by the one-hot encoding.
df.info()

To avoid multicollinearity among the new dummy features, one dummy variable is dropped for each category.

In [93]:
# Drop one dummy per encoded category so the remaining dummies are not
# perfectly collinear.
reference_levels = [
    'Open Data Channel Type_UNKNOWN',
    'Address Type_INTERSECTION',
    'Location Type_Other',
    'Borough_STATEN ISLAND',
    'Complaint Type_Foam Ban Enforcement',
]
df = df.drop(columns=reference_levels)

The variances of all the features are of similar magnitude.

In [94]:
# Variance of each numeric feature. The frame still contains the
# 'Created Date' / 'Closed Date' datetime columns, which do not support
# variance; restrict the reduction to numeric columns explicitly instead of
# relying on pandas silently dropping them (deprecated, and an error in
# newer pandas releases).
df.var(numeric_only=True)

Removing the duplicate records

In [95]:
# Remove rows that are exact duplicates across all remaining columns.
df = df.drop_duplicates()

In [96]:
# Row count and dtypes after de-duplication.
df.info()

# Feature Engineering

Creating new features like month, year columns for the created data, closed date. This will be used for the modeling and data visualization

In [97]:
# Derive calendar features from the two timestamps and preview the result.
df = df.assign(**{
    'Created Month': df['Created Date'].dt.month,
    'Created Year': df['Created Date'].dt.year,
    'Closed Year': df['Closed Date'].dt.year,
})
df.head()

Descriptive statistics of the created and closed date

In [98]:
# Summary statistics (min / quartiles / max / mean) of the two timestamps.
# `datetime_is_numeric` only exists in pandas 1.x; pandas 2.x removed the
# keyword and always treats datetimes numerically, so fall back to a plain
# describe() when the keyword is rejected.
date_columns = df[['Created Date', 'Closed Date']]
try:
    date_summary = date_columns.describe(datetime_is_numeric=True)
except TypeError:
    date_summary = date_columns.describe()
date_summary

No outliers in the created year column

In [99]:
# Box plot of the created year to look for outlying values.
sns.boxplot(y=df['Created Year'])

A few outliers are present, with closed-year values greater than 2019.

In [100]:
# Box plot of the closed year to look for outlying values.
sns.boxplot(y=df['Closed Year'])

Removed outliers in closed date feature using inter quartile range

In [101]:
# Tukey fences on the closed timestamp: anything further than 1.5 * IQR
# outside the quartiles is treated as an outlier.
closed_dates = df['Closed Date']
Q1 = closed_dates.quantile(0.25)  # 25th percentile (lower quartile)
Q3 = closed_dates.quantile(0.75)  # 75th percentile (upper quartile)
IQR = Q3 - Q1
lower_range, upper_range = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
print(lower_range, upper_range)

# Keep only rows whose closed date lies inside the fences (bounds inclusive).
df = df[closed_dates.between(lower_range, upper_range)]

Now, the closed year has no outliers

In [102]:
# Re-draw the closed-year box plot after the IQR filtering above.
sns.boxplot(y=df['Closed Year'])

In [103]:
# Summary statistics of the frame after outlier removal.
df.describe()

These are the final features used for the data modeling

In [104]:
# Columns and dtypes at the end of feature engineering.
df.info()

# Exploratory Data Analysis

In [105]:
# Columns available in the visualization copy (df2).
df2.info()

## 311 calls in New york

As this geographic scatter plot shows, complaints come from almost everywhere in New York.

In [106]:
# Scatter every request by coordinates, coloured by borough.
sns.set(rc={'figure.figsize': (25, 15)})
calls_ax = sns.scatterplot(x='Longitude', y='Latitude', hue='Borough', data=df2)
calls_ax.set_title('New york 311 calls')
calls_ax.set_xlabel('Longitude')
calls_ax.set_ylabel('Latitude')
plt.show()

## Distribution of 311 calls in various cities

Brooklyn, Staten Island, New York and Bronx reported the highest numbers of complaints.

In [175]:
# Ten cities with the most complaints, as a horizontal bar chart.
top_city_counts = df2['City'].value_counts()[:10]
top_city_counts.plot(kind='barh', figsize=(18, 10), title='Distribution of calls in cities')
# With kind='barh' the counts run along the x-axis and the city names along
# the y-axis, so the labels must be assigned accordingly (they were swapped).
plt.xlabel('Number of complaints')
plt.ylabel('city')
plt.show()

## Correlation Matrix

This shows that BBL is strongly negatively correlated with latitude,
and that created month and closed month are positively correlated.

In [108]:
# Pairwise correlations between the numeric location and date columns.
corr_columns = [
    'Incident Zip', 'BBL', 'Created Year', 'Created Month', 'Days',
    'Latitude', 'Longitude', 'Closed Year', 'Closed Month',
]
sns.heatmap(df2[corr_columns].corr())

## Spread of complaints in various community boards

Staten island, queens and brooklyn related community boards have high complaints

In [167]:
# Complaint volume per community board.
board_counts = df2['Community Board'].value_counts()
board_counts.plot(
    kind='bar',
    figsize=(18, 15),
    title='Distribution of calls in various community boards',
    xlabel='Community Board',
    ylabel='Total Number of complaints',
)
plt.show()

## Number of complaints received in each year per address type

Complaints are most numerous for the 'Address' address type, and they increase over the years.



In [172]:
# Complaint counts per address type, split by the year the request was created.
addr_fig, addr_ax = plt.subplots(figsize=(20, 10))
sns.countplot(data=df2, x='Address Type', hue='Created Year', ax=addr_ax)
addr_ax.set_title('Number of complaints per address type in each year')
addr_ax.set_ylabel('Number of complaints')
plt.show()

## Complaints closed over the years per complaint type

The largest number of complaints was closed in 2017, and far fewer in 2018 and 2019. In each year, Sanitation Condition, Missed Collection, Dirty Conditions and Derelict Vehicles complaints are resolved in high numbers.

Request Large Bulky item collection is resolved more in 2017.

In 2018, most of the complaints fixed are of Graffiti type.

Electronic waste related complaints are resolved in 2017, 2018.

Overall, the total number of closed complaints shows an increasing trend up to 2017, which indicates an improvement in performance.


In [176]:
# Stacked yearly totals of closed requests for a fixed set of complaint types.
selected_complaint_types = [
    'Sanitation Condition', 'Dirty Conditions',
    'Missed Collection (All Materials)', 'Derelict Vehicles',
    'Request Large Bulky Item Collection', 'Graffiti', 'Snow',
    'Other Enforcement', 'Electronics Waste', 'Litter Basket / Request',
]
p1 = (
    df2[df2['Complaint Type'].isin(selected_complaint_types)]
    .groupby(['Closed Year', 'Complaint Type'])
    .size()
    .unstack()
)
p1.plot(kind='bar', stacked=True, figsize=[20, 15],
        ylabel='Total cases', title='Total cases resolved each year')
plt.show()

# Receiving complaints trend in a month




This graph shows the total number of calls received on each day of the month.
From it, we can infer that the number of complaints is comparatively high around the middle of the month
and then decreases towards the end of the month.

In [177]:
# Number of requests created on each day-of-month, drawn as a line.
calls_per_day = df2.groupby('Created Day').size()
trend_ax = calls_per_day.plot()
trend_ax.set_title('Receiving complaints trend in a month')
trend_ax.set_ylabel('Number of complaints')
plt.show()

# Target Variable Creation

There are instances where resolving a complaint takes from more than three months to almost a year, so days is a more interpretable choice for the target variable.
Moreover, the closed date values do not carry a time-of-day component, so the data does not support predicting an exact hour of resolution. Hence, days is used as the target variable.

Created the target column Estimated_Days: the total number of days taken to resolve a complaint (a separate hours column is not computed in the cell below).

In [117]:
# Target: elapsed time between creation and closure, expressed in fractional
# days (timedelta divided by a one-day timedelta).
df['Estimated_Days'] = (df['Closed Date'] - df['Created Date'])/pd.Timedelta(days=1)


Estimated Days Box plot

In [118]:
# Box plot of the target. Pass the series by keyword: a positional argument
# is deprecated in seaborn 0.11 and is interpreted as `data` (not `x`) from
# seaborn 0.12 on, so the explicit form keeps the horizontal layout stable
# across versions.
sns.boxplot(x=df['Estimated_Days'])

In [119]:
# Histogram of the target to show how skewed the distribution is.
hist_fig, hist_ax = plt.subplots()
hist_ax.hist(df['Estimated_Days'])
hist_ax.set_title("Data looks skewed")
plt.show()

From the above graphs, we can see that the data is skewed and that there are negative values in the days column. To handle this, the following steps are performed:
1. Rows with non-positive days (closed on or before the created timestamp) are removed.
2. The average number of days to resolve a Graffiti complaint is very high compared to other complaint types. Because of this one category, the days column is skewed to the right, which would affect the predictions. To overcome this, records taking more than the average number of days for Graffiti are removed from the final dataset.

Removing rows where the closed date is not after the created date

In [120]:
# Keep only strictly positive resolution times. Note the strict inequality:
# besides negative durations this also drops rows where the duration is
# exactly zero.
df = df.loc[df['Estimated_Days'] > 0]

In [121]:
# Summary statistics of the target after removing non-positive durations.
df['Estimated_Days'].describe()

The graffiti complaints take a long time when compared to all other complaint type.

In [122]:
# Compare the mean resolution time of Graffiti complaints with everything else.
is_graffiti = df2['Complaint Type'] == 'Graffiti'
print(df2.loc[~is_graffiti, 'Days'].mean())  # all complaint types except Graffiti
print(df2.loc[is_graffiti, 'Days'].mean())   # Graffiti only

Keep only the records that were resolved within at most 100 days.

In [123]:
# Upper bound (in days) on the resolution time kept for modelling.
MAX_ESTIMATED_DAYS = 100
df = df[df['Estimated_Days'].le(MAX_ESTIMATED_DAYS)]

In [124]:
# Final shape and dtypes of the modelling frame.
df.info()

# Data Modeling

In [125]:
# Separate predictors from the target and hold out 20% of the rows for testing.
seed = 42

# Target plus the raw/closing-date columns that must not be used as predictors.
non_feature_columns = ['Estimated_Days', 'Created Date', 'Closed Date', 'Closed Year']
y = df['Estimated_Days']
X = df.drop(columns=non_feature_columns)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=seed
)

In [126]:
# Convert every split to a plain NumPy array.
X_train, X_test, y_train, y_test = (
    np.array(split) for split in (X_train, X_test, y_train, y_test)
)

# Min-max scaling: the scaler is fitted on the training split only and then
# applied to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Model 1 - Linear Regression

In [127]:
# Model 1: linear regression fitted on the scaled training features.
Model1R = LinearRegression().fit(X_train, y_train)

# Predictions on both splits, reused by the metric and plotting cells below.
train_preds_1R, test_preds_1R = Model1R.predict(X_train), Model1R.predict(X_test)

Error Metrics

In [128]:
# Train-split error metrics for Model 1R.
metric_fns = [
    ('Mean Absolute Error: ', mean_absolute_error),
    ('Median Absolute Error: ', median_absolute_error),
    ('MSE: ', mean_squared_error),
    ('MAPE: ', mean_absolute_percentage_error),
    ('R2: ', r2_score),
]
for label, metric_fn in metric_fns:
    print(label, metric_fn(y_train, train_preds_1R))
# Bias = mean signed residual (actual minus predicted).
print('Bias: ', np.mean(y_train - train_preds_1R))

In [129]:
# Test-split error metrics for Model 1R.
metric_fns = [
    ('Mean Absolute Error: ', mean_absolute_error),
    ('Median Absolute Error: ', median_absolute_error),
    ('MSE: ', mean_squared_error),
    ('MAPE: ', mean_absolute_percentage_error),
    ('R2: ', r2_score),
]
for label, metric_fn in metric_fns:
    print(label, metric_fn(y_test, test_preds_1R))
# Bias = mean signed residual (actual minus predicted).
print('Bias: ', np.mean(y_test - test_preds_1R))

In [130]:
# Actual-vs-predicted scatter for Model 1R: train on the left, test on the right.
fig, axes = plt.subplots(1, 2)
fig.suptitle("Model 1R Results", fontsize=20)

panels = [
    (axes[0], y_train, train_preds_1R, "Predicted (TRAIN)"),
    (axes[1], y_test, test_preds_1R, "Predicted (TEST)"),
]
for panel_ax, actual, predicted, y_label in panels:
    panel_ax.scatter(x=actual, y=predicted)
    panel_ax.set_xlabel("Actual", fontsize=10)
    panel_ax.set_ylabel(y_label, fontsize=10)
    # Fix both axes to the 0-100 day range of the target.
    panel_ax.set_xlim(0, 100)
    panel_ax.set_ylim(0, 100)
    # 45-degree reference line: points on it are perfect predictions.
    x = np.linspace(*panel_ax.get_xlim())
    panel_ax.plot(x, x, color='red')

fig.tight_layout()
fig.subplots_adjust(top=0.88)
plt.show()

# Model 2 - LassoCV

In [131]:
# Model 2: Lasso regression with the regularisation strength chosen by
# cross-validation (LassoCV defaults).
ModelL = LassoCV().fit(X_train, y_train)

# Predictions on both splits for the metric cells below.
train_preds_L, test_preds_L = ModelL.predict(X_train), ModelL.predict(X_test)

Error metrics

In [132]:
# Train-split error metrics for the LassoCV model.
metric_fns = [
    ('Mean Absolute Error: ', mean_absolute_error),
    ('Median Absolute Error: ', median_absolute_error),
    ('MSE: ', mean_squared_error),
    ('MAPE: ', mean_absolute_percentage_error),
    ('R2: ', r2_score),
]
for label, metric_fn in metric_fns:
    print(label, metric_fn(y_train, train_preds_L))
# Bias = mean signed residual (actual minus predicted).
print('Bias: ', np.mean(y_train - train_preds_L))

In [133]:
# Test-split error metrics for the LassoCV model.
metric_fns = [
    ('Mean Absolute Error: ', mean_absolute_error),
    ('Median Absolute Error: ', median_absolute_error),
    ('MSE: ', mean_squared_error),
    ('MAPE: ', mean_absolute_percentage_error),
    ('R2: ', r2_score),
]
for label, metric_fn in metric_fns:
    print(label, metric_fn(y_test, test_preds_L))
# Bias = mean signed residual (actual minus predicted).
print('Bias: ', np.mean(y_test - test_preds_L))

# Model 3 - Decision Tree

In [134]:
# Model 3: decision tree with default hyper-parameters and a fixed seed.
Model3R = DecisionTreeRegressor(random_state=seed).fit(X_train, y_train)

# Predictions on both splits for the metric cells below.
train_preds_3R, test_preds_3R = Model3R.predict(X_train), Model3R.predict(X_test)

Error Metrics

In [135]:
# Train-split error metrics for Model 3R (decision tree).
metric_fns = [
    ('Mean Absolute Error: ', mean_absolute_error),
    ('Median Absolute Error: ', median_absolute_error),
    ('MSE: ', mean_squared_error),
    ('MAPE: ', mean_absolute_percentage_error),
    ('R2: ', r2_score),
]
for label, metric_fn in metric_fns:
    print(label, metric_fn(y_train, train_preds_3R))
# Bias = mean signed residual (actual minus predicted).
print('Bias: ', np.mean(y_train - train_preds_3R))

In [136]:
# Test-split error metrics for Model 3R (decision tree).
metric_fns = [
    ('Mean Absolute Error: ', mean_absolute_error),
    ('Median Absolute Error: ', median_absolute_error),
    ('MSE: ', mean_squared_error),
    ('MAPE: ', mean_absolute_percentage_error),
    ('R2: ', r2_score),
]
for label, metric_fn in metric_fns:
    print(label, metric_fn(y_test, test_preds_3R))
# Bias = mean signed residual (actual minus predicted).
print('Bias: ', np.mean(y_test - test_preds_3R))

# Model 4 - Random Forest

In [137]:
# Model 4: random forest with depth and leaf-size limits.
rf_params = dict(random_state=seed, max_depth=15, min_samples_leaf=10)
Model4R = RandomForestRegressor(**rf_params)
Model4R.fit(X_train, y_train)

# Predictions on both splits for the metric and plotting cells below.
train_preds_4R = Model4R.predict(X_train)
test_preds_4R = Model4R.predict(X_test)

Error Metrics

In [138]:
# Train-split error metrics for Model 4R (random forest).
metric_fns = [
    ('Mean Absolute Error: ', mean_absolute_error),
    ('Median Absolute Error: ', median_absolute_error),
    ('MSE: ', mean_squared_error),
    ('MAPE: ', mean_absolute_percentage_error),
    ('R2: ', r2_score),
]
for label, metric_fn in metric_fns:
    print(label, metric_fn(y_train, train_preds_4R))
# Bias = mean signed residual (actual minus predicted).
print('Bias: ', np.mean(y_train - train_preds_4R))

In [139]:
# Test-split error metrics for Model 4R (random forest).
metric_fns = [
    ('Mean Absolute Error: ', mean_absolute_error),
    ('Median Absolute Error: ', median_absolute_error),
    ('MSE: ', mean_squared_error),
    ('MAPE: ', mean_absolute_percentage_error),
    ('R2: ', r2_score),
]
for label, metric_fn in metric_fns:
    print(label, metric_fn(y_test, test_preds_4R))
# Bias = mean signed residual (actual minus predicted).
print('Bias: ', np.mean(y_test - test_preds_4R))

In [140]:
# Actual-vs-predicted scatter for Model 4R: train on the left, test on the right.
fig, axes = plt.subplots(1, 2)
fig.suptitle("Model 4R Results", fontsize=20)

panels = [
    (axes[0], y_train, train_preds_4R, "Predicted (TRAIN)"),
    (axes[1], y_test, test_preds_4R, "Predicted (TEST)"),
]
for panel_ax, actual, predicted, y_label in panels:
    panel_ax.scatter(x=actual, y=predicted)
    panel_ax.set_xlabel("Actual", fontsize=10)
    panel_ax.set_ylabel(y_label, fontsize=10)
    # Fix both axes to the 0-100 day range of the target.
    panel_ax.set_xlim(0, 100)
    panel_ax.set_ylim(0, 100)
    # 45-degree reference line: points on it are perfect predictions.
    x = np.linspace(*panel_ax.get_xlim())
    panel_ax.plot(x, x, color='red')

fig.tight_layout()
fig.subplots_adjust(top=0.88)
plt.show()

# Model 5 - Gradient Boosting

In [141]:
# Model 5: gradient boosting with default hyper-parameters and a fixed seed.
Model5R = GradientBoostingRegressor(random_state=seed)
Model5R.fit(X_train, y_train)

# Predictions on both splits for the metric cells below.
train_preds_5R = Model5R.predict(X_train)
test_preds_5R = Model5R.predict(X_test)

In [142]:
# Train-split error metrics for Model 5R (gradient boosting).
# Fix: this cell previously evaluated `train_preds_3R` (the decision tree's
# predictions), so the numbers reported for gradient boosting were actually
# the decision tree's training metrics.
print('Mean Absolute Error: ', mean_absolute_error(y_train, train_preds_5R))  # Calculate the mean absolute error
print('Median Absolute Error: ', median_absolute_error(y_train, train_preds_5R))  # Calculate the median absolute error
print('MSE: ', mean_squared_error(y_train, train_preds_5R))  # Calculate MSE
print('MAPE: ', mean_absolute_percentage_error(y_train, train_preds_5R))  # Calculate MAPE
print('R2: ', r2_score(y_train, train_preds_5R))  # Calculate R2
# Bias = mean signed residual (actual minus predicted).
print('Bias: ', np.mean(y_train - train_preds_5R))

In [143]:
# Test-split error metrics for Model 5R (gradient boosting).
metric_fns = [
    ('Mean Absolute Error: ', mean_absolute_error),
    ('Median Absolute Error: ', median_absolute_error),
    ('MSE: ', mean_squared_error),
    ('MAPE: ', mean_absolute_percentage_error),
    ('R2: ', r2_score),
]
for label, metric_fn in metric_fns:
    print(label, metric_fn(y_test, test_preds_5R))
# Bias = mean signed residual (actual minus predicted).
print('Bias: ', np.mean(y_test - test_preds_5R))

# Permutation Feature Importance

In [144]:
# Permutation feature importance of the random forest (Model4R) on the test
# split: each feature is shuffled n_repeats times and the resulting drop in
# score is recorded.
fig, ax = plt.subplots(1, 1, figsize=(40, 12))
# The figure title was the leftover placeholder 'Title'; give it a
# descriptive heading instead.
plt.suptitle('Permutation Feature Importance (test set)', y=1.05, size=20)

# MODEL 4
perm_result = permutation_importance(Model4R, X_test, y_test, n_repeats=10,
                                     random_state=seed)
# Sort features by mean importance so the most important end up at the top.
perm_sorted_idx = perm_result.importances_mean.argsort()
ax.boxplot(perm_result.importances[perm_sorted_idx].T, vert=False,
           labels=X.columns[perm_sorted_idx])
ax.set_title('Model 4')
fig.tight_layout()
plt.show()

# Model 6 - Random Forest with most important features

In [152]:
# Hand-picked subset of predictors for Model 6; the target column is appended
# so it travels with the reduced frame.
selected_predictors = [
    'Complaint Type_Graffiti', 'Created Month', 'Created Year',
    'Complaint Type_Vacant Lot', 'Borough_QUEENS', 'Borough_BROOKLYN',
    'Complaint Type_Sanitation Condition', 'Complaint Type_Snow',
    'Location Type_Street', 'Borough_MANHATTAN',
]
imp_features = selected_predictors + ['Estimated_Days']
df3 = df[imp_features]

In [154]:
# Same 80/20 split as before, applied to the reduced feature frame.
seed = 42

y1 = df3['Estimated_Days']              # target variable
X1 = df3.drop(columns=['Estimated_Days'])  # predictors only

X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X1, y1, test_size=0.2, shuffle=True, random_state=seed
)

In [155]:
# Convert every reduced split to a plain NumPy array.
X_train1, X_test1, y_train1, y_test1 = (
    np.array(split) for split in (X_train1, X_test1, y_train1, y_test1)
)

# Min-max scaling, fitted on the training split only. Note that this rebinds
# `scaler`, replacing the scaler fitted on the full feature set.
scaler = MinMaxScaler()
scaler.fit(X_train1)
X_train1 = scaler.transform(X_train1)
X_test1 = scaler.transform(X_test1)

In [156]:
# Model 6: random forest with the same settings as Model 4, trained on the
# reduced feature set.
rf_params = dict(random_state=seed, max_depth=15, min_samples_leaf=10)
Model6R = RandomForestRegressor(**rf_params)
Model6R.fit(X_train1, y_train1)

# Predictions on both splits for the metric cells below.
train_preds_6R = Model6R.predict(X_train1)
test_preds_6R = Model6R.predict(X_test1)

In [157]:
# Train-split error metrics for Model 6R (random forest, reduced features).
metric_fns = [
    ('Mean Absolute Error: ', mean_absolute_error),
    ('Median Absolute Error: ', median_absolute_error),
    ('MSE: ', mean_squared_error),
    ('MAPE: ', mean_absolute_percentage_error),
    ('R2: ', r2_score),
]
for label, metric_fn in metric_fns:
    print(label, metric_fn(y_train1, train_preds_6R))
# Bias = mean signed residual (actual minus predicted).
print('Bias: ', np.mean(y_train1 - train_preds_6R))

In [158]:
# Test-split error metrics for Model 6R (random forest, reduced features).
metric_fns = [
    ('Mean Absolute Error: ', mean_absolute_error),
    ('Median Absolute Error: ', median_absolute_error),
    ('MSE: ', mean_squared_error),
    ('MAPE: ', mean_absolute_percentage_error),
    ('R2: ', r2_score),
]
for label, metric_fn in metric_fns:
    print(label, metric_fn(y_test1, test_preds_6R))
# Bias = mean signed residual (actual minus predicted).
print('Bias: ', np.mean(y_test1 - test_preds_6R))

# Model Comparison

Comparing the performance of the models above, Random Forest has the highest R-squared value and the lowest error on both the train and test data sets. Its train R-squared is also close to its test R-squared, which is acceptable.
The models can be explored further through hyperparameter tuning with GridSearchCV, where preprocessing steps such as scaling and PCA can also be applied.