<a href="https://colab.research.google.com/github/christopher-c-lee/machine-learning-projects/blob/main/ContinuousPrediction_Section_A_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predicting Continuous Targets 





In [1]:
%matplotlib inline
import pandas as pd
import seaborn as sns

In [2]:
# Load the price data from a CSV in the notebook's working directory.
# NOTE: the recorded output of this cell is a FileNotFoundError, so the file must be
# placed (or uploaded, when running in Colab) next to the notebook before running.
car_df = pd.read_csv("ToyotaCorollaPrices.csv")

FileNotFoundError: ignored

In [None]:
# Preview the first ten rows of the raw data.
car_df.head(n=10)

### Just a quick exploration of the data 

In [None]:
# List the available column names.
car_df.columns

In [None]:
# Frequency of each fuel type.
car_df["Fuel_Type"].value_counts()

In [None]:
# Frequency of each color.
car_df["Color"].value_counts()

In [None]:
# Frequency of each distinct HP value.
car_df["HP"].value_counts()

In [None]:
# Preview the first rows again (default number of rows).
car_df.head()

In [None]:
# Color frequencies (this repeats the Color count shown a few cells earlier).
car_df.Color.value_counts()

In [None]:
# Distribution of the target variable, Price.
sns.displot(data=car_df, x="Price")

In [None]:
# Price distribution within each fuel type.
sns.boxplot(data=car_df, x="Fuel_Type", y="Price")

In [None]:
# Price against kilometres driven, colored by fuel type.
sns.relplot(data=car_df, x="KM", y="Price", hue="Fuel_Type")

In [None]:
# Price against Age, colored by fuel type.
sns.relplot(data=car_df, x="Age", y="Price", hue="Fuel_Type")

#### Now let us start building predictive models - we will use the following models:
    1. Nearest Neighbors
    2. Decision Trees 
    3. Random Forests 
    4. Linear Regression


#### First we have to convert the categorical variables into numerical ones - dummy variables

In [None]:
# Replace the categorical columns with dummy (indicator) columns.
# drop_first=True leaves out the first level of each category (more about drop_first next week).
car_df = pd.get_dummies(data=car_df, drop_first=True)

In [None]:
# Inspect the frame after dummy encoding.
car_df

#### Before applying nearest neighbors we have to scale the predictor data

In [None]:
from sklearn.preprocessing import MinMaxScaler # importing what we need for doing scaling. 

In [None]:
ourscaler = MinMaxScaler() # create the min-max scaler object; it is fitted to the predictors in the next cell

In [None]:
# Min-max scale every predictor, i.e. all columns except the target "Price".
# The target itself does not need to be scaled.
# NOTE(review): the scaler is fitted on the full dataset before the train/validation
# split, so validation rows influence the scaling parameters.
car_df_scaledPredictors = pd.DataFrame(
    data=ourscaler.fit_transform(car_df.drop(columns=["Price"])),
    index=car_df.index,
    columns=car_df.columns.drop("Price"),
)

In [None]:
# Inspect the scaled predictors.
car_df_scaledPredictors

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for validation; random_state makes the split repeatable.
# Predictors are the scaled columns, the target is the unscaled Price column.
train_predictors, valid_predictors, train_target, valid_target = train_test_split(
    car_df_scaledPredictors,
    car_df[["Price"]],
    test_size=0.3,
    random_state=1,
)

In [None]:
# First rows of the scaled training predictors.
train_predictors.head()

In [None]:
 # First rows of the training target (the Price column).
 train_target.head()

#### Now build Nearest Neighbors Model 

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
knnmodel = KNeighborsRegressor(n_neighbors=10) # Creating a knn model object with number of neighbors = 10
knnmodel.fit(train_predictors,train_target)  # fit on the scaled training predictors

#### Now compute predictions 

In [None]:
# Validation-set predictions from the KNN model, indexed like valid_target so the two
# frames can be compared row by row. The column label is given as a list, not a set.
knn_valid_predict = pd.DataFrame(
    knnmodel.predict(valid_predictors),
    index=valid_target.index,
    columns=["knn_prediction"],
)

In [None]:
# Inspect the KNN predictions for the validation rows.
knn_valid_predict

In [None]:
# Actual prices of the validation rows, for comparison with the predictions.
valid_target

In [None]:
!pip install dmba
from dmba import regressionSummary

In [None]:
# Compare KNN error metrics on the rows the model was fitted on versus the
# held-out validation rows.
print("TRAINING")
regressionSummary(train_target, knnmodel.predict(train_predictors))
print()
print("VALIDATION")
regressionSummary(valid_target, knnmodel.predict(valid_predictors))

### Now let us build a Decision Tree Regressor - for which we don't need scaled data. In fact we should not use scaled data, since the decision tree would lose its interpretability to the business. 
#### Note that car_df has the unscaled predictors 


In [None]:
# Re-split using the unscaled predictors (everything in car_df except Price).
# test_size and random_state are the same as in the split of the scaled data.
# Note: this overwrites the train/validation variables created from the scaled data.
train_predictors, valid_predictors, train_target, valid_target = train_test_split(
    car_df.drop(columns=["Price"]),
    car_df[["Price"]],
    test_size=0.3,
    random_state=1,
)

In [None]:
# Inspect the unscaled training predictors.
train_predictors

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Define the regression tree and fit it to the (unscaled) training data.
# max_depth and min_samples_split limit how far the tree is allowed to grow.
# random_state is fixed because scikit-learn permutes the features at every split, so
# ties between equally good splits could otherwise be broken differently on each run;
# the value 1 matches the seed used for the splits and the random forest in this notebook.
regTree = DecisionTreeRegressor(max_depth=3, min_samples_split=20, random_state=1)
regTree.fit(train_predictors, train_target)

In [None]:
# Compare the regression tree's error metrics on training versus validation rows.
print("TRAINING")
regressionSummary(train_target, regTree.predict(train_predictors))
print()
print("VALIDATION")
regressionSummary(valid_target, regTree.predict(valid_predictors))

#### In this case, the Decision Tree seems to produce a more accurate model! And it does not seem to be overfitting.

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pylab as plt
%matplotlib inline

In [None]:
plt.figure(figsize=(18, 12))  # set plot size (denoted in inches)
# Draw the fitted tree, labelling each split with the predictor column it uses.
plot_tree(regTree, feature_names=list(train_predictors.columns), filled=True, fontsize=10)
plt.show()  # must be called; a bare `plt.show` only echoes the function object



#### Now we apply Random Forests 

In [None]:
from sklearn.ensemble import RandomForestRegressor 

In [None]:
# Random forest with 20 trees; random_state=1 makes the fit repeatable.
rf_toyota = RandomForestRegressor(n_estimators=20, random_state=1)
# train_target is a one-column DataFrame. Pass it as a 1-D array so scikit-learn does
# not warn that a column-vector y was supplied where a 1-D array is expected.
rf_toyota.fit(train_predictors, train_target.to_numpy().ravel())


In [None]:
# Compare the random forest's error metrics on training versus validation rows.
print("TRAINING")
regressionSummary(train_target, rf_toyota.predict(train_predictors))
print()
print("VALIDATION")
regressionSummary(valid_target, rf_toyota.predict(valid_predictors))

### Now we will apply Linear Regression 

In [None]:
from sklearn.linear_model import LinearRegression

import matplotlib.pylab as plt

from dmba import regressionSummary
%matplotlib inline 

In [None]:
# Define a linear regression model and fit it to the training data

linreg_toyota = LinearRegression()
linreg_toyota.fit(train_predictors,train_target)   # this step finds the best beta parameters (intercept and one coefficient per predictor)



In [None]:
# Print the fitted intercept, then tabulate one beta coefficient per predictor column.
# intercept_ and coef_ are indexed with [0] because the model was fitted on a
# one-column target.
print('intercept or beta0 = ',linreg_toyota.intercept_[0])
BetaMatrix = pd.DataFrame(
    {
        'Predictor': train_predictors.columns,
        'Beta_coefficient': linreg_toyota.coef_[0],
    }
)
BetaMatrix

#### This means that for THIS predictive model 
 
 #### Predicted Price =
 
 #### The coefficients have interesting interpretations:

#### If everything else remains the same: 
     1. The sale price of the car decreases by ___ Euros for every additional month of Age 
     2. The sale price of the car decreases by ___ Euros for every additional KM on the car.
     3. Having Diesel as fuel type provides an increase in sale price of ___ Euros over an identical car with CNG Fuel 
     4. Having Petrol as fuel type provides an increase in sale price of ___ Euros over an identical car with CNG Fuel 

In [None]:
# Attach the linear-regression predictions to the validation targets.
# The model was fitted on a one-column DataFrame target, so predict() returns a column
# vector; flatten it before storing it as a single column.
# assign() builds a new frame instead of writing into the object returned by
# train_test_split, which can otherwise trigger pandas' SettingWithCopyWarning.
valid_target = valid_target.assign(
    Predicted_Price=linreg_toyota.predict(valid_predictors).ravel()
)

In [None]:
# Actual and predicted price side by side for the first 20 validation rows.
valid_target.head(20)

In [None]:
# Signed prediction error (predicted minus actual): positive means the model over-predicted.
valid_target["Error_PricePrediction"] = valid_target["Predicted_Price"] - valid_target["Price"]

In [None]:
# First 20 validation rows, now including the prediction error column.
valid_target.head(20)

In [None]:
# Actual price (x) against predicted price (y) for the validation rows.
sns.relplot(x="Price", y="Predicted_Price", data=valid_target)

In [None]:
# Same scatter, restricted to rows whose predicted price is not negative.
sns.relplot(
    data=valid_target.loc[valid_target["Predicted_Price"] >= 0],
    x="Price",
    y="Predicted_Price",
)

In [None]:
# Summary statistics of the predicted prices. The Predicted_Price column lives in
# valid_target; no frame named valid_results has been created at this point.
valid_target.Predicted_Price.describe()

In [None]:
# Predictor values of the validation rows that received a negative predicted price.
valid_predictors.loc[valid_target["Predicted_Price"] < 0]

#### Seems like a data entry error.  

In [None]:
# First 20 rows of the validation predictors, for comparison with the suspicious row.
valid_predictors.head(20)

#### Let us drop that row from the validation dataset 

In [None]:
# Remove the row with index label 80 (hard-coded) from the validation predictors.
valid_predictors = valid_predictors.drop(index=[80])

In [None]:
# Remove the same row (index label 80) from the validation targets.
valid_target = valid_target.drop(index=[80])

In [None]:
# Build valid_results from valid_target, which holds Price, Predicted_Price and
# Error_PricePrediction for the validation rows (valid_results has no earlier definition
# to drop from). errors="ignore" keeps this correct whether or not row 80 has already
# been removed from valid_target.
valid_results = valid_target.drop(index=[80], errors="ignore")

In [None]:
# Row/column count of the validation predictors after dropping the row.
valid_predictors.shape

In [None]:
# Row/column count of the validation targets; the row count should match valid_predictors.
valid_target.shape

In [None]:
# Actual vs. predicted price after removing the suspicious row.
# valid_target holds both the Price and Predicted_Price columns for the validation rows.
sns.relplot(data=valid_target, x="Price", y="Predicted_Price")

In [None]:
# Compare the linear regression's error metrics on training versus validation rows.
print("TRAINING - Linear Regression")
regressionSummary(train_target, linreg_toyota.predict(train_predictors))
print()
print("VALIDATION -Linear Regression")
# Score linreg_toyota here so the validation metrics describe the same model as the
# training metrics above. valid_target carries extra columns by now, so only "Price" is
# passed; the predictions are flattened so both arguments are 1-D.
regressionSummary(valid_target["Price"], linreg_toyota.predict(valid_predictors).ravel())

####  Class Exercise:
    1. Build a linear regression model using just two predictors Age and KM  ?
    2. How good is this model ? 
    
    Hint: In the above code, everywhere you see train_predictors or valid_predictors, replace it with train_predictors[["Age", "KM"]] and valid_predictors[["Age", "KM"]] respectively.
