# Solving Regression Problems in Machine Learning using Sklearn Library

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
#Using built in datasets 

# Reading in the tips dataset

tips_df = sns.load_dataset("tips")
tips_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# Reading in the Diamonds dataset

diamond_df = sns.load_dataset("diamonds")
diamond_df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
# We will use the tips dataset to predict the tip for a particular record

# Dividing Data into Features and Labels

In [5]:
# Divinding the dataset into Target and Features

#Features

X = tips_df.drop(["tip"],axis=1)
y = tips_df["tip"]

In [6]:
#Examining the feature set

X.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,Female,No,Sun,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
3,23.68,Male,No,Sun,Dinner,2
4,24.59,Female,No,Sun,Dinner,4


In [7]:
#Examining the target variable

y.head()

0    1.01
1    1.66
2    3.50
3    3.31
4    3.61
Name: tip, dtype: float64

# Converting Categorical Data to Numbers

In [8]:
# Machine learning Algorithms can only work with numbers so it is important to convert the categorical data into numeric format

# First drop the categorical columns from the dataset

numerical = X.drop(["sex","smoker","day","time"],axis=1)

#Showing the new numerical dataset

numerical.head()


Unnamed: 0,total_bill,size
0,16.99,2
1,10.34,3
2,21.01,3
3,23.68,2
4,24.59,4


In [9]:
# Now creating a dataframe that contains only categorical columns

categorical = X.filter(["sex","smoker","day","time"])

#showing the new categorical dataset
categorical.head()

Unnamed: 0,sex,smoker,day,time
0,Female,No,Sun,Dinner
1,Male,No,Sun,Dinner
2,Male,No,Sun,Dinner
3,Male,No,Sun,Dinner
4,Female,No,Sun,Dinner


In [10]:
#Using one-hot encoding to convert the categorical columns using the pd.get_dummies() method

cat_numerical = pd.get_dummies(categorical,drop_first=True)

cat_numerical.head()

Unnamed: 0,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,1,1,0,0,1,1
1,0,1,0,0,1,1
2,0,1,0,0,1,1
3,0,1,0,0,1,1
4,1,1,0,0,1,1


In [11]:
# Next step is to join the numerical columns with the one-hot encoded columns using the concat function

X = pd.concat([numerical,cat_numerical],axis = 1)
X.head()

Unnamed: 0,total_bill,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,2,1,1,0,0,1,1
1,10.34,3,0,1,0,0,1,1
2,21.01,3,0,1,0,0,1,1
3,23.68,2,0,1,0,0,1,1
4,24.59,4,1,1,0,0,1,1


# Divide Data into Training and Test Sets

In [12]:
# Dividing the Data into Training and Test sets using the train_test_split() function from the sklearn library
# 80% training set and 20% test set 


from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=0)

# Data Scaling/Normalization

In [13]:
# Using StandardScaler() to convert all values to a uniform scale

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

#scaling the training set
X_train = sc.fit_transform(X_train)

#scaling the test set

X_test = sc.transform(X_test)

# Linear Regression 

In [15]:
from sklearn.linear_model import LinearRegression

#training the model
lin_reg = LinearRegression()

regressor = lin_reg.fit(X_train,y_train)

#making predictions on test set

y_pred = regressor.predict(X_test)

In [16]:
#Using Mean Absolute Error, Mean Squared Error and Root mean squared as the metrics to test the model

from sklearn import metrics

print("Mean Absolute Error: ",metrics.mean_absolute_error(y_test,y_pred))
print("Mean Squared Error: ",metrics.mean_squared_error(y_test,y_pred))
print("Root Mean Squared Error: ",np.sqrt(metrics.mean_squared_error(y_test,y_pred)))

Mean Absolute Error:  0.708021883297983
Mean Squared Error:  0.8939195221609613
Root Mean Squared Error:  0.9454731736865734


In [17]:
# Looking at the Mean Absolute Error it can be concluded that on average there is an error of 0.70 for predictions which means 
# That on average, the predicted tip values are 0.70$ more or less than the actual tip values

# KNN Regression

In [19]:
#Using KNN to predict the values for the tip column

from sklearn.neighbors import KNeighborsRegressor

KNN_reg = KNeighborsRegressor(n_neighbors = 5)

regressor = KNN_reg.fit(X_train,y_train)

y_pred = regressor.predict(X_test)

In [20]:
#Using Mean Absolute Error, Mean Squared Error and Root mean squared as the metrics to test the model

print("Mean Absolute Error: ",metrics.mean_absolute_error(y_test,y_pred))
print("Mean Squared Error: ",metrics.mean_squared_error(y_test,y_pred))
print("Root Mean Squared Error: ",np.sqrt(metrics.mean_squared_error(y_test,y_pred)))

Mean Absolute Error:  0.7513877551020406
Mean Squared Error:  0.9462902040816326
Root Mean Squared Error:  0.9727744877830794


In [21]:
# Since the Errors are higher it appears that Linear Regression is the better model. But it could also be the the value we picked
# for the number of neighbors.

# Random Forest Regression

In [22]:
# Training and testing the random forest

from sklearn.ensemble import RandomForestRegressor

rf_reg = RandomForestRegressor(random_state=42,n_estimators=500)
regressor = rf_reg.fit(X_train,y_train)
y_pred = regressor.predict(X_test)


# evaluating algorithm performance

print("Mean Absolute Error: ",metrics.mean_absolute_error(y_test,y_pred))
print("Mean Squared Error: ",metrics.mean_squared_error(y_test,y_pred))
print("Root Mean Squared Error: ",np.sqrt(metrics.mean_squared_error(y_test,y_pred)))

Mean Absolute Error:  0.7045665306122448
Mean Squared Error:  0.8020627395265322
Root Mean Squared Error:  0.8955795551074913


In [23]:
# Looking at the Mean Absolute Error it looks like the Random Forest Regression did slightly better than linear regression and
# and much better than KNN Regression

# Support Vector Regression

In [24]:
# Training and testing the SVM

from sklearn import svm

svm_reg = svm.SVR()
regressor = svm_reg.fit(X_train,y_train)
y_pred = regressor.predict(X_test)


# evaluating algorithm performance

print("Mean Absolute Error: ",metrics.mean_absolute_error(y_test,y_pred))
print("Mean Squared Error: ",metrics.mean_squared_error(y_test,y_pred))
print("Root Mean Squared Error: ",np.sqrt(metrics.mean_squared_error(y_test,y_pred)))

Mean Absolute Error:  0.7362521512772694
Mean Squared Error:  0.9684825097223093
Root Mean Squared Error:  0.9841150896731079


In [25]:
# Looks like Random Forest is still the better model

# K Fold Cross-Validation

In [26]:
# Using K Fold Cross-Validation on the SVM example above

from sklearn.model_selection import cross_val_score

print(cross_val_score(regressor,X,y,cv=5,scoring="neg_mean_absolute_error"))

[-0.66386205 -0.57007269 -0.63598762 -0.96960743 -0.87391702]


# Making a Prediction on a Single Record

In [27]:
# Picking the 100th record from our dataset to make a prediction on
tips_df.loc[100]

total_bill     11.35
tip              2.5
sex           Female
smoker           Yes
day              Fri
time          Dinner
size               2
Name: 100, dtype: object

In [29]:
# Trying to predict the value of the tip of the 100th record using the random forest regressor

rf_reg = RandomForestRegressor(random_state=42,n_estimators=500)
regressor = rf_reg.fit(X_train,y_train)

single_record = sc.transform(X.values[100].reshape(1,-1))
predicted_tip = regressor.predict(single_record)
print(predicted_tip)

[2.26622]


In [None]:
# The actual value is 2.5 so the predicted value was pretty close. 