### From the book "Python Machine Learning for Beginners" by AI Publishing

6.1 Preparing Data for Regression Problems

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

In [3]:
# List the datasets in the Seaborn library
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'exercise',
 'flights',
 'fmri',
 'gammas',
 'geyser',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'taxis',
 'tips',
 'titanic']

In [4]:
# To reada particular dataset into the Pandas dataframe, pass the datasetname to 
# the load_dataset() method of the Seaborn library
tips_df = sns.load_dataset("tips")
tips_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [5]:
# Loads the Diamonds dataset and displays its first five rows
diamond_df = sns.load_dataset("diamonds")
diamond_df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [6]:
# As afirst step, we divide the data into features and labels sets. 
# Our labels set consists of values from the “tip” column, while the features set consists of values 
# from the remaining columns
X = tips_df.drop(['tip'], axis=1)
y = tips_df['tip']

In [7]:
# Print the feature set
X.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,Female,No,Sun,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
3,23.68,Male,No,Sun,Dinner,2
4,24.59,Female,No,Sun,Dinner,4


In [8]:
# Print the label set
y.head()

0    1.01
1    1.66
2    3.50
3    3.31
4    3.61
Name: tip, dtype: float64

In [9]:
# Machine learning algorithms, for the most part, can only work with numbers.Therefore, 
# it is important to convert categorical data into a numeric format.In this regard, 
# the first step is to create a dataset of all numeric values. 
# To do so, drop the categorical columns from the dataset.
numerical = X.drop(['sex', 'smoker', 'day', 'time'], axis = 1)

In [10]:
# Shows that the dataframe “numerical” contains numericcolumns only
numerical.head()

Unnamed: 0,total_bill,size
0,16.99,2
1,10.34,3
2,21.01,3
3,23.68,2
4,24.59,4


In [11]:
# Create a dataframe that contains only categorical columns
categorical = X.filter(['sex', 'smoker', 'day', 'time'])
categorical.head()

Unnamed: 0,sex,smoker,day,time
0,Female,No,Sun,Dinner
1,Male,No,Sun,Dinner
2,Male,No,Sun,Dinner
3,Male,No,Sun,Dinner
4,Female,No,Sun,Dinner


In [12]:
# One of the most common approaches to convert a categorical column to a 
# numeric one is via one-hot encoding. In one-hot encoding, for every unique 
# value in the original columns, a new column is created.

# For instance, for sex, two columns: Female and Male, are created. If the 
# original sex column contained male, a 1 is added in the newly created Male 
# column, while 1 is added in the newly created Female column if the 
# original sex column contained Female.

# However, it can be noted that we do not really need two columns. A single 
# column, i.e., Female is enough since when a customer is female, we can add 1 
# in the Female column, else 1 can be added in that column. Hence,we 
# need N-1 one-hot encoded columns for all the N values in the original column.

# Let's convert categorical columns into one-hot encoded columns using 
# the pd.get_dummies() method.
cat_numerical = pd.get_dummies(categorical, drop_first=True)
cat_numerical.head()

Unnamed: 0,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,1,1,0,0,1,1
1,0,1,0,0,1,1
2,0,1,0,0,1,1
3,0,1,0,0,1,1
4,1,1,0,0,1,1


In [13]:
# Join the numerical columns with the one-hot
# encoded columns
X = pd.concat([numerical, cat_numerical], axis = 1)
X.head()

Unnamed: 0,total_bill,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,2,1,1,0,0,1,1
1,10.34,3,0,1,0,0,1,1
2,21.01,3,0,1,0,0,1,1
3,23.68,2,0,1,0,0,1,1
4,24.59,4,1,1,0,0,1,1


In [15]:
# Divide the dataset intotwo sets, i.e., a training set and a test set. 
# The dataset is trained via thetraining set and evaluated on the test set.
# The following script divides the data into an 80 percenttraining set 
# and a 20 percent test set.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [19]:
# Scale the data. You can see that some columns of the dataset contain small 
# values, while the others contain very large values. It is better to convert 
# all values to a uniform scale.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# Scaling the training set
X_train = sc.fit_transform(X_train)
# Scaling the test set
X_test = sc.transform(X_test)

6.2 Linear Regression

In [20]:
# To train the algorithm, the training and test sets, i.e., X_train and X_test 
# in our case, are passed to the fit() method of the object of the 
# LinearRegression class. The test set is passed to the predict() method of 
# the class to make predictions.
from sklearn.linear_model import LinearRegression

# Training the algorithm
lin_reg = LinearRegression()
regressor = lin_reg.fit(X_train, y_train)

# Making predictions on test set
y_pred = regressor.predict(X_test)

In [21]:
# Once you have trained a model and have made predictions on the test set,
# the next step is to know how well has your model performed for making predictions 
# on the unknown test set. There are various metrics to check that. 
# However, mean absolute error, mean squared error, and root meansquared error 
# are three of the most common metrics
from sklearn import metrics

print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error:  0.708021883297983
Mean Squared Error:  0.8939195221609613
Root Mean Squared Error:  0.9454731736865734


In [None]:
# By looking at the mean absolute error, it can be concluded that on average, there is an 
# error of 0.70 for predictions, which means that on average, the predicted tip values 
# are 0.70$ more or less than the actual tip values.

6.3 KNN Regression

In [24]:
# With Sklearn, it is extremely easy to implement KNN regression. To do so, you can use 
# the KNeighborsRegressor class. The process of training and testing is the same as linear 
# regression. For training, you need to call the fit() method, and for testing, 
# you need to call the predict() method.
# The following script shows the process of training, testing, and evaluatingthe 
# KNN regression algorithm for predicting the values for the tip column from the Tips dataset.
from sklearn.neighbors import KNeighborsRegressor

KNN_reg = KNeighborsRegressor(n_neighbors=5)
regressor = KNN_reg.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

from sklearn import metrics

print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))


Mean Absolute Error:  0.7513877551020406
Mean Squared Error:  0.9462902040816326
Root Mean Squared Error:  0.9727744877830794


6.4 Random Forest Regression

In [25]:
# RandomForestRegressor class from the Sklearn.ensemble module 
# can be used to implement random forest regressor algorithms
from sklearn.ensemble import RandomForestRegressor

# Training and testing the random forest
rf_reg = RandomForestRegressor(random_state=42, n_estimators=500)
regressor = rf_reg.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

from sklearn import metrics

print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error:  0.7054065306122449
Mean Squared Error:  0.8045782841306138
Root Mean Squared Error:  0.8969828783932354


In [26]:
# The mean absolute error value of 0.70 shows that random forest 
# performs better than both linear regression and KNN for 
# predicting tip in the Tipsdataset

6.5 Support Vector Regression

In [27]:
# With the Sklearn library, you can use the SVM class to implement 
# support vector regression algorithms
from sklearn import svm

# Training and testing the SVM
svm_reg = svm.SVR()

regressor = svm_reg.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

from sklearn import metrics

print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error:  0.7362521512772692
Mean Squared Error:  0.9684825097223088
Root Mean Squared Error:  0.9841150896731077


6.6 K Fold Cross-Validation

In [31]:
# With K-fold cross-validation, the data is divided into K parts. The experiments are also performed 
# for K parts. In each experiment, K-1 parts are used for training, and the Kth part is used for testing.
# For example, in 5-fold cross-validation, the data is divided into five equal parts, e.g., K1, K2, K3, 
# K4, and K5. In the first iteration, K1–K4 are used for training, while K5 is used for testing. 
# In the second test, K1, K2, K3, and K5 are used for training, and K4 is used for testing. 
# In this way, each part isused at least once for testing and once for training.
# You can use cross_val_score() function from the sklearn.model_selectionmodule to 
# perform cross validation. 
from sklearn.model_selection import cross_val_score

print(cross_val_score(regressor, X, y, cv=5, scoring='neg_mean_absolute_error'))

[-0.74119767 -0.65235278 -0.6900026  -1.03591702 -0.90284679]




In [32]:
# The output shows the mean absolute value for each of 
# the K folds.

6.7 Making Prediction on a Single Record

In [33]:
#  In this section, you will see how to make a prediction 
# using a single record as an input.
# Pick the 100th record from our dataset
tips_df.loc[100]

total_bill     11.35
tip              2.5
sex           Female
smoker           Yes
day              Fri
time          Dinner
size               2
Name: 100, dtype: object

In [35]:
# We will try to predict the value of the tip of the 100th record 
# using the random forest regressor algorithm and see what output we get. 
# Note that you have to scale your single record before it can be 
# used as input to your machine learning algorithm.
from sklearn.ensemble import RandomForestRegressor

rf_reg = RandomForestRegressor(random_state=42, n_estimators=500)
regressor = rf_reg.fit(X_train, y_train)

single_record = sc.transform(X.values[100].reshape(1, -1))
predicted_tip = regressor.predict(single_record)
print(predicted_tip)

[2.2609]


In [None]:
# The predicted value of the tip is 2.26, which is pretty close to 2.5, i.e., the actual value.