In [47]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.linear_model import  LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

In [48]:
df = pd.read_csv('DeliveryTime.csv')

df.head(1)

Unnamed: 0,milesTraveled,numDeliveries,gasPrice,travelTime
0,89,4,3.84,7.0


In [49]:
stats.pearsonr(df.gasPrice, df.travelTime)
stats.pearsonr(df.numDeliveries, df.travelTime)
stats.pearsonr(df.milesTraveled, df.travelTime)

PearsonRResult(statistic=np.float64(0.9709435024360539), pvalue=np.float64(1.8253918485697388e-31))

In [50]:
# determine X and Y
X = df[['milesTraveled']]
Y = df.travelTime

In [51]:
# split into train and test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

In [52]:
# build and train using training data
my_model = LinearRegression()
my_model.fit(X_train, Y_train)

In [53]:
# evaluate model
Y_predicted = my_model.predict(X_test)

print('R2:', r2_score(Y_test, Y_predicted))
print('MSE:', mean_squared_error(Y_test, Y_predicted))

R2: 0.8770406383782032
MSE: 0.08004727198597678


In [58]:
# save the model
import joblib
joblib.dump(my_model, "TravelTimePredictor.pk")

['TravelTimePredictor.pk']

In [68]:
# load and use the model
# closer to 0 means it does less
# higher makes it more simple

model = joblib.load("TravelTimePredictor.pk")
new_records = pd.DataFrame({'milesTraveled': [70, 72]})
predictions = model.predict(new_records)
print(predictions)


[5.91056601 5.9982529 ]


In [69]:
# alpha is penalization term

from sklearn.linear_model import Lasso
my_lasso = Lasso(alpha = 1.0)
my_lasso.fit(X_train, Y_train)


In [70]:
# alpha is penalization term

from sklearn.linear_model import Ridge
my_ridge = Ridge(alpha = 1.0)
my_ridge.fit(X_train, Y_train)

In [66]:
# alpha is penalization term
# l1 ratio is ratio of lasso compared to ridge
from sklearn.linear_model import ElasticNet
my_elasticNet = ElasticNet(alpha = 1.0, l1_ratio = 0.5)
my_elasticNet.fit(X_train, Y_train)

In [71]:
### Exercise

In [73]:
# use income.csv to predict income

In [82]:
df2 = pd.read_csv('income.csv')

In [83]:
df2.describe()

Unnamed: 0,Education,Experience,Income
count,200.0,200.0,200.0
mean,16.07,19.485,96835.002018
std,2.63435,10.780113,20806.952181
min,12.0,0.0,40359.044735
25%,14.0,11.0,84028.674463
50%,16.0,21.0,98657.506189
75%,18.0,29.0,111077.466993
max,20.0,35.0,141498.886757


In [88]:
# determine X and Y
X = df.iloc[:, :-1]
Y = df.iloc[:,[-1]]

In [89]:
# do data pre processing
# scale it as income is very different to education and experience

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_transformed = scaler.fit_transform(X)
Y_transformed = scaler.fit_transform(Y)

In [100]:
# split data into train and test split

X_train, X_test, Y_train, Y_test = train_test_split(X_transformed, Y_transformed)


In [101]:
# build model

LR = LinearRegression()
LR.fit(X_train, Y_train)

In [102]:
# evaluate

Y_predicted = LR.predict(X_test)
r2_score(Y_predicted, Y_test)

0.9133080884571724