In [10]:
# Multiple Linear Regression

# Import the libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Import the dataset
data = pd.read_csv('Dataset/50_Startups.csv')
X = data.iloc[:, :-1].values
y = data.iloc[:, 4].values

In [11]:
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [12]:
X

array([[165349.2, 136897.8, 471784.1, 'New York'],
       [162597.7, 151377.59, 443898.53, 'California'],
       [153441.51, 101145.55, 407934.54, 'Florida'],
       [144372.41, 118671.85, 383199.62, 'New York'],
       [142107.34, 91391.77, 366168.42, 'Florida'],
       [131876.9, 99814.71, 362861.36, 'New York'],
       [134615.46, 147198.87, 127716.82, 'California'],
       [130298.13, 145530.06, 323876.68, 'Florida'],
       [120542.52, 148718.95, 311613.29, 'New York'],
       [123334.88, 108679.17, 304981.62, 'California'],
       [101913.08, 110594.11, 229160.95, 'Florida'],
       [100671.96, 91790.61, 249744.55, 'California'],
       [93863.75, 127320.38, 249839.44, 'Florida'],
       [91992.39, 135495.07, 252664.93, 'California'],
       [119943.24, 156547.42, 256512.92, 'Florida'],
       [114523.61, 122616.84, 261776.23, 'New York'],
       [78013.11, 121597.55, 264346.06, 'California'],
       [94657.16, 145077.58, 282574.31, 'New York'],
       [91749.16, 114175.79, 29491

In [13]:
y

array([192261.83, 191792.06, 191050.39, 182901.99, 166187.94, 156991.12,
       156122.51, 155752.6 , 152211.77, 149759.96, 146121.95, 144259.4 ,
       141585.52, 134307.35, 132602.65, 129917.04, 126992.93, 125370.37,
       124266.9 , 122776.86, 118474.03, 111313.02, 110352.25, 108733.99,
       108552.04, 107404.34, 105733.54, 105008.31, 103282.38, 101004.64,
        99937.59,  97483.56,  97427.84,  96778.92,  96712.8 ,  96479.51,
        90708.19,  89949.14,  81229.06,  81005.76,  78239.91,  77798.83,
        71498.49,  69758.98,  65200.33,  64926.08,  49490.75,  42559.73,
        35673.41,  14681.4 ])

In [14]:
# One Hot Encoding categorical data
labelencoder = LabelEncoder()
X[:, 3] = labelencoder.fit_transform(X[:, 3])
onehotencoder = OneHotEncoder(categorical_features = [3])
X = onehotencoder.fit_transform(X).toarray()

TypeError: __init__() got an unexpected keyword argument 'categorical_features'

In [15]:
X

array([[165349.2, 136897.8, 471784.1, 2],
       [162597.7, 151377.59, 443898.53, 0],
       [153441.51, 101145.55, 407934.54, 1],
       [144372.41, 118671.85, 383199.62, 2],
       [142107.34, 91391.77, 366168.42, 1],
       [131876.9, 99814.71, 362861.36, 2],
       [134615.46, 147198.87, 127716.82, 0],
       [130298.13, 145530.06, 323876.68, 1],
       [120542.52, 148718.95, 311613.29, 2],
       [123334.88, 108679.17, 304981.62, 0],
       [101913.08, 110594.11, 229160.95, 1],
       [100671.96, 91790.61, 249744.55, 0],
       [93863.75, 127320.38, 249839.44, 1],
       [91992.39, 135495.07, 252664.93, 0],
       [119943.24, 156547.42, 256512.92, 1],
       [114523.61, 122616.84, 261776.23, 2],
       [78013.11, 121597.55, 264346.06, 0],
       [94657.16, 145077.58, 282574.31, 2],
       [91749.16, 114175.79, 294919.57, 1],
       [86419.7, 153514.11, 0.0, 2],
       [76253.86, 113867.3, 298664.47, 0],
       [78389.47, 153773.43, 299737.29, 2],
       [73994.56, 122782.75, 30331

In [16]:


# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [17]:
# Fitting Multiple Linear Regression to the Training set
regression = LinearRegression()
regression.fit(X_train, y_train)

LinearRegression()

In [18]:
X_test

array([[66051.52, 182645.56, 118148.2, 1],
       [100671.96, 91790.61, 249744.55, 0],
       [101913.08, 110594.11, 229160.95, 1],
       [27892.92, 84710.77, 164470.71, 1],
       [153441.51, 101145.55, 407934.54, 1],
       [72107.6, 127864.55, 353183.81, 2],
       [20229.59, 65947.93, 185265.1, 2],
       [61136.38, 152701.92, 88218.23, 2],
       [73994.56, 122782.75, 303319.26, 1],
       [142107.34, 91391.77, 366168.42, 1]], dtype=object)

In [19]:
# Predicting the Test set results
y_pred = regression.predict(X_test)

In [20]:
df = pd.DataFrame(data=y_test, columns=['y_test'])
df['y_pred'] = y_pred

In [21]:
df

Unnamed: 0,y_test,y_pred
0,103282.38,103959.40508
1,144259.4,132398.732367
2,146121.95,133529.037901
3,77798.83,72958.283683
4,191050.39,179534.787372
5,105008.31,115533.625841
6,81229.06,67476.958479
7,97483.56,98504.361994
8,110352.25,114789.160838
9,166187.94,168972.219099


In [22]:
# Predicting the sigle observation results. Here 1,0,0 represents that the state is Calfornia
a = [1,0,0,160349,134321,401400]
b = np.array(a)
b = b.reshape(1, -1)
y_pred_single_obs = regression.predict(b)
round(float(y_pred_single_obs), 2)

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 4 is different from 6)

In [23]:
#Model Evaluation
'''This calculates the coefficient of determination or the r^2 of the model. This can give a score between -1 and 1. 
Scores closer to -1 giving a negative impact on the model and scores closer to 1 give a positive impact to the model. 
In our case, we have 0.93 which is close to 1 which indicates that we have a pretty good model.'''
r2_score(y_test, y_pred)

0.9386861070938133

In [29]:
#Saving the model
'''scikit-learn has their own model persistence method we will use: joblib. 
This is more efficient to use with scikit-learn models due to it being better at handling larger numpy arrays that may 
be stored in the models.'''

import joblib
joblib.dump(regression, "multiple_regression_model.pkl")

['multiple_regression_model.pkl']

In [30]:
NewYork = 1
California = 0
Florida = 0
RnD_Spend = 160349
Administration_Spend = 134321
Marketing_Spend = 401400
pred_args = [NewYork,California,Florida,RnD_Spend,Administration_Spend,Marketing_Spend]
pred_args_arr = np.array(pred_args)
pred_args_arr = pred_args_arr.reshape(1, -1)
mul_reg = open("multiple_regression_model.pkl","rb")
ml_model = joblib.load(mul_reg)
model_prediction = ml_model.predict(pred_args_arr)

round(float(model_prediction), 2)

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 4 is different from 6)