# Multiple Linear Regression

# Importing the libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score

# Importing the dataset

In [3]:
# Read the company data: every column except the last is a feature,
# and the last column is the regression target.
dataset = pd.read_csv("Class Project/CompaniesProfit.csv")
x, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [4]:
# Peek at the first five feature rows before any encoding.
print(x[:5])

[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']]


# Encoding categorical data

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 3 (the state name). The remaining columns pass
# through untouched and end up after the encoded columns in the result.
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), [3]),
    ],
    remainder='passthrough',
)
x = np.array(ct.fit_transform(x))

In [6]:
# Show the encoded feature matrix: the one-hot state columns come first,
# followed by the numeric columns that were passed through unchanged.
print(x)

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 0.0 1.0 131876.9 99814.71 362861.36]
 [1.0 0.0 0.0 134615.46 147198.87 127716.82]
 [0.0 1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 0.0 1.0 120542.52 148718.95 311613.29]
 [1.0 0.0 0.0 123334.88 108679.17 304981.62]
 [0.0 1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 0.0 100671.96 91790.61 249744.55]
 [0.0 1.0 0.0 93863.75 127320.38 249839.44]
 [1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [0.0 1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 0.0 1.0 94657.16 145077.58 282574.31]
 [0.0 1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 0.0 1.0 78389.47 153773.43 299737.29]
 [0.0 1.0 0.0 73994.56 122782.75 3

# Splitting the dataset into the Training set and Test set

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

# Feature Scaling

In [8]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both splits so the test rows never influence the fit.
sds = StandardScaler().fit(x_train)
x_train = sds.transform(x_train)
x_test = sds.transform(x_test)

In [9]:
# Peek at the first five standardised training rows.
print(x_train[:5])

[[-0.8660254   2.         -0.76870611  1.17644103  0.84515251  0.94354978]
 [-0.8660254   2.         -0.76870611  0.96420324  1.27283565  0.42738817]
 [-0.8660254  -0.5         1.30088727 -1.47369826  0.0153175  -1.52350329]
 [-0.8660254  -0.5         1.30088727 -1.48308929 -2.79556363 -1.53809178]
 [-0.8660254  -0.5         1.30088727 -0.14952431  1.13637282 -0.71716495]]


In [10]:
# Peek at the first five test rows, scaled with the scaler fitted on the training set.
print(x_test[:5])

[[-0.8660254   2.         -0.76870611 -0.1403821   2.28593993 -0.63280437]
 [ 1.15470054 -0.5        -0.76870611  0.5692117  -1.24096039  0.37552686]
 [-0.8660254   2.         -0.76870611  0.59465017 -0.51102691  0.21780907]
 [-0.8660254   2.         -0.76870611 -0.92249538 -1.51579286 -0.27786722]
 [-0.8660254   2.         -0.76870611  1.65079661 -0.87781077  1.58762665]]


# Training the Multiple Linear Regression model on the Training set

In [11]:
from sklearn.linear_model import SGDRegressor

# Linear regression fitted by stochastic gradient descent with an elastic-net penalty.
# SGD shuffles the training data, so a fixed random_state is required for the
# coefficients (and every score computed from them) to be reproducible on a
# fresh Restart & Run All.
# NOTE(review): per the scikit-learn docs, `epsilon` only affects the
# epsilon-based losses; with the default loss it should have no effect. It is
# kept here so the estimator's parameters stay as originally specified.
regressor = SGDRegressor(
    alpha=0.0001,
    epsilon=0.01,
    eta0=0.1,
    penalty='elasticnet',
    random_state=0,
)
regressor.fit(x_train, y_train)

# Predicting the Test set results

In [12]:
y_pred = regressor.predict(x_test)

# Show predictions (left column) next to the actual targets (right column),
# printed with two decimal places.
np.set_printoptions(precision=2)
print(np.column_stack((y_pred, y_test)))

[[104291.47 103282.38]
 [132710.91 144259.4 ]
 [133467.25 146121.95]
 [ 72628.28  77798.83]
 [179199.23 191050.39]
 [114163.9  105008.31]
 [ 66089.19  81229.06]
 [ 97887.23  97483.56]
 [114441.15 110352.25]
 [168660.19 166187.94]
 [ 96448.38  96778.92]
 [ 87550.3   96479.51]
 [110752.51 105733.54]
 [ 91274.78  96712.8 ]
 [127760.98 124266.9 ]]


# Evaluating the Model Performance

In [13]:
from sklearn.metrics import r2_score

# R² of the predictions on the held-out test set (1.0 would be a perfect fit).
r2_score(y_true=y_test, y_pred=y_pred)

0.9347623223095752

# Cross-validating the model on the Training set (mean R² over 10 folds)

In [14]:
# 10-fold cross-validation on the training set; with no explicit `scoring`
# the regressor's own score (R²) is reported for each fold.
# NOTE(review): x_train was standardised using all training rows before this
# call, so every validation fold has already influenced the scaling. Wrapping
# the scaler and regressor in a Pipeline would remove that leak.
cv_score = cross_val_score(estimator=regressor, X=x_train, y=y_train, cv=10)
print("CV mean score: ", cv_score.mean())

CV mean score:  0.8424537375351673


In [15]:
import pickle

# Persist the fitted regressor. The context manager guarantees the file is
# flushed and closed even if pickling raises, which the bare open() did not.
# NOTE(review): the fitted encoder (ct) and scaler (sds) are not saved here, so
# anyone loading SGD.pkl must reproduce the same preprocessing before predict().
with open("SGD.pkl", "wb") as pickel_out:
    pickle.dump(regressor, pickel_out)