# Multiple Linear Regression

# Importing the libraries

In [40]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score

# Importing the dataset

In [41]:
dataset = pd.read_csv("Class Project/CompaniesProfit.csv")
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [42]:
print(x[:5,:])

[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']]


# Encoding categorical data

In [43]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
x = np.array(ct.fit_transform(x))

In [44]:
print(x)

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 0.0 1.0 131876.9 99814.71 362861.36]
 [1.0 0.0 0.0 134615.46 147198.87 127716.82]
 [0.0 1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 0.0 1.0 120542.52 148718.95 311613.29]
 [1.0 0.0 0.0 123334.88 108679.17 304981.62]
 [0.0 1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 0.0 100671.96 91790.61 249744.55]
 [0.0 1.0 0.0 93863.75 127320.38 249839.44]
 [1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [0.0 1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 0.0 1.0 94657.16 145077.58 282574.31]
 [0.0 1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 0.0 1.0 78389.47 153773.43 299737.29]
 [0.0 1.0 0.0 73994.56 122782.75 3

# Splitting the dataset into the Training set and Test set

In [45]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)

# Feature Scaling

In [46]:
from sklearn.preprocessing import StandardScaler
sds = StandardScaler()
x_train = sds.fit_transform(x_train)
x_test = sds.transform(x_test)

In [47]:
print(x_train[:5])

[[-0.87  2.   -0.77  1.18  0.85  0.94]
 [-0.87  2.   -0.77  0.96  1.27  0.43]
 [-0.87 -0.5   1.3  -1.47  0.02 -1.52]
 [-0.87 -0.5   1.3  -1.48 -2.8  -1.54]
 [-0.87 -0.5   1.3  -0.15  1.14 -0.72]]


In [48]:
print(x_test[:5])

[[-0.87  2.   -0.77 -0.14  2.29 -0.63]
 [ 1.15 -0.5  -0.77  0.57 -1.24  0.38]
 [-0.87  2.   -0.77  0.59 -0.51  0.22]
 [-0.87  2.   -0.77 -0.92 -1.52 -0.28]
 [-0.87  2.   -0.77  1.65 -0.88  1.59]]


# Training the Multiple Linear Regression model on the Training set

In [49]:
from sklearn.linear_model import SGDRegressor
# regressor = SGDRegressor()
regressor = SGDRegressor(alpha=0.0001, epsilon=0.01, eta0=0.1,penalty='elasticnet')
regressor.fit(x_train, y_train)

# Predicting the Test set results

In [50]:
y_pred = regressor.predict(x_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[104690.71 103282.38]
 [132020.77 144259.4 ]
 [134084.67 146121.95]
 [ 73355.91  77798.83]
 [179917.72 191050.39]
 [115310.14 105008.31]
 [ 67317.14  81229.06]
 [ 98768.07  97483.56]
 [115128.69 110352.25]
 [169383.15 166187.94]
 [ 97135.43  96778.92]
 [ 88713.49  96479.51]
 [111245.8  105733.54]
 [ 90466.34  96712.8 ]
 [128438.21 124266.9 ]]


# Evaluating the Model Performance

In [51]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.936325565078478

# Checking Training Accuracy 

In [52]:
cv_score = cross_val_score(regressor, x_train, y_train, cv = 10)
print("CV mean score: ", cv_score.mean())

CV mean score:  0.8478592472268021
