**Multiple Linear Regression - Sklearn**

**Mount Drive**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Import Libraries**

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import math
import time
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, mean_absolute_error, explained_variance_score

**Load Dataset**

In [3]:
data = pd.read_csv('/content/drive/MyDrive/dataset/50_Startups.csv')
print(data.shape)
data.head(5)

(50, 5)


Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
# Assuming 'df' is already defined
fig = px.scatter_3d(data, x='R&D Spend', y='Administration', z='Marketing Spend', color='State',  # Color-coding the fourth feature
                    size_max=18)

# Update the layout to adjust the figure size
fig.update_layout(
    width=600,  # Width of the figure in pixels
    height=600  # Height of the figure in pixels
)

fig.show()

In [5]:
X = data.iloc[:, :-1].values
y = data.iloc[:, 4].values

In [6]:
print(X[:5])
print(y[:5])

[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']]
[192261.83 191792.06 191050.39 182901.99 166187.94]


In [7]:
# Encoding the categorical data
labelencoder = LabelEncoder()
X[:, 3] = labelencoder.fit_transform(X[:, 3])
print(X[:5])
# Apply OneHotEncoder and avoid the dummy variable trap by skipping the first column of the encoded data
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(drop='first'), [3])], remainder='passthrough')
X = ct.fit_transform(X).astype(float)
print(X[:5])

[[165349.2 136897.8 471784.1 2]
 [162597.7 151377.59 443898.53 0]
 [153441.51 101145.55 407934.54 1]
 [144372.41 118671.85 383199.62 2]
 [142107.34 91391.77 366168.42 1]]
[[0.0000000e+00 1.0000000e+00 1.6534920e+05 1.3689780e+05 4.7178410e+05]
 [0.0000000e+00 0.0000000e+00 1.6259770e+05 1.5137759e+05 4.4389853e+05]
 [1.0000000e+00 0.0000000e+00 1.5344151e+05 1.0114555e+05 4.0793454e+05]
 [0.0000000e+00 1.0000000e+00 1.4437241e+05 1.1867185e+05 3.8319962e+05]
 [1.0000000e+00 0.0000000e+00 1.4210734e+05 9.1391770e+04 3.6616842e+05]]


In [8]:
# Avoiding the Dummy Variable Trap
X = X[:, 1:]
print(X[:5])

[[1.0000000e+00 1.6534920e+05 1.3689780e+05 4.7178410e+05]
 [0.0000000e+00 1.6259770e+05 1.5137759e+05 4.4389853e+05]
 [0.0000000e+00 1.5344151e+05 1.0114555e+05 4.0793454e+05]
 [1.0000000e+00 1.4437241e+05 1.1867185e+05 3.8319962e+05]
 [0.0000000e+00 1.4210734e+05 9.1391770e+04 3.6616842e+05]]


In [9]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 3)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((40, 4), (10, 4), (40,), (10,))

In [10]:
# # Feature Scaling
# from sklearn.preprocessing import StandardScaler
# sc_X = StandardScaler()
# X_train = sc_X.fit_transform(X_train)
# X_test = sc_X.transform(X_test)
# sc_y = StandardScaler()
# y_train = sc_y.fit_transform(y_train)

**Train**

In [11]:
# Fitting Multiple Linear Regression to the Training set
lr = LinearRegression()
lr.fit(X_train,y_train)

**Prediction**

In [12]:
y_pred = lr.predict(X_test)
y_pred

array([128650.9969388 ,  85310.26216325, 155659.42581313,  45363.81714107,
        96078.59126456,  98236.41354081, 126480.95537885,  52896.49086871,
        47178.7521052 , 157822.64865485])

In [13]:
print("MAE",mean_absolute_error(y_test,y_pred))
print("MSE",mean_squared_error(y_test,y_pred))
print("R2 score",r2_score(y_test,y_pred))

MAE 7689.045541278436
MSE 93167802.81512256
R2 score 0.946982427873623


In [14]:
# # This for if you have 2 features columns
# x = np.linspace(-5, 5, 10)
# y = np.linspace(-5, 5, 10)
# xGrid, yGrid = np.meshgrid(y, x)
# z_final = lr.predict(final).reshape(10,10)
# z = z_final
# final = np.vstack((xGrid.ravel().reshape(1,100),yGrid.ravel().reshape(1,100))).T

In [15]:
# Use mean values for missing features
rd_spend_mean = np.mean(X_train[:, 0])  # Assuming the first feature is R&D Spend
admin_mean = np.mean(X_train[:, 1])     # Assuming the second feature is Administration
x = np.linspace(-5, 5, 10)
y = np.linspace(-5, 5, 10)
xGrid, yGrid = np.meshgrid(x, y)  # Note that the order here should be consistent

# Create a grid for the first two features and fill the other two with mean values
final = np.column_stack((
    np.full(xGrid.ravel().shape, rd_spend_mean),   # Fixed R&D Spend
    np.full(xGrid.ravel().shape, admin_mean),      # Fixed Administration
    xGrid.ravel(),                                 # xGrid (or another feature)
    yGrid.ravel()                                  # yGrid (or another feature)
))

z_final = lr.predict(final).reshape(xGrid.shape)

final = np.vstack((xGrid.ravel().reshape(1,100),yGrid.ravel().reshape(1,100))).T

In [16]:
final

array([[-5.        , -5.        ],
       [-3.88888889, -5.        ],
       [-2.77777778, -5.        ],
       [-1.66666667, -5.        ],
       [-0.55555556, -5.        ],
       [ 0.55555556, -5.        ],
       [ 1.66666667, -5.        ],
       [ 2.77777778, -5.        ],
       [ 3.88888889, -5.        ],
       [ 5.        , -5.        ],
       [-5.        , -3.88888889],
       [-3.88888889, -3.88888889],
       [-2.77777778, -3.88888889],
       [-1.66666667, -3.88888889],
       [-0.55555556, -3.88888889],
       [ 0.55555556, -3.88888889],
       [ 1.66666667, -3.88888889],
       [ 2.77777778, -3.88888889],
       [ 3.88888889, -3.88888889],
       [ 5.        , -3.88888889],
       [-5.        , -2.77777778],
       [-3.88888889, -2.77777778],
       [-2.77777778, -2.77777778],
       [-1.66666667, -2.77777778],
       [-0.55555556, -2.77777778],
       [ 0.55555556, -2.77777778],
       [ 1.66666667, -2.77777778],
       [ 2.77777778, -2.77777778],
       [ 3.88888889,

In [17]:
import plotly.graph_objects as go
import plotly.express as px

# Scatter plot
fig = px.scatter_3d(data, x='R&D Spend', y='Administration', z='Marketing Spend', color='State', size_max=18)

# Convert px scatter plot to a go.Figure object
fig = go.Figure(fig)

# Add surface plot
fig.add_trace(go.Surface(x=x, y=y, z=z_final, opacity=0.7, colorscale='Viridis'))

# Update layout if needed
fig.update_layout(
    width=800,
    height=600,
    title="3D Scatter Plot with Regression Surface"
)

fig.show()


In [18]:
# All the beta coefficients of all 4 features
lr.coef_

array([-4.12063772,  0.82222352, -0.08473696,  0.02170703])

In [19]:
# Intercept for this multiple linear regression
lr.intercept_

56839.48237766337