## Multiple Linear Regression

In [1]:
# Suppress every Python warning for the rest of the session to keep cell output clean.
# NOTE: this is a blanket filter, so deprecation and data-validation warnings
# raised by any library are hidden as well.
from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
import pandas as pd
# Location of the 50_Startups CSV; it is fetched over HTTP, so this cell needs network access.
path = r"https://raw.githubusercontent.com/sindhura-nk/Datasets/refs/heads/main/50_Startups.csv"
# Parse the CSV into a DataFrame used by every later cell.
df = pd.read_csv(path)
# Preview the first five rows.
df.head()

Unnamed: 0,RND,ADMIN,MKT,STATE,PROFIT
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


### Basic data quality checks

In [4]:
# (number of rows, number of columns) of the loaded data
df.shape

(50, 5)

In [5]:
# Column labels available in the loaded data
df.columns

Index(['RND', 'ADMIN', 'MKT', 'STATE', 'PROFIT'], dtype='object')

In [6]:
# Per-column dtype and non-null count, plus overall memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RND     50 non-null     float64
 1   ADMIN   50 non-null     float64
 2   MKT     50 non-null     float64
 3   STATE   50 non-null     object 
 4   PROFIT  50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [7]:
# Missing values: number of NaN entries in each column
df.isna().sum()

RND       0
ADMIN     0
MKT       0
STATE     0
PROFIT    0
dtype: int64

In [8]:
# Duplicate rows: number of rows that exactly repeat an earlier row
df.duplicated().sum()

np.int64(0)

### There are no missing values and no duplicated rows in the data

## Separate X and Y features
    Y : PROFIT
    X : RND, MKT

In [10]:
# Predictors: R&D spend and marketing spend.
X = df[['RND','MKT']]
# Target: profit. The double brackets keep Y as a one-column DataFrame (2-D),
# so the fitted intercept/coefficients and the predictions come back as 2-D arrays.
Y = df[['PROFIT']]

In [11]:
# Preview the first five rows of the predictor columns
X.head()

Unnamed: 0,RND,MKT
0,165349.2,471784.1
1,162597.7,443898.53
2,153441.51,407934.54
3,144372.41,383199.62
4,142107.34,366168.42


In [12]:
# Preview the first five rows of the target column
Y.head()

Unnamed: 0,PROFIT
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


## Model Building
`YPRED = B0 + B1*X1 + B2*X2`

`PROFIT_PREDICTED = B0 + B1*RND + B2*MKT`

In [14]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares linear regression with scikit-learn's default settings.
model = LinearRegression()
# Fit on all rows of X (RND, MKT) against Y (PROFIT); no rows are held out.
model.fit(X,Y)

In [15]:
# B0: predicted PROFIT when RND and MKT are both 0 (an array because Y is 2-D)
model.intercept_

array([46975.86422072])

In [16]:
# Slopes in the same order as the columns of X: [B1 for RND, B2 for MKT]
model.coef_

array([[0.79658404, 0.02990788]])

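`PROFIT_PREDICTED = 46975.86 + 0.797*RND + 0.030*MKT`

WHEN RND AND MKT ARE BOTH 0 => PROFIT PREDICTED = 46975.86

EACH 1-UNIT INCREASE IN RND (MKT HELD CONSTANT) => PROFIT PREDICTED INCREASES BY 0.797

EACH 1-UNIT INCREASE IN MKT (RND HELD CONSTANT) => PROFIT PREDICTED INCREASES BY 0.030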

In [17]:
# In-sample predictions: the model scores the same rows it was fitted on.
ypreds = model.predict(X)
# Display the full prediction array (one row per startup, one column for PROFIT).
ypreds

array([[192800.45862502],
       [189774.65948019],
       [181405.37809703],
       [173441.30884249],
       [171127.62321762],
       [162879.31081217],
       [158028.13045422],
       [160455.73887656],
       [152317.8036728 ],
       [154343.8139353 ],
       [135011.91472396],
       [134638.87007529],
       [129218.39657898],
       [127812.20546461],
       [150192.49179713],
       [146032.71543309],
       [117025.89184753],
       [130829.44473222],
       [128882.19882756],
       [115816.41833283],
       [116650.89209156],
       [118384.17070857],
       [114990.38463925],
       [109886.18521692],
       [112552.18715137],
       [102612.90924225],
       [110990.79288437],
       [114978.60515008],
       [103125.01275975],
       [102440.42409014],
       [ 99085.21956154],
       [ 98314.54885378],
       [ 98864.66225433],
       [ 97600.73044466],
       [ 90262.64121898],
       [ 89776.4942853 ],
       [ 75824.23391247],
       [ 87974.01451829],
       [ 686

## Evaluate the model
    Mean Squared Error
    Mean Absolute Error
    RMSE
    R2 score - Coefficient of determination

In [20]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [22]:
# Compare the actual PROFIT values (Y) with the model's predictions (ypreds).
mse = mean_squared_error(Y, ypreds)
mae = mean_absolute_error(Y, ypreds)
r2 = r2_score(Y, ypreds)
# RMSE is the square root of MSE.
rmse = mse ** 0.5

# One line per metric; R2 is reported as a percentage.
report_lines = [
    f"MSE: {mse:.2f}",
    f"MAE: {mae:.2f}",
    f"RMSE: {rmse:.2f}",
    f"R2 score: {r2*100:.2f}%",
]
print("\n".join(report_lines))

MSE: 78887897.01
MAE: 6499.32
RMSE: 8881.89
R2 score: 95.05%


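### Model can be used for final predictions since the R2 score is greater than 80%

Note: this R2 was computed on the same 50 rows the model was trained on, so it is an optimistic estimate; scoring on a held-out test set would give a fairer check.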

In [23]:
# New observation to score, in the same feature order as X: [RND, MKT]
xnew = [180000,205000]

In [24]:
# The model was fitted on a DataFrame with named columns, so score a DataFrame
# carrying those same names. Passing a bare nested list makes scikit-learn warn
# about missing feature names and gives no protection against swapped columns.
xnew_frame = pd.DataFrame([xnew], columns=model.feature_names_in_)
# predict() returns a 2-D array (one row per sample, one column for PROFIT);
# round to 2 decimals for display.
profit_predicted = model.predict(xnew_frame).round(2)

In [27]:
# profit_predicted is 2-D (one row, one target column), hence the [0][0] lookup.
print(f"For the given RND: {xnew[0]} and MKT: {xnew[1]}, Profit predicted by the model is {profit_predicted[0][0]}")

For the given RND: 180000 and MKT: 205000, Profit predicted by the model is 196492.11
