## Multiple Linear Regression

- regression with one dependent variable and two or more independent variables
- multivariate dataset

#### import required packages

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

#### load the data

In [13]:
# Read the startup dataset from the CSV file in the current working directory
df = pd.read_csv(filepath_or_buffer='50_Startups.csv')
# Preview the first five rows to confirm the columns were parsed as expected
print(df.head(n=5))

         RnD  Administration  Marketing       State     Profit
0  165349.20       136897.80  471784.10    New York  192261.83
1  162597.70       151377.59  443898.53  California  191792.06
2  153441.51       101145.55  407934.54     Florida  191050.39
3  144372.41       118671.85  383199.62    New York  182901.99
4  142107.34        91391.77  366168.42     Florida  166187.94


#### EDA

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
print(df.describe())

                 RnD  Administration      Marketing         Profit
count      50.000000       50.000000      50.000000      50.000000
mean    73721.615600   121344.639600  211025.097800  112012.639200
std     45902.256482    28017.802755  122290.310726   40306.180338
min         0.000000    51283.140000       0.000000   14681.400000
25%     39936.370000   103730.875000  129300.132500   90138.902500
50%     73051.080000   122699.795000  212716.240000  107978.190000
75%    101602.800000   144842.180000  299469.085000  139765.977500
max    165349.200000   182645.560000  471784.100000  192261.830000


In [5]:
# Column names, non-null counts, dtypes and memory usage of the DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   RnD             50 non-null     float64
 1   Administration  50 non-null     float64
 2   Marketing       50 non-null     float64
 3   State           50 non-null     object 
 4   Profit          50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


#### data cleansing process

In [7]:
# Count the missing values in each column; if every count is zero,
# no imputation or row removal is needed
print(df.isnull().sum())

RnD               0
Administration    0
Marketing         0
State             0
Profit            0
dtype: int64


In [8]:
# The 'State' column holds text labels rather than numbers, so collect its
# distinct values first; they are converted to numeric codes afterwards
state_unique_values = pd.unique(df['State'])
print(state_unique_values)

['New York' 'California' 'Florida']


#### replacement logic

we have the following categories
- 'New York' 
- 'California' 
- 'Florida'

replace each category with its own integer code
- 'New York' will be replaced with **1**
- 'California' will be replaced with **2**
- 'Florida' will be replaced with **3**

In [17]:
# Build the integer codes dynamically (1..k) so the cell keeps working when
# the number of distinct states changes.
# For the three states here this is equivalent to: replacement_values = [1, 2, 3]
replacement_values = np.arange(1, len(state_unique_values) + 1)

# Pair every state label with its code, in the order the labels were collected
state_codes = dict(zip(state_unique_values, replacement_values))

# Apply the mapping to the 'State' column only, so a matching value in any
# other column can never be rewritten, and assign the result back instead of
# using inplace=True. The explicit cast pins the dtype rather than relying on
# replace() to downcast the object column. Re-running the cell is harmless:
# labels that are already codes are left untouched.
df['State'] = df['State'].replace(state_codes).astype('int64')

# NOTE: integer codes impose an artificial ordering on a nominal feature;
# one-hot encoding is the usual choice if 'State' is used as a model input.
print(df.head())

         RnD  Administration  Marketing  State     Profit
0  165349.20       136897.80  471784.10      1  192261.83
1  162597.70       151377.59  443898.53      2  191792.06
2  153441.51       101145.55  407934.54      3  191050.39
3  144372.41       118671.85  383199.62      1  182901.99
4  142107.34        91391.77  366168.42      3  166187.94


In [15]:
# Pairwise Pearson correlation between all columns of the DataFrame.
# NOTE: 'State' holds arbitrary integer codes, so its coefficients depend on
# how the codes were assigned and should not be read as a real relationship.
corr = df.corr(method='pearson')
print(corr)

                     RnD  Administration  Marketing     State    Profit
RnD             1.000000        0.241955   0.724248  0.037930  0.972900
Administration  0.241955        1.000000  -0.032154  0.003026  0.200717
Marketing       0.724248       -0.032154   1.000000  0.137777  0.747766
State           0.037930        0.003026   0.137777  1.000000  0.048471
Profit          0.972900        0.200717   0.747766  0.048471  1.000000


#### preparing the dataset

In [18]:
# features: every column except the target ('Profit') and the encoded 'State'
x = df.drop(columns=['Profit', 'State'])
# target: the value the regression should predict
y = df['Profit']

In [20]:
# Split the data into train and test sets: 80% of the rows are used for
# training and the rest are held out for testing. The fixed random_state
# makes the shuffle, and therefore the split, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, random_state=123456
)

# sanity check: the two parts must partition the original rows exactly
assert len(x_train) + len(x_test) == len(x)
assert len(y_train) + len(y_test) == len(y)