# Multiple Linear Regression


In multiple linear regression we teach a machine to predict the value of a dependent variable from two or more independent variables.

### Import required packages

In [67]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

### Import dataset

In [68]:
# Read the salary records from the CSV that sits next to this notebook.
salary_csv = 'salaryData.csv'
dataset = pd.read_csv(salary_csv)
# Preview the first ten rows to check the columns that were loaded.
dataset.head(n=10)

Unnamed: 0,Depatment,WorkedHours,Certification,YearsExperience,Salary
0,Development,2300,0,1.1,39343
1,Testing,2100,1,1.3,46205
2,Development,2104,2,1.5,37731
3,UX Designer,1200,1,2.0,43525
4,Testing,1254,2,2.2,39891
5,UX Designer,1236,1,2.9,56642
6,Development,1452,2,3.0,60150
7,Testing,1789,1,3.2,54445
8,UX Designer,1645,1,3.2,64445
9,UX Designer,1258,0,3.7,57189


In [214]:
# X - independent variables (features): every column except the last one,
# i.e. department, worked hours, certification and years of experience.
X = dataset.iloc[:, :-1].values
# y - dependent variable (target): the last column, salary.
# Selected with -1 rather than a hard-coded position so it always stays the
# complement of the `:-1` feature slice above, even if a column is added.
y = dataset.iloc[:, -1].values
# Show the raw department column; it is categorical text and still has to be encoded.
X[:, 0]

array(['Development', 'Testing', 'Development', 'UX Designer', 'Testing',
       'UX Designer', 'Development', 'Testing', 'UX Designer',
       'UX Designer', 'Testing', 'Development', 'Development', 'Testing',
       'UX Designer', 'Development', 'Testing', 'UX Designer',
       'Development', 'Development', 'Testing', 'Testing', 'UX Designer',
       'Development', 'UX Designer', 'UX Designer', 'Testing',
       'Development', 'Testing', 'UX Designer'], dtype=object)

### Data preprocessing

The process of converting data from its initial raw format into another format for further analysis.

We have already covered data preprocessing; for more details, refer to the data-wrangling subfolder under "Data analysis with python". Now we are going to use LabelEncoder and OneHotEncoder for data preprocessing.

#### The link below shows an example of LabelEncoder and OneHotEncoder

https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/

### Data preprocessing for our model

In [215]:
# Encoding categorical data
# Start from the untouched feature columns on every run so this cell is
# idempotent: re-running it never re-encodes an already encoded array.
X_raw = dataset.iloc[:, :-1].values

# Map the department names in column 0 to integer codes. The fitted encoder
# is kept so the same mapping can be applied to new samples with `.transform`.
labelencoder = LabelEncoder()
X_raw[:, 0] = labelencoder.fit_transform(X_raw[:, 0])

# `OneHotEncoder(categorical_features=[0])` was deprecated in scikit-learn 0.20
# and removed in 0.22. ColumnTransformer is the supported way to one-hot encode
# a single column: the dummy columns come first, followed by the untouched
# remaining columns, which is the same layout the old argument produced.
onehotencoder = ColumnTransformer(
    transformers=[("department", OneHotEncoder(categories="auto"), [0])],
    remainder="passthrough",  # keep worked hours, certification, experience
    sparse_threshold=0,       # always return a dense array
)
X = np.asarray(onehotencoder.fit_transform(X_raw), dtype=float)
# Avoiding the Dummy Variable Trap: drop the first dummy column
X = X[:, 1:]
X

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[0.000e+00, 0.000e+00, 2.300e+03, 0.000e+00, 1.100e+00],
       [1.000e+00, 0.000e+00, 2.100e+03, 1.000e+00, 1.300e+00],
       [0.000e+00, 0.000e+00, 2.104e+03, 2.000e+00, 1.500e+00],
       [0.000e+00, 1.000e+00, 1.200e+03, 1.000e+00, 2.000e+00],
       [1.000e+00, 0.000e+00, 1.254e+03, 2.000e+00, 2.200e+00],
       [0.000e+00, 1.000e+00, 1.236e+03, 1.000e+00, 2.900e+00],
       [0.000e+00, 0.000e+00, 1.452e+03, 2.000e+00, 3.000e+00],
       [1.000e+00, 0.000e+00, 1.789e+03, 1.000e+00, 3.200e+00],
       [0.000e+00, 1.000e+00, 1.645e+03, 1.000e+00, 3.200e+00],
       [0.000e+00, 1.000e+00, 1.258e+03, 0.000e+00, 3.700e+00],
       [1.000e+00, 0.000e+00, 1.478e+03, 3.000e+00, 3.900e+00],
       [0.000e+00, 0.000e+00, 1.257e+03, 2.000e+00, 4.000e+00],
       [0.000e+00, 0.000e+00, 1.596e+03, 1.000e+00, 4.000e+00],
       [1.000e+00, 0.000e+00, 1.256e+03, 2.000e+00, 4.100e+00],
       [0.000e+00, 1.000e+00, 1.489e+03, 3.000e+00, 4.500e+00],
       [0.000e+00, 0.000e+00, 1.236e+03,

#### Splitting the dataset into the Training set and Test set

In [132]:
# Hold out one third of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
split_parts = train_test_split(X, y, test_size=1/3, random_state=0)
X_train, X_test, y_train, y_test = split_parts

#### Fitting Multiple Linear Regression to the Training set

In [166]:
# Fit the linear model on the training split. `fit` returns the estimator
# itself, so it can be created and bound in a single step.
regressor = LinearRegression().fit(X_train, y_train)
# Display the fitted estimator as the cell output.
regressor

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

#### Predicting the Test set results

In [156]:
# Predict salaries for the held-out test rows and show them.
y_pred = regressor.predict(X=X_test)
y_pred

array([ 42964.11902737, 123643.56179465,  63125.10291096,  60848.83188279,
       114352.88900925, 107050.56131194, 119216.00885373,  65415.04017263,
        75198.60674123,  99513.60553401])

### Testing our model by passing our own input data

In [224]:
# passing independent variables as supported format:
# [department, worked hours, certification, years of experience]
own_samples = [["Development", 1452, 2, 4.0], ["Testing", 1452, 3, 4.0], ['UX Designer', 2100, 1, 1.3]]
x_own_test_data = np.array(own_samples, dtype=object)

# data preprocessing
# New samples must be encoded with the encoders that were fitted on the
# training data (`transform`, never a fresh `fit_transform`). Fitting new
# encoders on the query rows only gives the right columns when those rows
# happen to contain every department; with fewer departments the codes and
# the number of dummy columns no longer match what the model was trained on.
x_own_test_data[:, 0] = labelencoder.transform(x_own_test_data[:, 0])
encoded_own = onehotencoder.transform(x_own_test_data)
# The encoder may return a scipy sparse matrix; densify it if so.
if hasattr(encoded_own, "toarray"):
    encoded_own = encoded_own.toarray()
# Avoiding the Dummy Variable Trap: drop the same first dummy column as in training
x_own_test_data = np.asarray(encoded_own, dtype=float)[:, 1:]

y_pred2 = regressor.predict(x_own_test_data)
print("-----RESULT------")
for sample, predicted_salary in zip(own_samples, y_pred2):
    print("Department, workedhours, cretification and years of experience", sample, "==>", predicted_salary)
print("-----RESULT END------")

-----RESULT------
Department, workedhours, cretification and years of experience ['Development', 1452, 2, 4.0] ==> 65734.91726187432
Department, workedhours, cretification and years of experience ['Testing', 1452, 3, 4.0] ==> 61759.79504379228
Department, workedhours, cretification and years of experience ['UX Designer', 2100, 1, 1.3] ==> 38282.63565990284
-----RESULT END------


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


### Complete code (code practiced)

In [259]:
#importing required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

#importing dataset
df = pd.read_csv('salaryData.csv')
# X_1 independent variables - every column except the last one
X_1 = df.iloc[:, :-1].values
# y_1 dependent variable - the last column (salary); -1 keeps it the
# complement of the `:-1` feature slice instead of a hard-coded position
y_1 = df.iloc[:, -1].values

#Data preprocessing

#labelencoder - map the department names in column 0 to integer codes
label_encoder_1 = LabelEncoder()
X_1[:, 0] = label_encoder_1.fit_transform(X_1[:, 0])

#Onehotencoder
# `OneHotEncoder(categorical_features=[0])` was deprecated in scikit-learn 0.20
# and removed in 0.22; ColumnTransformer applies the encoder to column 0 only
# and appends the remaining columns unchanged after the dummy columns.
onehotencoder_1 = ColumnTransformer(
    transformers=[("department", OneHotEncoder(categories="auto"), [0])],
    remainder="passthrough",
    sparse_threshold=0,  # always return a dense array
)
X_1 = np.asarray(onehotencoder_1.fit_transform(X_1), dtype=float)
#Avoid dummy variable trap - drop the first dummy column
X_1 = X_1[:, 1:]

#train test split - one third held out, fixed seed for a reproducible split
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1, y_1, test_size=1/3, random_state=0)

#Linear regression
regression_1 = LinearRegression()
#fitline
regression_1.fit(X_train_1, y_train_1)

#prediction
# Predict on this cell's own test split (X_test_1). The previous version used
# `X_test`, a variable from an earlier cell, so this cell failed with a
# NameError on a fresh kernel.
ypred_1 = regression_1.predict(X_test_1)
ypred_1

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([ 42964.11902737, 123643.56179465,  63125.10291096,  60848.83188279,
       114352.88900925, 107050.56131194, 119216.00885373,  65415.04017263,
        75198.60674123,  99513.60553401])