# Linear Regression with Multiple Variables (Multiple Linear Regression)

# EXAMPLE

![](mul_var_example.png)

In [2]:
import pandas as pd
import numpy as np
from sklearn import linear_model

In [3]:
# Load the home-price table (area, bedrooms, age, price) from CSV and display it.
df = pd.read_csv('newhomeprices.csv')
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


In [4]:
# Data Preprocessing: Fill NA values with median value of a column
df.bedrooms.median()

4.0

In [5]:
# Replace the missing bedroom count with the column median, then show the result.
bedrooms_median = df['bedrooms'].median()
df['bedrooms'] = df['bedrooms'].fillna(bedrooms_median)
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,4.0,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


In [6]:
# Fit a linear model: the features are every column left after dropping the
# target column 'price'; the target is the price column itself.
reg = linear_model.LinearRegression()
reg.fit(df.drop('price',axis='columns'),df.price)

LinearRegression()

In [7]:
# Alternative way to fit the same model: select the three feature columns
# explicitly instead of dropping the target column.
reg = linear_model.LinearRegression()
reg.fit(df[['area','bedrooms','age']],df.price)

LinearRegression()

In [8]:
reg.coef_

array([  112.06244194, 23388.88007794, -3231.71790863])

In [9]:
reg.intercept_

221323.0018654043

#### Find the price of a 40-year-old home with 3000 sq ft area and 3 bedrooms

In [10]:
# The model was fitted on a DataFrame with named columns, so pass the query as a
# one-row DataFrame with the same column names (same order as in fit). This lets
# scikit-learn match features by name instead of warning that X has no feature names.
reg.predict(pd.DataFrame([[3000, 3, 40]], columns=['area', 'bedrooms', 'age']))



array([498408.25158031])

In [11]:
# Manual check of the prediction: price = m1*area + m2*bedrooms + m3*age + b (intercept)
# NOTE(review): the intercept literal below differs in its trailing digits from the
# reg.intercept_ value displayed in this notebook - presumably copied from another run.
112.06244194*3000 + 23388.88007794*3 + -3231.71790863*40 + 221323.00186540384

498408.25157402386

#### Find the price of a 5-year-old home with 2500 sq ft area and 4 bedrooms

In [12]:
# Pass the query as a one-row DataFrame whose columns match the ones used in fit(),
# so scikit-learn can validate features by name instead of warning about missing names.
reg.predict(pd.DataFrame([[2500, 4, 5]], columns=['area', 'bedrooms', 'age']))



array([578876.03748933])

In [13]:
# Manual check of the prediction: price = m1*area + m2*bedrooms + m3*age + b (intercept)
# NOTE(review): the intercept literal below differs in its trailing digits from the
# reg.intercept_ value displayed in this notebook - presumably copied from another run.
112.06244194*2500 + 23388.88007794*4 + -3231.71790863*5 + 221323.00186540384

578876.0374840139

# EXERCISE

### In the exercise folder there is hiring.csv. This file contains hiring statistics for a firm, such as each candidate's experience, written test score, and personal interview score. Based on these 3 factors, HR will decide the salary. Given this data, you need to build a machine learning model for the HR department that can help them decide salaries for future candidates. Using this model, predict salaries for the following candidates:

2 yr experience, 9 test score, 6 interview score

12 yr experience, 10 test score, 10 interview score

In [12]:
import pandas as pd
import numpy as np
from sklearn import linear_model

In [13]:
# Load the hiring data (experience, test score, interview score, salary) and display it.
# NOTE: this rebinds `df`, replacing the home-price frame used in the example above.
df = pd.read_csv('hiring.csv')
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [14]:
# Missing experience values are filled with the word "zero" so that the column
# stays in the same number-word form as its other entries ("five", "two", ...).
df.experience = df.experience.fillna("zero")
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,zero,8.0,9,50000
1,zero,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [15]:
# The experience column holds number words ("five", "two", ...), but the regression
# needs numeric input, so each word is converted to a number with word2number.
from word2number import w2n
df.experience = df.experience.apply(w2n.word_to_num)
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,,7,72000
7,11,7.0,8,80000


In [16]:
import math
median_test_score = math.floor(df['test_score(out of 10)'].mean())
median_test_score

7

In [17]:
df['test_score(out of 10)'] = df['test_score(out of 10)'].fillna(median_test_score)
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,7.0,7,72000
7,11,7.0,8,80000


In [18]:
# Fit salary against the three candidate features.
reg = linear_model.LinearRegression()
reg.fit(df[['experience','test_score(out of 10)','interview_score(out of 10)']],df['salary($)'])
# NOTE: the repr recorded with this notebook is `LinearRegression()`; the longer form
# below presumably comes from an older scikit-learn release:
# LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

LinearRegression()

In [19]:
reg.coef_

array([2922.26901502, 2221.30909959, 2147.48256637])

In [20]:
reg.intercept_

14992.651446693126

#### 2 yr experience, 9 test score, 6 interview score

In [None]:
# Pass the candidate as a one-row DataFrame whose columns match the ones used in
# fit(), so scikit-learn can validate features by name instead of warning.
reg.predict(pd.DataFrame(
    [[2, 9, 6]],
    columns=['experience', 'test_score(out of 10)', 'interview_score(out of 10)'],
))
# Answer: array([ 53713.86677124])

In [22]:
# Manual check of the prediction:
# salary = m1*experience + m2*test_score + m3*interview_score + b (intercept)
2922.26901502*2 + 2221.30909959*9 + 2147.48256637*6 + 14992.651446693126

53713.86677126313

#### 12 yr experience, 10 test score, 10 interview score

In [None]:
# Pass the candidate as a one-row DataFrame whose columns match the ones used in
# fit(), so scikit-learn can validate features by name instead of warning.
reg.predict(pd.DataFrame(
    [[12, 10, 10]],
    columns=['experience', 'test_score(out of 10)', 'interview_score(out of 10)'],
))
# Answer: array([ 93747.79628651])

In [23]:
# Manual check of the prediction:
# salary = m1*experience + m2*test_score + m3*interview_score + b (intercept)
2922.26901502*12 + 2221.30909959*10 + 2147.48256637*10 + 14992.651446693126

93747.79628653314

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   experience                  8 non-null      int64  
 1   test_score(out of 10)       8 non-null      float64
 2   interview_score(out of 10)  8 non-null      int64  
 3   salary($)                   8 non-null      int64  
dtypes: float64(1), int64(3)
memory usage: 384.0 bytes


In [25]:
df.describe()

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
count,8.0,8.0,8.0,8.0
mean,4.75,7.75,7.875,63000.0
std,4.26782,1.28174,1.642081,11501.55269
min,0.0,6.0,6.0,45000.0
25%,1.5,7.0,6.75,57500.0
50%,4.0,7.5,7.5,63500.0
75%,7.75,8.25,9.25,70500.0
max,11.0,10.0,10.0,80000.0
