# Linear Regression - Example: Salary based on Years of experience

In [3]:
import pandas as pd

In [4]:
# Load the salary dataset (columns: YearsExperience, Salary) from a CSV in the working directory.
df = pd.read_csv("Salary_Data.csv")

In [5]:
# Display the loaded frame to check its columns and values.
df

Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0
5,2.9,56642.0
6,3.0,60150.0
7,3.2,54445.0
8,3.2,64445.0
9,3.7,57189.0


### Splitting dataset into input and output

In [6]:
# Feature matrix: the first column (YearsExperience) as a 2-D array of shape (n_samples, 1).
# In reshape(-1, 1) the -1 tells NumPy to infer the number of rows.
x = df.iloc[:, 0].to_numpy().reshape(-1, 1)

In [7]:
# Inspect the feature matrix: one row per sample, a single column.
x

array([[ 1.1],
       [ 1.3],
       [ 1.5],
       [ 2. ],
       [ 2.2],
       [ 2.9],
       [ 3. ],
       [ 3.2],
       [ 3.2],
       [ 3.7],
       [ 3.9],
       [ 4. ],
       [ 4. ],
       [ 4.1],
       [ 4.5],
       [ 4.9],
       [ 5.1],
       [ 5.3],
       [ 5.9],
       [ 6. ],
       [ 6.8],
       [ 7.1],
       [ 7.9],
       [ 8.2],
       [ 8.7],
       [ 9. ],
       [ 9.5],
       [ 9.6],
       [10.3],
       [10.5]])

In [8]:
# Target vector: the second column (Salary) as a 1-D NumPy array.
y = df.iloc[:, 1].to_numpy()

### Train Data

In [9]:
from sklearn.linear_model import LinearRegression #import library needed to do linear regression using Ordinary Least Square method

In [10]:
# Create the LinearRegression estimator; it is not fitted until .fit() is called.
Lin = LinearRegression()

In [11]:
# Fit on every row of the dataset (no train/test split yet).
Lin.fit(X=x, y=y)

LinearRegression()

### Prediction

In [12]:
# Predict the salary for 3.5 years of experience using the model fitted on all rows.
# The input must be 2-D (one row, one feature), hence the nested list.
Lin.predict([[3.5]])

array([58867.06832376])

### Splitting of data into Test and Train sets

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 20% of the rows for testing (80/20 split).
# random_state fixes the shuffle, like setting a seed, so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [21]:
# Size of the held-out test split (20% of the 30 samples).
x_test.shape

(6, 1)

In [22]:
# Size of the training split (the remaining 80% of the samples).
x_train.shape

(24, 1)

In [23]:
# YearsExperience values that landed in the test split.
x_test

array([[ 1.5],
       [10.3],
       [ 4.1],
       [ 3.9],
       [ 9.5],
       [ 8.7]])

### Training model only with 80% of data

In [24]:
# Refit the same estimator on the training split only.
# This replaces the coefficients learned earlier from the full dataset.
Lin.fit(X=x_train, y=y_train)

LinearRegression()

### Prediction with Test data

In [25]:
# Predicted salaries for the held-out test rows.
y_pred = Lin.predict(X=x_test)

In [26]:
y_pred  # predicted salaries for the test rows

array([ 40748.96184072, 122699.62295594,  64961.65717022,  63099.14214487,
       115249.56285456, 107799.50275317])

In [27]:
y_test  # actual salaries for the same test rows

array([ 37731., 122391.,  57081.,  63218., 116969., 109431.])

In [28]:
# Test inputs again, for side-by-side comparison with y_pred and y_test.
x_test

array([[ 1.5],
       [10.3],
       [ 4.1],
       [ 3.9],
       [ 9.5],
       [ 8.7]])

### Error evaluation

In [29]:
# Manual way: signed error per test row (positive means the model over-predicted).
y_pred - y_test

array([ 3017.96184072,   308.62295594,  7880.65717022,  -118.85785513,
       -1719.43714544, -1631.49724683])

In [30]:
# Absolute error per test row (sign dropped).
abs(y_pred - y_test)

array([3017.96184072,  308.62295594, 7880.65717022,  118.85785513,
       1719.43714544, 1631.49724683])

In [31]:
# Mean absolute error (MAE) computed by hand: the average of the absolute errors.
abs(y_pred - y_test).mean()

2446.1723690465055

In [32]:
#using library
from sklearn.metrics import mean_absolute_error

In [33]:
# Same MAE via scikit-learn; it should match the manual computation.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

2446.1723690465055

In [34]:
from sklearn.metrics import r2_score

In [35]:
# Coefficient of determination (R²) on the test split; 1.0 would be a perfect fit.
r2_score(y_true=y_test, y_pred=y_pred)

0.988169515729126

In [36]:
# R² ≈ 0.988 on the test set, so the line fits very well (though the test set has only 6 rows)

### Prediction

In [37]:
# Predict the salary for 3.5 years of experience using the model trained on the 80% split.
Lin.predict([[3.5]])

array([59374.11209418])

In [38]:
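# The mean absolute error on the test set is about 2.5K (2446.17), so for a person with 3.5 years of experience the model suggests roughly 60K, give or take about 2.5K on average. This is an average error, not a guaranteed bound: one test row above was off by about 7.9K.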

In [47]:
# Predict the salary for 5.5 years of experience.
Lin.predict([[5.5]])

array([77999.26234764])

# Save model

In [43]:
import pickle

In [44]:
# Serialize the fitted model to the file 'model_pickle' in the working directory.
# The context manager closes the file handle even if dumping fails.
with open('model_pickle', mode="wb") as model_file:
    pickle.dump(Lin, model_file)

In [45]:
# `sklearn.externals.joblib` has been removed from scikit-learn (the old import
# raises ImportError), so import the standalone joblib package, which is
# installed as a scikit-learn dependency.
import joblib

# Persist the fitted model to the file 'model_joblib' in the working directory
# (the joblib counterpart of the pickle dump).
joblib.dump(Lin, 'model_joblib')

In [None]:
### After that you will have a joblib file and a pickle file
### You can download them