### Use-case: An HR firm has hired you as a Data Scientist. Your job is to create and deploy a prediction model that can predict the salary of an employee based on his/her years of experience

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the salary dataset; the path is relative, so the CSV must be in the working directory
data = pd.read_csv('Salary_Data.csv')

In [3]:
# Summarise column names, dtypes and non-null counts to confirm that no values are missing
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 2 columns):
YearsExperience    30 non-null float64
Salary             30 non-null float64
dtypes: float64(2)
memory usage: 560.0 bytes


In [4]:
# Data preprocessing not required since dataset is complete

In [5]:
#Separate your data into features and label
# feature --- YearsExperience
# label ----- Salary
#
# Your label is continuous numeric data, so we can try a regression algorithm to create our model

In [6]:
# Since you are going to use SKLEARN for linear Regression
# 1. Sklearn expects your data to be complete
# 2. Sklearn expects your data to be numeric
# 3. Sklearn expects your data to be in the form of features and label
# 4. For Regression, Sklearn expects your features to be a 2d numpy array (the label may be 1d or 2d; this notebook uses 2d for both)
# 5. For Regression, data should be free from Outliers, since they can distort the fitted line

In [7]:
#Perform outlier check -- Homework

In [8]:
# Split the frame by column position, keeping each part 2-D (rows x 1 column):
#   column 0 -> features (YearsExperience), column 1 -> label (Salary)
features = data.iloc[:, 0:1].values
label = data.iloc[:, 1:2].values

In [10]:
# Confirm the feature array is 2-D (one row per sample, one column per feature)
features.ndim

2

In [14]:
# Deal with missing data -- Not required for this dataset

In [15]:
# Deal with categorical data -- Not required for this dataset

In [47]:
#Experimental Trial and Error using Looping
# Try the split seeds 1..29 and report every seed for which the model scores
# higher on the testing split than on the training split.
#
# NOTE(review): choosing the split seed by looking at the test score uses the
# test set for selection, so the reported test score is likely optimistic.
#
# LinearRegression is imported here because the cell that creates the final
# model (and imports it) comes later; without this import the loop fails with
# a NameError when the notebook is run top to bottom on a fresh kernel.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

for i in range(1,30):
    # Same 80/20 split every time; only the shuffle seed changes
    X_train,X_test,y_train,y_test = train_test_split(features,
                                                label,
                                                test_size= 0.2,
                                                random_state = i)

    # Fit a fresh model on this particular split
    model = LinearRegression()
    model.fit(X_train,y_train)

    # score() returns R^2 for a regressor
    training_score = model.score(X_train,y_train)
    testing_score = model.score(X_test,y_test)

    if testing_score > training_score:
        print("Testing : {} Training : {} Random State: {}".format(testing_score,training_score,i))
    
    

Testing : 0.9695039421049821 Training : 0.9545249190394052 Random State: 3
Testing : 0.9631182154839475 Training : 0.9528197369259258 Random State: 8
Testing : 0.9816423482070255 Training : 0.9494673013344644 Random State: 10
Testing : 0.9606215790278543 Training : 0.9527636176933665 Random State: 14
Testing : 0.9835849730044817 Training : 0.9460054870434312 Random State: 26
Testing : 0.9636425773684422 Training : 0.9527636606684406 Random State: 27


In [48]:
# Create the train/test split
# The data is divided into two parts:
#   - the training set is used to fit the model
#   - the testing set is held out to evaluate the model

from sklearn.model_selection import train_test_split

# test_size=0.2 holds out 20% of the rows for testing; random_state=26 fixes
# the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, random_state=26
)

In [49]:
#Create Model
# Step1: Initialize the algorithm (scikit-learn's linear regression estimator)
from sklearn.linear_model import LinearRegression
model = LinearRegression()

# Step2: Learning Phase - fit on the training split only; the testing split
# stays unseen so it can be used to evaluate the model afterwards
model.fit(X_train,y_train) # estimates b0 (intercept) and b1 (slope)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [50]:
#Check the Quality of the model
#
# 1. Check whether the model is generalized or not
#     testScore > trainScore
#    Generalization ensures your model performs well with Unknown data
#
# 2. Check whether the accuracy score is satisfactory for deployment or not

In [51]:
# R^2 of the fitted model on the training split
model.score(X_train,y_train)

0.9460054870434312

In [52]:
# R^2 of the fitted model on the held-out testing split
model.score(X_test,y_test)

0.9835849730044817

In [53]:
#Approved

In [54]:
#Equation: intercept (b0) of the fitted line
model.intercept_

array([26594.41811015])

In [55]:
# Slope (b1) of the fitted line: the coefficient applied to YearsExperience
model.coef_

array([[9393.91275928]])

In [56]:
#Equation of the line is
#Salary = 26594.41811015 + 9393.91275928 * (YearsExperience)

#What will be the salary of a fresher (0 years of experience)?
# Rs.26594.42

In [57]:
# Predict the salary for 0 years of experience; the input is a 2-D array
# (one row, one feature). The result equals the intercept of the fitted line.
model.predict(np.array([[0]]))

array([[26594.41811015]])

In [59]:
# User input: read the years of experience and predict the matching salary
yexp = float(input("Enter Years of Experience: "))
# predict() is given a 2-D array, so shape the single value as one row x one column
sal = model.predict(np.array([yexp]).reshape(1, -1))
# sal is a 1x1 array, which is why the message shows the salary in double brackets
print("The salary for {} years of experience is {}".format(yexp, sal))

Enter Years of Experience: 45
The salary for 45.0 years of experience is [[449320.49227779]]


In [60]:
#Final Deployment
import pickle #To dump the in-memory model object to a file so it can be reused later

# Open the target file in binary write mode inside a context manager so the
# handle is flushed and closed as soon as the dump finishes (the previous
# inline open() call left the file handle open).
# NOTE: only load this file back with pickle if it comes from a trusted source.
with open('HRSalaryPredictor.mdl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [61]:
# Re-display the slope after saving the model (same value as shown earlier)
model.coef_

array([[9393.91275928]])

In [62]:
# Re-display the intercept after saving the model (same value as shown earlier)
model.intercept_

array([26594.41811015])