# Use-case 1:
An HR company has hired you as a data scientist to build a model that predicts the salary of a prospective employee from their years of experience. The relevant dataset is provided.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Read the salary data from the CSV file in the working directory into a DataFrame
dataset = pd.read_csv(filepath_or_buffer='Salary_Data.csv')

In [3]:
# Summarise the frame: row count, column names, non-null counts and dtypes
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 2 columns):
YearsExperience    30 non-null float64
Salary             30 non-null float64
dtypes: float64(2)
memory usage: 560.0 bytes


In [4]:
# Preview the first rows (head() shows five by default)
dataset.head()

Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0


In [5]:
# Split the data into the feature (years of experience) and the label (salary).
# Both are taken as 1-D NumPy arrays; columns are picked by name instead of by position.
features = dataset['YearsExperience'].values
label = dataset['Salary'].values

In [6]:
# Show the shape of each array on its own line
print(features.shape, label.shape, sep='\n')

(30,)
(30,)


In [8]:
# scikit-learn estimators require the feature matrix to be 2-D, shaped (n_samples, n_features).
# The label is reshaped to a single column as well, which is why the predictions and
# coef_ shown later come back as 2-D arrays.
features = np.reshape(features, (-1, 1))
label = np.reshape(label, (-1, 1))

In [9]:
# Confirm both arrays are now single-column 2-D arrays
print(f"{features.shape}\n{label.shape}")

(30, 1)
(30, 1)


In [10]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split reproducible
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, random_state=10
)

In [12]:
# Instantiate a linear regression estimator with its default settings (it is not fitted yet)
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [13]:
# Fit on the training split only. For linear regression this estimates the slope and intercept of the line.
model.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [14]:
# Predict salaries for the held-out test features and display them for comparison with y_test
y_pred = model.predict(X_test)
y_pred

array([[89715.76498848],
       [56031.05821174],
       [53223.99931368],
       [40124.39112273],
       [44802.8226195 ],
       [92522.82388655]])

In [15]:
# Actual salaries of the test split, to compare against the predictions in y_pred
y_test

array([[91738.],
       [54445.],
       [56642.],
       [37731.],
       [43525.],
       [98273.]])

In [16]:
# Score on the training split (for a regressor, score() returns the coefficient of determination R^2)
model.score(X_train,y_train)

0.9494673013344644

In [17]:
# R^2 on the held-out test split, to compare against the training score
model.score(X_test,y_test)

0.9816423482070255

In [None]:
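#The test score is slightly higher than the train score, so there is no sign that the model memorized the training data;
#it appears to have learned the pattern in a generalized fashion.
#On that basis the model looks generalized and is a reasonable candidate for deployment. Caveat: the test set has only
#6 rows, so the test score is a noisy estimate - confirm it with cross-validation before relying on it.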


#In case if the model is not generalized, try performing the following:
#1. Change the random state of train_test_split
#2. Try some other algorithms (DecisionTreeRegressor, RandomForestRegressor, SVR, GradientBoostingRegressor,
#   XGBoost, i.e. Extreme Gradient Boosting)
#3. Increase the amount of data
#4. Normalize the data (-1,1)
#5. Standardize the data (mean=0,std=1)
#6. Try reducing number of features (EDA, PCA, LDA)



In [18]:
# The fitted line has the form Salary = m * YearsExperience + b, which for this fit is
# Salary = 9356.86299354 * YearsExperience + 26089.09663242
# First line printed is the slope (m), second line is the intercept (b)
print(model.coef_, model.intercept_, sep='\n')

[[9356.86299354]]
[26089.09663242]


In [29]:
# Experience values to score, arranged as a single-feature column (one row per value).
# NOTE(review): 100 years is presumably far outside the range the model was trained on,
# so that prediction is an extrapolation - confirm it is intended as a demonstration.
yearsExperience = np.array([100, 1.3, 8, 7.5]).reshape(-1, 1)

In [30]:
# Check the input is 2-D (rows, one feature) before passing it to predict()
yearsExperience.shape

(4, 1)

In [31]:
# Predict one salary per row of yearsExperience using the fitted model
salary = model.predict(yearsExperience)

In [32]:
# Display the predicted salaries, one row per input value
salary

array([[961775.39598635],
       [ 38253.01852402],
       [100944.00058073],
       [ 96265.56908396]])

In [37]:
import pickle  # serialises an in-memory object to a file (like a snapshot)

# Persist the fitted model. The context manager guarantees the file handle is flushed and
# closed even if dump() raises; a bare open(...) passed inline would leave the handle open.
# NOTE: pickle files can execute arbitrary code when loaded - only unpickle this file from a trusted source.
with open('HRSalaryPredictor.model', 'wb') as model_file:
    pickle.dump(model, model_file)
