In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
import datetime as dt

In [2]:
data= pd.read_csv("./Dataset/Salary Prediction of Data Professions.csv")

In [3]:
data.head()

Unnamed: 0,FIRST NAME,LAST NAME,SEX,DOJ,CURRENT DATE,DESIGNATION,AGE,SALARY,UNIT,LEAVES USED,LEAVES REMAINING,RATINGS,PAST EXP
0,TOMASA,ARMEN,F,5-18-2014,01-07-2016,Analyst,21.0,44570,Finance,24.0,6.0,2.0,0
1,ANNIE,,F,,01-07-2016,Associate,,89207,Web,,13.0,,7
2,OLIVE,ANCY,F,7-28-2014,01-07-2016,Analyst,21.0,40955,Finance,23.0,7.0,3.0,0
3,CHERRY,AQUILAR,F,04-03-2013,01-07-2016,Analyst,22.0,45550,IT,22.0,8.0,3.0,0
4,LEON,ABOULAHOUD,M,11-20-2014,01-07-2016,Analyst,,43161,Operations,27.0,3.0,,3


In [4]:
x = data.drop(['FIRST NAME','LAST NAME','SALARY'], axis = 1)
y = data['SALARY']

In [5]:
x.shape, y.shape

((2639, 10), (2639,))

In [6]:
x['SEX'].nunique(), x['SEX'].unique() # check how many to encode
x = pd.get_dummies(x, columns=['SEX', 'DESIGNATION', 'UNIT'])

In [7]:
type(x['LEAVES REMAINING'][0])

numpy.float64

In [8]:
x['DOJ'] = pd.to_datetime(x['DOJ'])
x['CURRENT DATE'] = pd.to_datetime(x['CURRENT DATE'])

In [9]:
x['DOJ'].fillna(x['DOJ'].mean(), inplace=True)
x['CURRENT DATE'].fillna(x['CURRENT DATE'].mean(), inplace=True)
x['AGE'].fillna(x['AGE'].mean(), inplace=True)
x['RATINGS'].fillna(round(x['RATINGS'].mean()), inplace=True)
x['LEAVES USED'].fillna(30 - x['LEAVES REMAINING'], inplace=True)
x['LEAVES REMAINING'].fillna(30 - x['LEAVES USED'], inplace=True)

In [10]:
x['DOJ'] = x['DOJ'].map(dt.datetime.toordinal)
x['CURRENT DATE'] = x['CURRENT DATE'].map(dt.datetime.toordinal)

In [11]:
x.head()

Unnamed: 0,DOJ,CURRENT DATE,AGE,LEAVES USED,LEAVES REMAINING,RATINGS,PAST EXP,SEX_F,SEX_M,DESIGNATION_Analyst,...,DESIGNATION_Director,DESIGNATION_Manager,DESIGNATION_Senior Analyst,DESIGNATION_Senior Manager,UNIT_Finance,UNIT_IT,UNIT_Management,UNIT_Marketing,UNIT_Operations,UNIT_Web
0,735371,735970,21.0,24.0,6.0,2.0,0,True,False,True,...,False,False,False,False,True,False,False,False,False,False
1,735169,735970,24.756449,17.0,13.0,3.0,7,True,False,False,...,False,False,False,False,False,False,False,False,False,True
2,735442,735970,21.0,23.0,7.0,3.0,0,True,False,True,...,False,False,False,False,True,False,False,False,False,False
3,734961,735970,22.0,22.0,8.0,3.0,0,True,False,True,...,False,False,False,False,False,True,False,False,False,False
4,735557,735970,24.756449,27.0,3.0,3.0,3,False,True,True,...,False,False,False,False,False,False,False,False,True,False


In [12]:
X_train, X_test, y_train, y_test= train_test_split(x,y, train_size=0.8, random_state=0, shuffle=True)

In [13]:
X_train.head()

Unnamed: 0,DOJ,CURRENT DATE,AGE,LEAVES USED,LEAVES REMAINING,RATINGS,PAST EXP,SEX_F,SEX_M,DESIGNATION_Analyst,...,DESIGNATION_Director,DESIGNATION_Manager,DESIGNATION_Senior Analyst,DESIGNATION_Senior Manager,UNIT_Finance,UNIT_IT,UNIT_Management,UNIT_Marketing,UNIT_Operations,UNIT_Web
39,735555,735970,24.0,23.0,7.0,2.0,2,False,True,True,...,False,False,False,False,False,True,False,False,False,False
1503,735109,735970,21.0,22.0,8.0,4.0,0,False,True,True,...,False,False,False,False,False,False,False,False,False,True
743,734883,735970,21.0,19.0,11.0,5.0,0,True,False,True,...,False,False,False,False,False,False,False,False,True,False
2329,735521,735970,23.0,26.0,4.0,3.0,1,False,True,True,...,False,False,False,False,False,False,True,False,False,False
2265,735219,735970,25.0,19.0,11.0,2.0,2,False,True,True,...,False,False,False,False,False,False,False,False,True,False


In [14]:
##linear regression model

def Linear_regression(x_train,y_train,x_test,y_test):  
    lr=LinearRegression()
    lr.fit(x_train,y_train)
    pred=lr.predict(x_test)
    return pred

LR_predicted = Linear_regression(X_train,y_train,X_test,y_test)         ##calling function

In [15]:
## function to calculate r2_score of predicted values
def r2_score_(actual_value,predicted_value):
    r2=r2_score(actual_value, predicted_value)
    return r2

r2_score_(y_test,LR_predicted)

0.9562999496726552

In [16]:
#grid search cv function

def grid_search(estimator,parameters,x,y):
    grid= GridSearchCV(estimator,param_grid=parameters)
    grid.fit(x,y)
    return grid.best_params_

In [17]:
parameters_decision_tree={
    'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'max_depth':[2,4,8,10,None],
    'min_samples_split' : [2,4,8,10,20],
    'max_leaf_nodes' : [4, 8, 10,15,20],
    'splitter':['best','random'],
    'max_features':['auto','sqrt','log2'],
    'min_weight_fraction_leaf':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
}

#insert parameters and estimator in grid search function
grid_search(DecisionTreeRegressor(),parameters_decision_tree,X_train,y_train)

60000 fits failed out of a total of 135000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15000 fits failed with the following error:
Traceback (most recent call last):
  File "/home/firas_bt/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/firas_bt/anaconda3/lib/python3.11/site-packages/sklearn/tree/_classes.py", line 1247, in fit
    super().fit(
  File "/home/firas_bt/anaconda3/lib/python3.11/site-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/home/firas_bt/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_cons

{'criterion': 'squared_error',
 'max_depth': 2,
 'max_features': 'auto',
 'max_leaf_nodes': 4,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.1,
 'splitter': 'best'}

In [18]:
def Decision_tree_regressor_model(x_train,y_train,x_test,y_test):
    tree=DecisionTreeRegressor(criterion= 'squared_error', max_depth=2, max_features='auto', 
                                max_leaf_nodes= 4, min_samples_split = 2,
                                min_weight_fraction_leaf =  0.1,splitter='best')
    tree.fit(x_train,y_train)
    pred =tree.predict(x_test)
    return pred                           
                               
DT_predicted = Decision_tree_regressor_model(X_train,y_train,X_test,y_test)



In [19]:
r2_score_(y_test,DT_predicted)

0.6761912526940972

In [20]:
parameters_random_forest={
    'n_estimators':[50,100],
    'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'max_depth':[2,4,8,10,None],
    'min_samples_split' : [2,4,8,10,20],
    'max_leaf_nodes' : [4, 8, 10,15,20],
    'max_features':['sqrt','log2']
}

grid_search(RandomForestRegressor(),parameters_random_forest,X_train,y_train)

In [None]:
def random_forest_regressor(x_train,y_train,x_test,y_test):
    forest=RandomForestRegressor(n_estimators=50,criterion= 'absolute_error', max_depth=8,
                                 max_features='sqrt', max_leaf_nodes= 20, 
                                 min_samples_split = 4)
    forest.fit(x_train,y_train)
    pred =forest.predict(x_test)
    return pred 

RF_predicted=random_forest_regressor(X_train,y_train,X_test,y_test)

In [None]:
r2_score_(y_test,RF_predicted)

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111,projection="3d")
ax.scatter(data['UNIT'],data['DESIGNATION'],data['SALARY'])
plt.show()

In [None]:
k_range = range(1, 10)   #considering a range of 10 clusters

# Compute the sum of squared distances (SSE) for each value of k
sse = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(x)
    sse.append(kmeans.inertia_)

In [None]:
plt.plot(k_range, sse)
plt.xlabel('Number of clusters')
plt.ylabel('SSE')
plt.title('Elbow chart for KMeans clustering')
plt.show()

In [None]:
models=[]  #initialize empty array to store models
R_squared_scores=[]
def linear_regression_on_clusters(x,y):
    
    for i in range(3):
        cluster_index= x[x['cluster'] == i].index      # getting the indexes of where cluster is i
        X = x.loc[cluster_index][['age', 'exp']]       # X representing age,exp features where data  belongs to cluster of i
        Y = y.loc[cluster_index]                       # Y representing target variable 'salary' where data belongs to cluster of i
        X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=0)  #split data for train and test 
        model = LinearRegression().fit(X_train,Y_train)
        r2=r2_score_(Y_test,model.predict(X_test))      # find R_squared score
        models.append(model)
        R_squared_scores.append(r2)
        

linear_regression_on_clusters(x,y)      #x and y are independent and dependent feature we splitted initially  