In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.calibration import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
# Read the employee-attrition CSV (path is relative to the working directory)
# and show the resulting frame as the cell output.
attrition = pd.read_csv('Employee_Attrition.csv')
attrition

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [None]:
# Print the column / dtype / non-null summary of the working frame `avo`.
# NOTE(review): `avo` is not assigned in any cell visible above (the only frame
# loaded there is `attrition`), so on a fresh kernel this line would raise
# NameError unless a loading cell exists elsewhere — TODO confirm and restore
# the missing load step.
avo.info()

In [None]:
# Remove whichever column currently sits first in the frame, then re-inspect.
# Caution: the column is picked by position, so executing this cell a second
# time removes a further column.
first_column = avo.columns[0]
avo = avo.drop(columns=first_column)
avo.info()

In [None]:
# Convert the `Date` column to nanosecond-resolution datetimes so the `.dt`
# accessor can be used on it later.
avo['Date'] = avo['Date'].astype('datetime64[ns]')
avo.info()

In [None]:
# Tally how many rows fall under each distinct value of the `type` column.
avo['type'].value_counts()

In [None]:
# Encode `type` numerically: conventional -> 0, organic -> 1.
# Any value outside this mapping becomes NaN under `map`, which the int64 cast
# then rejects, so unexpected categories surface as an error here.
type_codes = {'conventional': 0, 'organic': 1}
avo['type'] = avo['type'].map(type_codes).astype('int64')
avo.info()

In [None]:
# Split the timestamp into calendar year and month features, then discard the
# original `Date` column.
dates = avo['Date']
avo = avo.assign(year=dates.dt.year, month=dates.dt.month).drop(columns=['Date'])
avo.info()

## Classification to predict the region

In [None]:
# Work on a copy so the regression section can start again from `avo`.
avo_classification = avo.copy()

# Replace each region label with an integer class id; `le` keeps the fitted
# mapping for later use.
# NOTE(review): this notebook imports LabelEncoder via `sklearn.calibration`;
# its documented location is `sklearn.preprocessing` — consider importing from there.
le = LabelEncoder()
avo_classification['region'] = le.fit_transform(avo_classification['region'])
avo_classification.info()

In [None]:
# Features are every column except the encoded region, which is the target.
X = avo_classification.drop(columns=['region'])
y = avo_classification['region']

# 80/20 shuffled split, stratified on the target so both parts keep the same
# class proportions; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, stratify=y, random_state=42,
)

In [None]:
# Learn the min-max bounds from the training features only, then apply those
# same bounds to both the training and the held-out features.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Random forest classifier: mean 10-fold cross-validated accuracy on the
# training split, compared with accuracy on the held-out split.
rfc = RandomForestClassifier(random_state=42)
fold_accuracies = cross_val_score(rfc, X_train_scaled, y_train,
                                  scoring='accuracy', cv=10, n_jobs=-1)
cv_score = fold_accuracies.mean()

# Refit on the whole training split before scoring the held-out data.
rfc.fit(X_train_scaled, y_train)
test_score = rfc.score(X_test_scaled, y_test)

print(f'Cross val score = {cv_score}',
      f'Test score = {test_score}',
      f'Score diff = {abs(cv_score-test_score)}',
      sep='\n')

In [None]:
# Gradient boosting classifier, evaluated the same way: mean 10-fold
# cross-validated accuracy on the training split versus held-out accuracy.
gbc = GradientBoostingClassifier(random_state=42)
fold_accuracies = cross_val_score(gbc, X_train_scaled, y_train,
                                  scoring='accuracy', cv=10, n_jobs=-1)
cv_score = fold_accuracies.mean()

# Refit on the whole training split before scoring the held-out data.
gbc.fit(X_train_scaled, y_train)
test_score = gbc.score(X_test_scaled, y_test)

print(f'Cross val score = {cv_score}',
      f'Test score = {test_score}',
      f'Score diff = {abs(cv_score-test_score)}',
      sep='\n')

In [None]:
# The random forest is kept as the final classifier. Rank its feature
# importances in ascending order (so the largest bar ends up on top) and draw
# them as a horizontal bar chart.
feature_importance = pd.Series(rfc.feature_importances_,
                               index=X.columns.values).sort_values()
fig, ax = plt.subplots(figsize=(10, 6))
feature_importance.plot.barh(ax=ax);

In [None]:
# Per-class precision / recall / F1 of the random forest on the held-out split.
# Classes appear as the integer ids produced by the label encoder.
y_pred = rfc.predict(X_test_scaled)
report = classification_report(y_test, y_pred)
print(report)

## Regression to predict the price

In [None]:
# Start the regression work from an independent copy of the prepared frame.
avo_reg = avo.copy(deep=True)

In [None]:
# One-hot encode the region as int64 indicator columns; the first category is
# dropped so it acts as the baseline.
region_dummies = pd.get_dummies(avo.region, drop_first=True).astype('int64')

# Swap the raw `region` column for the indicator columns, aligning rows on the
# shared index.
non_region_columns = avo_reg.drop(columns=['region'])
avo_reg = pd.concat([non_region_columns, region_dummies], axis=1, join='inner')
avo_reg

In [None]:
# Regression target is AveragePrice; every other column is a feature.
X = avo_reg.drop(columns=['AveragePrice'])
y = avo_reg['AveragePrice']

# 80/20 shuffled split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42,
)

# Min-max bounds are learned from the training features and reused unchanged
# on the held-out features.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Random forest regressor: mean 10-fold cross-validated MAE on the training
# split, compared with MAE on the held-out split.
rfr = RandomForestRegressor(random_state=42)
fold_scores = cross_val_score(rfr, X_train_scaled, y_train,
                              scoring='neg_mean_absolute_error', cv=10, n_jobs=-1)
# The scorer reports negated MAE, so flip the sign to get a positive error.
cv_score = -fold_scores.mean()

# Refit on the whole training split before scoring the held-out data.
rfr.fit(X_train_scaled, y_train)
test_score = mean_absolute_error(y_test, rfr.predict(X_test_scaled))

print(f'Cross val score = {cv_score}',
      f'Test score = {test_score}',
      f'Score diff = {abs(cv_score-test_score)}',
      sep='\n')

In [None]:
# Gradient boosting regressor, evaluated the same way: mean 10-fold
# cross-validated MAE on the training split versus held-out MAE.
gbr = GradientBoostingRegressor(random_state=42)
fold_scores = cross_val_score(gbr, X_train_scaled, y_train,
                              scoring='neg_mean_absolute_error', cv=10, n_jobs=-1)
# The scorer reports negated MAE, so flip the sign to get a positive error.
cv_score = -fold_scores.mean()

# Refit on the whole training split before scoring the held-out data.
gbr.fit(X_train_scaled, y_train)
test_score = mean_absolute_error(y_test, gbr.predict(X_test_scaled))

print(f'Cross val score = {cv_score}',
      f'Test score = {test_score}',
      f'Score diff = {abs(cv_score-test_score)}',
      sep='\n')

In [None]:
# The random forest is kept as the final regressor. Rank its feature
# importances from largest to smallest and draw them as a bar chart.
feature_importance = pd.Series(rfr.feature_importances_,
                               index=X.columns.values).sort_values(ascending=False)
fig, ax = plt.subplots(figsize=(15, 6))
feature_importance.plot.bar(ax=ax);

# Held-out predictions of the fitted random forest.
y_pred = rfr.predict(X_test_scaled)

In [None]:
# Keep the leading features whose running total of importance stays within
# 0.95. This relies on `feature_importance` already being sorted from largest
# to smallest, as done where it is built.
cumulative_importance = feature_importance.cumsum()  # computed once and reused
within_budget = (cumulative_importance <= 0.95).to_numpy(copy=True)

if len(within_budget) > 0 and not within_budget.any():
    # A single feature carrying more than 0.95 of the importance would leave
    # the selection empty, and the later `X[selected_features]` would then
    # produce a frame with no columns. Fall back to that top-ranked feature.
    within_budget[0] = True

selected_features = feature_importance.index[within_budget].values
selected_features

In [None]:
# Restrict the feature matrix to the selected columns (as a fresh frame).
X = X.loc[:, selected_features].copy()

# Same 80/20 shuffled split and seed as before, now on the reduced features.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42,
)

# Re-learn the min-max bounds on the reduced training features and reuse them
# on the held-out features.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Random forest regressor refit on the reduced feature set: mean 10-fold
# cross-validated MAE on the training split versus held-out MAE.
rfr = RandomForestRegressor(random_state=42)
fold_scores = cross_val_score(rfr, X_train_scaled, y_train,
                              scoring='neg_mean_absolute_error', cv=10, n_jobs=-1)
# The scorer reports negated MAE, so flip the sign to get a positive error.
cv_score = -fold_scores.mean()

# Refit on the whole training split before scoring the held-out data.
rfr.fit(X_train_scaled, y_train)
test_score = mean_absolute_error(y_test, rfr.predict(X_test_scaled))

print(f'Cross val score = {cv_score}',
      f'Test score = {test_score}',
      f'Score diff = {abs(cv_score-test_score)}',
      sep='\n')

In [None]:
# Importances of the refit random forest over the reduced feature set, ranked
# from largest to smallest and drawn as a bar chart.
feature_importance = pd.Series(rfr.feature_importances_,
                               index=X.columns.values).sort_values(ascending=False)
fig, ax = plt.subplots(figsize=(12, 6))
feature_importance.plot.bar(ax=ax);

# Held-out predictions of the refit random forest.
y_pred = rfr.predict(X_test_scaled)