In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import shap
import pandas_profiling as pp
%matplotlib inline


In [None]:
# Read the train and test CSVs of the HR-analytics dataset and preview the
# training frame as the cell output.
INPUT_DIR = '../input/hr-analytics-job-change-of-data-scientists/'
data = pd.read_csv(INPUT_DIR + 'aug_train.csv')
dtest = pd.read_csv(INPUT_DIR + 'aug_test.csv')
data.head()

In [None]:
# Preview the first rows of the training frame (same view as the previous cell's output).
data.head()

In [None]:
# Missing-value summary for the training data: for every column with at least
# one null, record the null count and the null share (rounded to 2 decimals).
num = data.shape[0]
null_counts = {col: data[col].isnull().sum() for col in data.columns.tolist()}
num_null = {col: cnt for col, cnt in null_counts.items() if cnt > 0}
percent_null = {col: round(cnt / num, 2) for col, cnt in num_null.items()}
nul_list = list(num_null)  # columns with missing values, in frame order

# Two-column table: absolute count and share of nulls per affected column.
num_ = pd.Series(num_null)
per_ = pd.Series(percent_null)
con = pd.concat([num_, per_], axis=1)
con = con.rename(columns={0: "null_numm", 1: "null_percent"})

cm = sns.light_palette("#348498", as_cmap=True)
print("{} columns have missing values.".format(data.isnull().any().sum()))
print("--------------------------------------")
# Shade both columns so the largest values stand out.
con.style.background_gradient(cmap=cm, subset=con.columns.tolist())

####   1. The company type column has the most missing values: 32% of its entries are missing.
####   2. The company size column has the second most missing values: 31% of its entries are missing.

In [None]:
#test data  
# Missing-value summary for the test data: for every column with at least one
# null, record the null count and the null share (rounded to 2 decimals).
nu_null = {}
percen_null = {}
nu = dtest.shape[0]
test_nulist = []  # test columns that contain missing values, in frame order
for col in dtest.columns.tolist():
    col_nul = dtest[col].isnull().sum()
    if col_nul > 0:
        nu_null[col] = col_nul
        percen_null[col] = round(col_nul/nu,2)
        test_nulist.append(col)

nu_ = pd.Series(nu_null)
pe_ = pd.Series(percen_null)
co = pd.concat([nu_,pe_],axis=1).rename(columns={0:"nul_numm",1:"nul_percent"})

# Use the colormap built in this cell. Previously `c` was created but the
# styling call used `cm`, a variable that only exists if the training-summary
# cell was run first.
c = sns.light_palette("#348498", as_cmap=True)
print("{} columns have missing values.".format(dtest.isnull().any().sum()))
print("--------------------------------------")
co.style.background_gradient(cmap=c,subset=co.columns.tolist())

In [None]:
#data_train     null columns ->unique
# For each training column that has missing values, show its distinct values.
for column in nul_list:
    print(f"{column}\t-->unique:", data[column].unique())

In [None]:
#data_test null columns ->unique
# For each test column that has missing values, show its distinct values.
for column in test_nulist:
    print(f"{column}\t-->unique:", dtest[column].unique())

In [None]:
# Count plots of the categorical features, split by target, on a 3x3 grid.
# The first object column is skipped (presumably `city` - TODO confirm).
cat_list = data.select_dtypes(include='object').columns.tolist()[1:]

grid_style = {'axes.edgecolor': '#6b778d','ytick.color': "#6b778d",'text.color': '#6b778d','axes.labelcolor': '#6b778d','xtick.color': '#6b778d'}
grid_context = {'ytick.labelsize':10,'legend.fontsize': 10,'axes.labelsize': 12}

with sns.axes_style(grid_style), sns.plotting_context(grid_context):
    # Create the figure INSIDE the style/context blocks: the rc settings they
    # install are read when axes are created, so building the grid beforehand
    # (as the cell used to) leaves the axes styling without effect.
    fig, axes = plt.subplots(3,3,figsize=(24,24))
    # `axes` holds the grid and `ax` a single panel; the loop no longer
    # rebinds the name of the array it is iterating over.
    for var, ax in zip(cat_list, axes.flat):
        sns.countplot(data=data,x=var,hue="target",palette=sns.color_palette("pastel"),edgecolor=sns.color_palette("dark", 3),ax=ax)
        ax.set_title(var+"_target",color="#17223b")
    # One call removes the spines on every panel of this figure.
    sns.despine(fig=fig, bottom=True)

#### 1. According to the first picture, we can see that the proportion of men who are not looking for a job change far exceeds the proportion of men who are looking for a job change.
#### 2. In the second picture, we can see that most candidates with relevant experience do not look for job changes in a large proportion.
#### 3. In the enrolled-course plot, most candidates are not enrolled in any course, and most of them are not looking for a job change.
#### 4. Most of these groups have a high degree of education.
#### 5. The candidate's major is basically STEM. This shows that many people are not changing industries.
#### 6. In the group that does not change their jobs, many people have more than 20 years of work experience. This can actually explain in disguise that the longer you work, the more you hope you can stabilize. In contrast, those with less work experience will have a significantly higher rate of changing jobs.
#### 7. Among the groups that do not plan to change jobs, the number of their employer companies is basically between 50-500.
#### 8. Among the candidates who are not changing jobs, the employer type is mostly Pvt Ltd.
#### 9. The proportion of unchanged jobs exceeds the proportion of changed jobs, and employees who have just joined the company for about a year are less willing to change jobs.

In [None]:
# Histogram (with KDE) of training hours, split by target.
hist_style = {'axes.edgecolor': '#6b778d','ytick.color': "#6b778d",'text.color': '#6b778d','axes.labelcolor': '#6b778d','xtick.color': '#6b778d'}
hist_context = {'ytick.labelsize':10,'legend.fontsize': 10,'axes.labelsize': 12}

# Both seaborn settings are active while the figure is created and drawn.
with sns.axes_style(hist_style), sns.plotting_context(hist_context):
    plt.figure(figsize=(10, 6))
    sns.histplot(data=data, x="training_hours", hue="target", kde=True, color="#30A2DA")
    plt.title("train_hours", color="#17223b")
    sns.despine(bottom=True)

#### Obviously, the training time variable is not normally distributed. It has a right skew. 

#### Finally, let us look at the data situation of the target variable

In [None]:
# Bar chart of the target class counts.
plt.figure(figsize=(10, 6))
bar_fill = sns.color_palette("pastel")
bar_edges = sns.color_palette("dark", 3)
# `fig` holds the object returned by sns.countplot.
fig = sns.countplot(data=data, x="target", palette=bar_fill, edgecolor=bar_edges)
sns.despine(bottom=True)

#### We can see that the number of people who do not plan to change jobs is the largest, and the data is indeed unbalanced.

# <br>Data preprocessing
 1. Fill or delete rows with missing values
 2. Categorical feature and numerical feature processing
 3. Oversample the data
 4. To avoid data leakage, the data set should be split first and the numerical features normalized afterwards. (I am not sure whether this is correct.)

### 1. Fill or delete rows with missing values -- train and test data

In [None]:
# missing columns  --['gender','enrolled_university','education_level','major_discipline','experience','company_size','company_type','last_new_job']
#drop gender(other)  fill null 
# data["gender"] = data["gender"].fillna("Male")
# dtest["gender"] = dtest["gender"].fillna("Male")
#drop enrollde_university
# data = data.drop(data.loc[data["enrolled_university"] == "NaN"].index)
# dtest = dtest.drop(dtest.loc[dtest["enrolled_university"] == "NaN"].index)
#fill education_level
#education_level	-->unique: ['Graduate' 'High School' 'Masters' nan 'Phd' 'Primary School']
# data["education_level"] = data["education_level"].fillna("Graduate")
# dtest["education_level"] = dtest["education_level"].fillna("Graduate")
#major_discipline	-->unique: ['STEM' nan 'Other' 'Business Degree' 'Arts' 'Humanities' 'No Major']
# data["major_discipline"] = data["major_discipline"].fillna("Other")
# dtest["major_discipline"] = dtest["major_discipline"].fillna("Other")
#experience	-->unique: ['9' '5' '<1' '11' '>20' '10' '14' '3' '20' '8' '4' '13' '2' '6' '7' '1''19' '15' '16' nan '17' '18' '12'
# data["experience"] = data["experience"].fillna(0)
# dtest["experience"] = dtest["experience"].fillna(0)
#company_size	-->unique: ['<10' nan '10/49' '10000+' '100-500' '50-99' '1000-4999' '500-999''5000-9999']
#company_type	-->unique: [nan 'Pvt Ltd' 'Funded Startup' 'Other' 'Public Sector''Early Stage Startup' 'NGO']
#last_new_job	-->unique: ['1' 'never' '>4' '2' '4' '3' nan]
# Keep only rows whose gender is not "Other", then discard every row that still
# has a missing value - the same filter is applied to the train and test frames.
# NOTE(review): dropping rows from `dtest` means those enrollees are absent from
# the submission frame built from `dtest` at the end of the notebook.
data = data[data["gender"] != "Other"].dropna()
dtest = dtest[dtest["gender"] != "Other"].dropna()

In [None]:
# Remove the city column from both frames. The training frame also loses
# enrollee_id, while the test frame keeps it for the submission.
data = data.drop(["city", "enrollee_id"], axis=1)
dtest = dtest.drop("city", axis=1)

### 2.Categorical feature and numerical feature processing

In [None]:
def cate_one(df,*strname):
    """One-hot encode the named columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    *strname : str
        Names of the columns to replace with dummy/indicator columns.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining columns of ``df`` followed by the
        dummy columns produced by ``pd.get_dummies``.
    """
    columns = list(strname)
    dummies = pd.get_dummies(df[columns])
    remaining = df.drop(columns=columns)
    return pd.concat([remaining, dummies], axis=1)

# One-hot encode the gender column in both the training and the test frame.
data, dtest = (cate_one(frame, "gender") for frame in (data, dtest))

data.head()

In [None]:
# Encode the remaining categorical columns as numbers, identically for the
# training frame (`data`) and the test frame (`dtest`).

# --- Ordered categories: the integer code follows the natural order. ---
relev = {"No relevent experience":0,"Has relevent experience":1}
enrol = {"no_enrollment":0,"Part time course":1,"Full time course":2}

#education_level	-->unique: ['Graduate' 'Masters' 'High School' nan 'Phd' 'Primary School']
edu = {"Primary School":0,"High School":1,"Graduate":2,"Masters":3,"Phd":4}

#experience	-->unique: ['>20' '15' '5' '<1' '11' '13' '7' '17' '2' '16' '1' '4' '10' '14' '18''19' '12' '3' '6' '9' '8' '20' ]
# Only the two open-ended labels need replacing; the rest are numeric strings.
expre = {">20":21,"<1":0}

#company_size	-->unique: ['<10' nan '10/49' '10000+' '100-500' '50-99' '1000-4999' '500-999''5000-9999']
comsize = {"<10":0,"10/49":1,"50-99":2,"100-500":3,"500-999":4,"1000-4999":5,"5000-9999":6,"10000+":7}

#last_new_job	-->unique: ['1' '>4' 'never' '4' '3' '2' nan]
las = {"never":0,"1":1,"2":2,"3":3,"4":4,">4":5}

# --- Unordered categories: one distinct integer label per category. ---
# These used to be encoded as len(label), which gave 'STEM' and 'Arts' the same
# code (4) and made every code depend on how the label happens to be spelled.
#major_discipline	-->unique: ['STEM' nan 'Other' 'Business Degree' 'Arts' 'Humanities' 'No Major']
major = {"STEM":0,"Business Degree":1,"Arts":2,"Humanities":3,"No Major":4,"Other":5}
#company_type	-->unique: [nan 'Pvt Ltd' 'Funded Startup' 'Other' 'Public Sector','Early Stage Startup' 'NGO']
comtype = {"Pvt Ltd":0,"Funded Startup":1,"Early Stage Startup":2,"Public Sector":3,"NGO":4,"Other":5}

# Column -> lookup table for every column encoded through a plain mapping.
column_maps = {
    "relevent_experience": relev,
    "enrolled_university": enrol,
    "education_level": edu,
    "company_size": comsize,
    "last_new_job": las,
    "major_discipline": major,
    "company_type": comtype,
}

for frame in (data, dtest):
    for column, mapping in column_maps.items():
        frame[column] = frame[column].map(mapping)
    # Replace the open-ended labels, then convert the whole column to float.
    frame["experience"] = frame["experience"].replace(expre).astype(float)

data.head()

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of `data`.
corr_matrix = data.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, color="b")

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier,RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import recall_score,precision_score,ConfusionMatrixDisplay,roc_auc_score,confusion_matrix

In [None]:
# Separate the label from the features, then hold out 25% of the rows.
Y = data["target"]
X = data.drop("target", axis=1)
xtrain, xtest, ytrain, ytest = train_test_split(
    X, Y, test_size=0.25, random_state=123
)

In [None]:
#step 1
xtrain["experience"] = (xtrain["experience"] - xtrain["experience"].mean())/xtrain["experience"].std()
xtest["experience"] = (xtest["experience"] - xtest["experience"].mean())/xtest["experience"].std()
dtest["experience"] = (dtest["experience"] - dtest["experience"].mean())/dtest["experience"].std()
#step 2
xtrain["training_hours"] = (xtrain["training_hours"] - xtrain["training_hours"].mean())/xtrain["training_hours"].std()
xtest["training_hours"] = (xtest["training_hours"] - xtest["training_hours"].mean())/xtest["training_hours"].std()
dtest["training_hours"] = (dtest["training_hours"] - dtest["training_hours"].mean())/dtest["training_hours"].std()

xtrain

In [None]:
# Rebuild the feature matrix and the label vector from the encoded frame.
y = data['target']
X = data.drop(columns='target')

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample every class except the majority one with SMOTE.
# A fixed random_state makes the synthetic samples, and therefore every score
# computed from them, reproducible across runs.
smote = SMOTE(sampling_strategy='not majority', random_state=42)
X_sm, y_sm = smote.fit_resample(X, y)

In [None]:
# Hold out the evaluation split from the ORIGINAL data first, then oversample
# only the training part. Splitting the already-oversampled data, as before,
# puts synthetic points interpolated from hold-out rows into the training set
# and synthetic rows into the hold-out set, which inflates every score below.
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.15, random_state=42)
xtrain, ytrain = smote.fit_resample(xtrain, ytrain)

#### LogisticRegressionClassifier

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Baseline: logistic regression on the current train/hold-out split.
lr = LogisticRegression(solver='liblinear', random_state=0)
lr.fit(xtrain, ytrain)
lrpre = lr.predict(xtest)
lrcm = confusion_matrix(ytest, lrpre)
print('recall_score:', recall_score(ytest, lrpre))
print('precision_score:', precision_score(ytest, lrpre))
print('roc_auc_score:', roc_auc_score(ytest, lrpre))
lrdis = ConfusionMatrixDisplay(lrcm)
lrdis.plot()
plt.show()
# LogisticRegression has no `feature_importances_` attribute (accessing it
# raises AttributeError); the fitted per-feature weights are in `coef_`.
lr.coef_

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, plot_confusion_matrix,classification_report

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training features only, then apply that same
# transformation to both splits.
scaler = StandardScaler().fit(xtrain)
scaled_xtrain = scaler.transform(xtrain)
scaled_xtest = scaler.transform(xtest)

In [None]:
# Logistic regression with default settings, trained on the standardised
# training features; the fitted estimator is shown as the cell output.
model_lr = LogisticRegression()
model_lr.fit(scaled_xtrain, ytrain)

In [None]:
# Predict labels for the standardised hold-out features.
pred_lr = model_lr.predict(scaled_xtest)

In [None]:
# Accuracy of the scaled logistic regression on the hold-out labels.
accuracy_score(ytest,pred_lr)

In [None]:
# Print the classification report for the scaled logistic regression predictions.
print(classification_report(ytest,pred_lr))

#### 1. DecisionTreeClassifier

In [None]:
# Decision tree: fit, score on the hold-out split, show the confusion matrix.
de = DecisionTreeClassifier(random_state=123).fit(xtrain, ytrain)
depre = de.predict(xtest)

for label, metric in (
    ('recall_score:', recall_score),
    ('precision_score:', precision_score),
    ('roc_auc_score:', roc_auc_score),
):
    print(label, metric(ytest, depre))

decm = confusion_matrix(ytest, depre)
dis = ConfusionMatrixDisplay(decm)
dis.plot()
plt.show()
# Cell output: importance of each feature in the fitted tree.
de.feature_importances_

#### 2.RandomForestClassifier

In [None]:
# Random forest: fit, score on the hold-out split, show the confusion matrix.
rf = RandomForestClassifier(random_state=123)
rf.fit(xtrain, ytrain)
rfpre = rf.predict(xtest)

rf_metrics = {
    'recall_score:': recall_score,
    'precision_score:': precision_score,
    'roc_auc_score:': roc_auc_score,
}
for label, metric in rf_metrics.items():
    print(label, metric(ytest, rfpre))

rfcm = confusion_matrix(ytest, rfpre)
rfdis = ConfusionMatrixDisplay(rfcm)
rfdis.plot()
plt.show()
# Cell output: importance of each feature in the fitted forest.
rf.feature_importances_

In [None]:
# Per-class precision/recall/F1 report for a random forest.
# Seeded like the other models so the report is reproducible across runs;
# an unseeded forest gives slightly different numbers every time.
model = RandomForestClassifier(random_state=123)
model.fit(xtrain, ytrain)
print(classification_report(ytest, model.predict(xtest)))

#### 3.GradientBoostingClassifier

In [None]:
# Gradient boosting: fit, score on the hold-out split, show the confusion matrix.
gb = GradientBoostingClassifier(random_state=123).fit(xtrain, ytrain)
gbpre = gb.predict(xtest)

gb_recall = recall_score(ytest, gbpre)
gb_precision = precision_score(ytest, gbpre)
gb_auc = roc_auc_score(ytest, gbpre)
print('recall_score:', gb_recall)
print('precision_score:', gb_precision)
print('roc_auc_score:', gb_auc)

gpcm = confusion_matrix(ytest, gbpre)
gpdis = ConfusionMatrixDisplay(gpcm)
gpdis.plot()
plt.show()
# Cell output: importance of each feature in the fitted ensemble.
gb.feature_importances_

#### 4.AdaBoostClassifier

In [None]:
# AdaBoost: fit, score on the hold-out split, show the confusion matrix.
ada = AdaBoostClassifier(random_state=123)
ada.fit(xtrain, ytrain)
adpre = ada.predict(xtest)

for label, metric in zip(
    ('recall_score:', 'precision_score:', 'roc_auc_score:'),
    (recall_score, precision_score, roc_auc_score),
):
    print(label, metric(ytest, adpre))

adcm = confusion_matrix(ytest, adpre)
addis = ConfusionMatrixDisplay(confusion_matrix=adcm)
addis.plot()
plt.show()
# Cell output: importance of each feature in the fitted ensemble.
ada.feature_importances_

#### 5.MLPClassifier

In [None]:
# Multi-layer perceptron (three hidden layers): fit, score on the hold-out
# split, show the confusion matrix.
ml = MLPClassifier(hidden_layer_sizes=(100,50,20),random_state=123)
ml.fit(xtrain,ytrain)
# Predict with the MLP itself. The cell previously called `ada.predict`, so
# every number below described the AdaBoost model instead.
mlpre = ml.predict(xtest)
mlcm = confusion_matrix(ytest,mlpre)
print('recall_score:',recall_score(ytest,mlpre))
print('precision_score:',precision_score(ytest,mlpre))
print('roc_auc_score:',roc_auc_score(ytest,mlpre))
mldis = ConfusionMatrixDisplay(mlcm)
mldis.plot()
plt.show()

#### 6.XGBClassifier

In [None]:
# NOTE(review): this only binds a module-level variable named `use_label_encoder`.
# It is not passed to the XGBClassifier constructed in the next cell, so it does
# not configure the model.
use_label_encoder=False

In [None]:
# XGBoost: fit, score on the hold-out split, show the confusion matrix.
xg = XGBClassifier(random_state=123)
xg.fit(xtrain,ytrain)
# Predict with the XGBoost model itself. The cell previously called
# `ada.predict`, so the metrics below were AdaBoost's, not XGBoost's.
xgpre = xg.predict(xtest)
xgcm = confusion_matrix(ytest,xgpre)
print('recall_score:',recall_score(ytest,xgpre))
print('precision_score:',precision_score(ytest,xgpre))
print('roc_auc_score:',roc_auc_score(ytest,xgpre))
xgdis = ConfusionMatrixDisplay(xgcm)
xgdis.plot()
plt.show()

In [None]:
# Score the external test set with the XGBoost model and pair each prediction
# with its enrollee id.
test_features = dtest.drop("enrollee_id", axis=1)
testpre = xg.predict(test_features)
sub = pd.DataFrame({"enrollee_id": dtest["enrollee_id"], "target": testpre})
sub.head()
