# JOB CHANGE OF DATA SCIENTISTS

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Pipeline and column transformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer

# Data transformers
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Data splitter and model evaluator
from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve

# Learning models (use one of them or any other model)
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Ensemble learning models
from sklearn.ensemble import BaggingClassifier, BaggingRegressor, GradientBoostingClassifier, GradientBoostingRegressor

# Performance metrics
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import ConfusionMatrixDisplay

# Imported last on purpose: if xgboost is not installed, the resulting
# ModuleNotFoundError must not stop the imports above from running.
from xgboost import XGBClassifier, XGBRegressor

ModuleNotFoundError: No module named 'xgboost'

In [None]:
import os
os.getcwd()

In [None]:
df_train = pd.read_csv('aug_train.csv')

In [None]:
df_train

In [None]:
df_test = pd.read_csv('aug_test.csv')

In [None]:
df_test

In [None]:
#lets print full summary of dataframe
df_train.info()
# we see dataframe has 19,157 rows, 14 columns, data type in each column, and number of non-null values in each column

In [None]:
# we have separate training and test data sets. lets combine train and test together to do common feature engineering
train_replica = df_train.copy()
test_replica = df_test.copy()

In [None]:
# set up a flag field to distinguish records from training and testing sets in the combined dataset
train_replica['tst'] = 0
test_replica['tst'] = 1

In [None]:
# combine training and testing data into a single dataframe to do uniform part of feature engineering
combined_data = pd.concat([train_replica, test_replica], axis=0, sort=True)
del train_replica
del test_replica

In [None]:
combined_data

In [None]:
from numpy import nan
from numpy import isnan
# Fit a most-frequent imputer on the raw value matrix of the combined frame
# and keep the imputed matrix in `transformed_values`.
# NOTE(review): `transformed_values` is not referenced by any later visible
# cell and `isnan` is imported but unused here -- confirm this cell is needed.
values=combined_data.values
imputer= SimpleImputer(missing_values=nan, strategy='most_frequent')
transformed_values=imputer.fit_transform(values)

# Feature Categorization

In [None]:
#Nominal: gender, enrolled_university, major_discipline, company_type,   
#Ordinal: company_size, education_level, last_new_job, relevent_experience
#Numerical: experience, training_hours

# Lets begin by handling NaN missing values in ALL columns

In [None]:
mode_values=combined_data [['gender', 'enrolled_university', 'major_discipline', 'company_type']].mode()
print(mode_values)

In [None]:
# Replace the NaN entries of every listed column with that column's most
# frequent value. The imputer is re-fitted for each column in turn.
# NOTE(review): 'target' is imputed as well, so any row without a label
# receives the modal class -- confirm this is intended for test-file rows.
columns_to_impute = [
    'gender', 'relevent_experience', 'education_level', 'major_discipline',
    'experience', 'company_size', 'company_type', 'last_new_job',
    'training_hours', 'enrolled_university', 'target',
]
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
for column in columns_to_impute:
    # SimpleImputer expects a 2-D input, hence the single-column reshape.
    column_2d = combined_data[column].values.reshape(-1, 1)
    combined_data[column] = imputer.fit_transform(column_2d)[:, 0]

In [None]:
combined_data.isnull().sum()

# Checking list of all values in columns so we can code them 

In [None]:
combined_data.gender.unique()

In [None]:
combined_data.relevent_experience.unique()

In [None]:
combined_data.enrolled_university.unique()

In [None]:
combined_data.major_discipline.unique()

In [None]:
combined_data.company_type.unique()

In [None]:
combined_data.last_new_job.unique()

# One Hot Encoding of Nominal Variables (company_type, enrolled_university, gender, major_discipline)

In [None]:
# Integer-encode each nominal column in place with its own LabelEncoder.
# Every fitted encoder is stored in `label_encoders`, keyed by column name,
# so the integer codes can be mapped back to the original labels later.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

nom_columns = ["gender", "enrolled_university", "company_type", "major_discipline"]
label_encoders = {}
for col in nom_columns:
    print(f"Encoding {col}")
    label_encoders[col] = LabelEncoder()
    combined_data[col] = label_encoders[col].fit_transform(combined_data[col])

In [None]:
# One-hot encode the four label-encoded nominal columns.
# NOTE(review): the dense array produced here is only shown as the cell
# output; it is not assigned to a variable or merged back into combined_data.
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit_transform( combined_data[['gender','enrolled_university','company_type','major_discipline']] ).toarray()

In [None]:
combined_data.drop(['city', 'city_development_index', 'enrollee_id'], axis=1, inplace=True)

# Mapping Ordinal Features 
# (company_size, education_level, last_new_job, relevent_experience)

In [None]:
# Ordinal features are company_size, education_level, last_new_job, relevent_experience
# Lets find unique values in each of these columns so we can map them.
# Ordinal features are mapped to integer ranks; one-hot encoding is not applied to ordinal features.

In [None]:
combined_data.relevent_experience.unique()

In [None]:
combined_data.education_level.unique()

In [None]:
combined_data.company_size.unique()

In [None]:
combined_data.last_new_job.unique()

In [None]:
# Map each ordinal feature to integer ranks, stored in a new
# "<feature>_ordinal" column. Series.map leaves NaN for any value that is
# not a key of the corresponding dictionary.
relevent_experience_map = {'Has relevent experience': 1, 'No relevent experience': 2}
combined_data['relevent_experience_ordinal'] = combined_data.relevent_experience.map(relevent_experience_map)

# Ranks follow the education ladder: Graduate (bachelor level) sits below Masters.
education_level_map = {'Primary School': 1, 'High School': 2, 'Graduate': 3, 'Masters': 4, 'Phd': 5}
combined_data['education_level_ordinal'] = combined_data.education_level.map(education_level_map)

# One distinct rank per size bracket, ordered from smallest to largest, so
# that '<10' and '10/49' are no longer collapsed onto the same rank.
company_size_map = {'<10': 1, '10/49': 2, '50-99': 3, '100-500': 4,
                    '500-999': 5, '1000-4999': 6, '5000-9999': 7, '10000+': 8}
combined_data['company_size_ordinal'] = combined_data.company_size.map(company_size_map)

last_new_job_map = {'never': 0, '1': 1, '2': 2, '3': 3, '4': 4, '>4': 5}
combined_data['last_new_job_ordinal'] = combined_data.last_new_job.map(last_new_job_map)



# Cleaning columns with continuous data

In [None]:
# Turn 'experience' into a genuinely numeric column.
# The open-ended buckets get their own codes ('<1' -> 0, '>20' -> 21) so they
# stay distinct from the exact values 1 and 20 and keep their ordering.
# pd.to_numeric then converts the strings to numbers; without it the column
# stays object-typed and a mean-based imputer cannot process it.
combined_data['experience'] = pd.to_numeric(
    combined_data['experience'].replace({'>20': '21', '<1': '0'})
)
combined_data

# Defining predictor and target variables. Splitting data into test and training

In [None]:
# Define Predictors and Target Variable
nom_col = ['gender', 'enrolled_university', 'major_discipline', 'company_type']
ord_col = ['company_size_ordinal', 'education_level_ordinal', 'last_new_job_ordinal', 'relevent_experience_ordinal']
num_col = ['experience', 'training_hours']

# Model only on rows that came from the training file (tst == 0).
# The 'target' column was run through the most-frequent imputer on the
# combined frame, so a label on a test-file row (tst == 1) may be a filled-in
# value rather than an observed one -- presumably aug_test.csv carries no
# labels; confirm. Such rows must not be used for fitting or evaluation.
labelled_data = combined_data[combined_data['tst'] == 0]

X = labelled_data[nom_col + ord_col + num_col]
y = labelled_data['target']

# Splitting Data into Training Set and Test Set (stratified on the target so
# both splits keep the same class proportions).
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    stratify=y,
                                                    random_state=4
                                                   )

# Checking for balanced or imbalanced data (since its a classification prob)

In [None]:
print(combined_data['target'].value_counts())

Our dataset is imbalanced. For an imbalanced dataset the appropriate performance metrics are the Confusion Matrix,
Precision, Recall, and F1 Score. We use Accuracy only when the dataset is roughly balanced (a 50/50 or 60/40 class split).

In [None]:
Recall = TP/(TP+FN)      
Out of all actual positive cases, how many did we correctly predict as positive (True Positive Rate, or Sensitivity).
Use Recall when you want to reduce FN (e.g. cancer or not).

Precision = TP/ (TP+FP)  
Out of all predicted positive results, how many were actually positive (Positive Predictive Value).  
Use Precision when FP matters and you want to reduce it (e.g. spam or not).

If FP and FN are both important, use the F1 SCORE, which is the harmonic mean of Precision and Recall: F1 = 2 * Precision * Recall / (Precision + Recall).

# Standardizing data

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
sc = StandardScaler().fit(X_train)
X_train_std, X_test_std = sc.transform(X_train), sc.transform(X_test)

# Making pipeline branches. Creating the main pipe.

In [None]:
# In the following block we experiment with different models. 
# The one with closely matching training accuracy and test accuracy will be used for hyperparameter tuning.
# If training accuracy is much higher than validation/test accuracy, the model suffers from HIGH VARIANCE. 
# HIGH VARIANCE = OVERFITTING = model fits training data almost perfectly but does not do a good job on unseen data.
# If both training accuracy and test accuracy are low, the model suffers from HIGH BIAS.
# HIGH BIAS = UNDERFITTING = decision boundary is too simple and thus fails to capture important relations.

In [None]:
# Preprocessing branches, one per feature type:
#   nominal   -> most-frequent imputation, then one-hot encoding
#   ordinal   -> median imputation, then standardisation
#   numerical -> mean imputation, then min-max scaling
nom_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))
ord_pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
num_pipe = make_pipeline(SimpleImputer(strategy='mean'), MinMaxScaler())

# The column transformer routes each group of columns into its own branch.
preprocessor = ColumnTransformer([('nom', nom_pipe, nom_col),
                                  ('ord', ord_pipe, ord_col),
                                  ('num', num_pipe, num_col)])

# Final estimator. Alternatives tried during experimentation:
#   optional reduction steps placed before the estimator:
#     PCA(n_components=3)
#     LDA(n_components=8)
#   estimators:
#     SVC(kernel='rbf', C=1000, gamma=1)
#     LogisticRegression(solver='lbfgs', C=0.01)
#     RandomForestClassifier(criterion='gini', n_estimators=20, random_state=1)
#     KNeighborsClassifier(n_neighbors=10, p=2)
classifier = DecisionTreeClassifier(criterion='gini', max_depth=3)

# make_pipeline names the steps after their lower-cased class names.
pipe = make_pipeline(preprocessor, classifier)
pipe.fit(X_train, y_train)

print('Training score:', pipe.score(X_train,y_train))
print("Test accuracy: ", pipe.score(X_test, y_test))

# Evaluating Model Performance : Confusion Matrix & ROC AUC

In [None]:
def get_perfomance_details(y_pred,y_test):
    """Print a classification report and plot the confusion matrix.

    Parameters
    ----------
    y_pred : predicted class labels.
    y_test : true class labels, in the same order as ``y_pred``.

    Returns
    -------
    None. The report is printed and the heatmap is shown with ``plt.show()``.
    """
    # Classification report
    print ("Classification Report:\n")
    print (classification_report(y_true=y_test,y_pred=y_pred))

    # Draw confusion matrix (kept inside the function so it uses the
    # function's own y_pred / y_test arguments)
    sns.reset_defaults()
    plt.figure(figsize=(7,4))
    sns.heatmap(confusion_matrix(y_test,y_pred),annot=True,cmap="GnBu",fmt="g",cbar=False)
    plt.title("Confusion Matrix")
    plt.show()
    
   

In [None]:
 #Find the Area under the curve
# ROC analysis of the fitted pipeline on the held-out test split.
# Scores are the predicted probabilities of the second class (column 1 of
# predict_proba), which gives a full ROC curve instead of a single point.
y_score = pipe.predict_proba(X_test)[:, 1]

print ("\n----------------\nAOC ROC details\n----------------\n")
# roc_auc_score expects (y_true, y_score) in that order.
rocauc_score=roc_auc_score(y_test,y_score)

#ROC curve
fpr,tpr,_=roc_curve(y_test,y_score)
roc_aoc=auc(fpr,tpr)
print (f"AUC score: {rocauc_score}\nTrue positive rate: {tpr}\nFalse postive rate: {fpr}")


#Draw the ROC curve
plt.figure(figsize=(4,4))
lw=2
plt.plot(fpr,tpr,
         color='green',
         lw=lw,
         label='ROC curve (area=%0.4f)' % roc_aoc)

#plot diagonal line from (0,0) to (1,1), where fpr = tpr
plt.plot([0,1],[0,1],color='lightgrey',lw=lw,linestyle='--')
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.0])
plt.xlabel("False Positive Rate")
plt.ylabel("True Postive Rate")
# The curve is computed from X_test / y_test, so the title says so.
plt.title("Reciever operating characteristic for test data")
plt.legend(loc='lower right')
plt.show()

In [None]:
# Learning Curve: to see if model performance can get better by collecting more samples

# Performance Metrics for the Model

In [None]:
# Class-probability predictions, computed once per split and reused by both
# metrics below.
train_proba = pipe.predict_proba(X_train)
test_proba = pipe.predict_proba(X_test)

print('\nMeasuring performance using log loss (the lower the better):')
print('Training set log loss:', log_loss(y_train, train_proba))
print('Test set     log loss:', log_loss(y_test, test_proba))

# ROC AUC is computed from column 1 of the probability matrix.
print('\nMeasuring performance using roc_auc  (the higher the better: highest possible is 1, random guess is 0.5):')
print('Training set roc auc:', roc_auc_score(y_train, train_proba[:, 1]))
print('Test set     roc auc:', roc_auc_score(y_test, test_proba[:, 1]))

# Hyperparameter Tuning & Validation Curve

In [None]:
# A learning curve plots the score over varying numbers of training samples, while a validation curve plots the score over 
# a varying hyper parameter. The learning curve is a tool for finding out if an estimator would benefit from more data, 
#or if the model is too simple (biased). If the training curve and validation curves converge that means this classifier 
# would hardly benefit from adding more training data; a more expressive model may be more appropriate.

# The validation curve is a tool for finding good hyper parameter settings. 
# Some hyper parameters (number of neurons in a neural network, maximum tree depth in a decision tree, 
# amount of regularization, etc.) control the complexity of a model. 
# We want the model to be complex enough to capture relevant information in the training data but not too complex to avoid 
# overfitting.

In [None]:
# Hyperparameter to sweep. The name is "<pipeline step>__<parameter>", where
# the step name is the lower-cased estimator class name given by make_pipeline.
# Alternatives for the other estimators are kept below for quick switching.
#param_name  = 'svc__gamma'
#param_range = np.logspace(-5, 1, 13)
#param_name  = 'logisticregression__C'
#param_range = np.logspace(-4, 2, 13)
param_name  = 'decisiontreeclassifier__max_depth'
param_range = np.arange(1,15)
#param_name  = 'randomforestclassifier__max_depth'
#param_range = np.arange(1,40)
#param_name  = 'kneighborsclassifier__n_neighbors'
#param_range = np.arange(1,26,2)


#scoring='r2'           # for regression problems
#scoring='accuracy'     # for classification problems with balanced target variable values
#scoring='neg_log_loss' # for classification problems
scoring='roc_auc'      # for classification problems


# Cross-validated train/validation scores for every value in param_range.
train_scores, val_scores = validation_curve(estimator=pipe, X=X_train, y=y_train,
                                            cv=10,
                                            param_name=param_name,
                                            param_range=param_range,
                                            scoring=scoring
                                            )

# Mean and spread across the CV folds (one row per parameter value).
trn_mean = np.mean(train_scores, axis=1)
trn_std  = np.std (train_scores, axis=1)
val_mean = np.mean(val_scores, axis=1)
val_std  = np.std (val_scores, axis=1)

plt.figure(figsize=(12,6))
# Legend labels are derived from `scoring` so they always match the metric
# that was actually computed (the y-axis label uses the same variable).
plt.plot(param_range, trn_mean, 'bo-',  markersize=5, label='training ' + scoring)
plt.fill_between(param_range, trn_mean+trn_std, trn_mean-trn_std, alpha=0.25, color='blue')

plt.plot(param_range, val_mean, 'gs--', markersize=5, label='validation ' + scoring)
plt.fill_between(param_range, val_mean+val_std, val_mean-val_std, alpha=0.15, color='green')

plt.grid()
#plt.xscale('log')  # Use this only when param_range = np.logspace(...). Comment this out otherwise.
plt.legend(loc='upper center', fontsize=14)
plt.xlabel(param_name, fontsize=14)
plt.ylabel(scoring, fontsize=14)
#plt.savefig('val_curve')
plt.show()

# Learning Curve: to see if model can benefit from collecting more samples

In [None]:
from sklearn.model_selection import learning_curve

# Cross-validated scores of the pipeline at five training-set fractions
# (3% up to 100% of the training split), using the metric in `scoring`.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pipe,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.03, 1.0, 5),
    cv=5,
    scoring=scoring,
)

# Mean and standard deviation across the CV folds (one row per train size).
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

plt.figure(figsize=(12,6))
# One line plus a +/- one-standard-deviation band per curve.
curves = [
    (train_mean, train_std, 'bo-', 'training ' + scoring, 'blue', 0.25),
    (test_mean, test_std, 'gs--', 'validation ' + scoring, 'green', 0.15),
]
for curve_mean, curve_std, line_fmt, curve_label, band_color, band_alpha in curves:
    plt.plot(train_sizes, curve_mean, line_fmt, markersize=5, label=curve_label)
    plt.fill_between(train_sizes, curve_mean + curve_std, curve_mean - curve_std,
                     alpha=band_alpha, color=band_color)

plt.grid()
plt.xlabel('Number of training samples', fontsize=14)
plt.ylabel(scoring, fontsize=14)
plt.legend(loc='best', fontsize=14)
# plt.savefig('learning_curve', dpi=300)
plt.show()

# Confusion Matrix 

In [None]:
# This is an unbalanced data set. 
# We will use the confusion matrix to see TP and TN. We will aim to reduce FP and FN.

In [None]:
np.set_printoptions(precision=2)
display_labels = [0, 1]

# Plot the confusion matrix of the fitted pipeline on the test split twice:
# raw counts, and normalized over the true labels (each row sums to 1).
# sklearn.metrics.confusion_matrix only accepts (y_true, y_pred) and returns
# an array; plotting from an estimator is done by ConfusionMatrixDisplay.
titles_options = [("Confusion matrix, without normalization", None),
                  ("Normalized confusion matrix", 'true')]
for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(pipe, X_test, y_test,
                                                 display_labels=display_labels,
                                                 cmap=plt.cm.Blues,
                                                 normalize=normalize)
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

plt.show()

In [None]:
# Precision and recall of the fitted pipeline on the training split.
# The predictions are computed here so the cell does not depend on names
# that no earlier cell defines.
y_train_pred = pipe.predict(X_train)
print('Training precision:', precision_score(y_train, y_train_pred))
print('Training recall:   ', recall_score(y_train, y_train_pred))