<a href="https://colab.research.google.com/github/dkurbatovv/Python/blob/main/Job_class_info2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Packages / libraries
import os #provides functions for interacting with the operating system
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline

# To install scikit-learn, run "pip install numpy scipy scikit-learn" in the Anaconda terminal

# Print NumPy floats in fixed-point notation instead of scientific notation
np.set_printoptions(formatter={'float_kind':'{:f}'.format})

# Increases the size of sns plots
sns.set(rc={'figure.figsize':(8,6)})

# Datetime lib
from pandas import to_datetime
import itertools
import warnings
import datetime
warnings.filterwarnings('ignore')

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score

In [None]:
# Load 'jobclassinfo2.csv' into a DataFrame; the path is relative, so the file
# must sit in the notebook's current working directory.
df = pd.read_csv('jobclassinfo2.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Remove the 'ID' and 'PG' columns from the frame.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['ID', 'PG'], errors='ignore')

In [None]:
# Preview the frame after removing 'ID' and 'PG'.
df.head()

In [None]:
# Report the cardinality of every column; for columns with fewer than 12
# distinct values, also list the (sorted) values themselves.
for column in df.columns:
    distinct = np.unique(df[column])
    n_distinct = distinct.size
    message = 'The number of values for feature {} :{}'.format(column, n_distinct)
    if n_distinct < 12:
        message += ' -- {}'.format(distinct)
    print(message)

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Print the frame summary (column dtypes and non-null counts).
df.info()

In [None]:
# Annotated heat map of the pairwise correlations between numeric columns.
# The frame still carries 'JobFamilyDescription' / 'JobClassDescription' at
# this point (they are dropped in a later cell). Since pandas 2.0,
# DataFrame.corr() no longer skips non-numeric columns silently and raises
# instead, so restrict the frame to numeric dtypes explicitly.
plt.figure(figsize=(12,9))
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), cmap='copper', annot=True)


In [None]:
# Remove the two description columns from the frame.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['JobFamilyDescription', 'JobClassDescription'], errors='ignore')

In [None]:
# Preview the frame after dropping the two description columns.
df.head()

In [None]:
# Bar chart of the number of rows per 'PayGrade' value (the column later used as the target).
sns.countplot(x='PayGrade', data=df)

In [None]:
# Show the names of the columns that remain in the frame.
df.columns

In [None]:
# Columns plotted against 'PayGrade' in the count plots below.
features = [
    'JobFamily',
    'JobClass',
    'EducationLevel',
    'Experience',
    'OrgImpact',
    'ProblemSolving',
    'Supervision',
    'ContactLevel',
    'FinancialBudget',
]

In [None]:
# One figure per feature: row counts for each feature value, split by PayGrade.
for f in features:
    fig, ax = plt.subplots(figsize=(12, 10))
    ax = sns.countplot(x=f, data=df, hue='PayGrade', palette='icefire', ax=ax)

In [None]:
# Separate the target column ('PayGrade') from the predictor columns.
target_column = 'PayGrade'
y = df[target_column]
X = df.drop(columns=[target_column])

# Display the predictor frame as the cell output.
X

In [None]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Fit an entropy-based decision tree limited to depth 7.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; without a seed, ties are broken differently between runs and
# the reported accuracies (and the tree drawn later) are not reproducible.
dt = DecisionTreeClassifier(criterion='entropy', max_depth=7, random_state=1)
dt.fit(X_train, y_train)

# Mean accuracy on the rows the tree was fitted on.
print('The training accuracy is', dt.score(X_train, y_train))

# Mean accuracy on the held-out rows.
print('The testing accuracy is', dt.score(X_test, y_test))

In [None]:
import graphviz

# Render the fitted decision tree as a Graphviz diagram.
# export_graphviz expects class_names in ascending class order, i.e. the order
# of dt.classes_. The previous df['PayGrade'].unique() yields values in order
# of first appearance, which can attach the wrong pay grade to a node.
# Feature names are taken from the frame the tree was actually fitted on.
dot_data = tree.export_graphviz(
    dt,
    out_file=None,
    feature_names=list(X_train.columns),
    class_names=[str(label) for label in dt.classes_],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph

In [None]:
# Print the importance the fitted tree assigns to each predictor, in column order.
feature_names = list(df.drop('PayGrade', axis=1).columns)
for column, importance in zip(feature_names, dt.feature_importances_):
    print('Importance of feature {}:, {:.3f}'.format(column, importance))

# Build the table in a single step. The previous try/except + concat pattern
# relied on `final_fi` being undefined on the first iteration, so re-running
# the cell appended every feature again to the table left over in the kernel;
# the bare except would also have hidden any unrelated error.
final_fi = pd.DataFrame({
    'Variable': feature_names,
    'Feature Importance Score': dt.feature_importances_,
})

# Ordering the data (the 'index' column added by reset_index keeps each
# feature's original column position).
final_fi = final_fi.sort_values('Feature Importance Score', ascending=False).reset_index()
final_fi

In [None]:
def plot_confusion_matrix(cm, classes=None, title='Confusion matrix'):
    """Draw ``cm`` as a seaborn heat map with a colour scale fixed to [0, 1].

    Parameters
    ----------
    cm : matrix-like
        Values to plot; passed straight to ``sns.heatmap``.
    classes : sequence, optional
        When given, used as tick labels on both axes and the cell values are
        annotated. When omitted, the heat map is drawn without annotations.
    title : str
        Title placed above the plot.
    """
    heatmap_options = {'vmin': 0., 'vmax': 1.}
    if classes is not None:
        heatmap_options.update(xticklabels=classes, yticklabels=classes, annot=True)
    sns.heatmap(cm, **heatmap_options)

    plt.title(title)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# Predictions of the decision tree on the rows it was trained on.
y_pred = dt.predict(X_train)

# Plotting Confusion Matrix: divide each row by its total so every row shows
# the share of that true class assigned to each predicted class.
cm = confusion_matrix(y_train, y_pred)
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = cm / row_totals
plt.figure()
plot_confusion_matrix(cm_norm, classes=dt.classes_, title='Training confusion')

In [None]:
# Fit a 100-tree random forest with the entropy criterion. random_state pins
# the bootstrap and feature sampling so the scores below are reproducible.
rf = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=1)
rf.fit(X_train, y_train)
prediction_test = rf.predict(X=X_test)

# source: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

# Accuracy on the training split
print("Training Accuracy is: ", rf.score(X_train, y_train))
# Accuracy on the held-out test split
print("Testing Accuracy is: ", rf.score(X_test, y_test))

# Confusion Matrix
# Without `labels`, confusion_matrix only covers the classes that occur in
# y_test / prediction_test, which need not match rf.classes_; the tick labels
# would then no longer line up with the matrix. Use one explicit, sorted label
# set for both the matrix and the axis labels.
labels = np.union1d(rf.classes_, y_test)
cm = confusion_matrix(y_test, prediction_test, labels=labels)
# Row-normalise; classes with no true samples in the test split keep an
# all-zero row instead of producing NaN from a division by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)
plt.figure()
plot_confusion_matrix(cm_norm, classes=labels)

In [None]:
from itertools import product

# Small manual grid search: every combination of max_features and max_depth is
# fitted with a fixed seed and evaluated on the test split.
n_estimators = 100
max_features = [1, 'sqrt', 'log2']
max_depths = [None, 2, 3, 4, 5]
for f, d in product(max_features, max_depths): # with product we can iterate through all possible combinations
    rf = RandomForestClassifier(n_estimators=n_estimators,
                                criterion='entropy',
                                max_features=f,
                                max_depth=d,
                                n_jobs=2,
                                random_state=1337)
    rf.fit(X_train, y_train)
    prediction_test = rf.predict(X=X_test)

    # Score once and reuse the value for both the printout and the plot title.
    test_accuracy = accuracy_score(y_test, prediction_test)
    print('Classification accuracy on test set with max features = {} and max_depth = {}: {:.3f}'.format(f, d, test_accuracy))

    # Use one explicit, sorted label set for the matrix and its tick labels;
    # otherwise the matrix only covers classes present in y_test / predictions
    # and can disagree in size with rf.classes_.
    labels = np.union1d(rf.classes_, y_test)
    cm = confusion_matrix(y_test, prediction_test, labels=labels)
    # Row-normalise; rows with no true samples stay zero rather than NaN.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_norm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)
    plt.figure()
    plot_confusion_matrix(cm_norm, classes=labels,
    title='Confusion matrix accuracy on test set with max features = {} and max_depth = {}: {:.3f}'.format(f, d, test_accuracy))

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a logistic-regression classifier on the training split.
# max_iter is raised from the default of 100: the features are fed in unscaled
# and the notebook silences all warnings, so a solver that stopped before
# converging would otherwise go unnoticed.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

In [None]:
# Mean accuracy of the logistic-regression model on the training split and on
# the held-out test split.
lr_train_accuracy = lr.score(X_train, y_train)
lr_test_accuracy = lr.score(X_test, y_test)
print("Training Accuracy is: ", lr_train_accuracy)
print("Testing Accuracy is: ", lr_test_accuracy)

In [None]:
from sklearn import metrics

# Predict the test split with the logistic-regression model and report the
# fraction of predictions that match the true labels.
y_pred = lr.predict(X_test)
lr_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy of Logistic Regression model is:", lr_accuracy)