## Project 4 : Job Market Analysis
## Notebook 04: Predective Modelling - Classification Analysis

In [28]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import classification_report

In [2]:
final_jobs = pd.read_csv('final_jobs.csv', index_col = [0])

In [3]:
final_jobs.head();

## Giving the range for salaries:
Here I am giving a rank to the salaries as there are very few observations left for analysis with a range staring from 40000 to 500000

In [4]:
def salary_Rank(salary_range):
    if salary_range <= 40000:
        return 0
    elif salary_range > 40000 and salary_range <= 120000:
        return 1
    elif salary_range > 120000 and salary_range <= 180000:
        return 2
    else:
        return 3

In [5]:
final_jobs['salary_rank'] = final_jobs['final_salary'].map(lambda x : salary_Rank(x))

# Baseline accuracy

In [6]:
final_jobs.salary_rank.value_counts(normalize =True)
print('Baseline accuracy is:', final_jobs.salary_rank.value_counts(normalize =True))

Baseline accuracy is: 3    0.366142
1    0.327559
2    0.306299
Name: salary_rank, dtype: float64


## Coding dummy variables:
Creating dummy variables for the categorical features so I can use them for moddeling

In [7]:
final_jobs_subset = final_jobs[['new_job_category','new_job_title','experience_level','job_location','type_of_work']]

In [8]:
final_jobs_subset.head();

In [9]:
final_cols = pd.get_dummies(final_jobs_subset, drop_first=True, prefix=None)

In [10]:
final_cols.columns;

## Standardizing the DataFrame

In [11]:
X = final_cols
y = final_jobs['salary_rank']

# Standardizing the dataframe
Standarzing the data will make the dataset ready for modelling

In [12]:
ss = StandardScaler()
Xn = ss.fit_transform(X)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


## Splitting the DataFrame to train and test

Here I split my data into a train and test set. I will be training all my models on the training set and testing on the test set. This will give me the best idea of how well generalized my model is. I opted with a training set size of 70%.

In [13]:
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.3, random_state = 20)

In [14]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(889, 13) (889,)
(381, 13) (381,)


## Modelling:

# Logistic Regression:

In [15]:
logreg = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
logreg.fit(X,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=0, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [25]:
y_pred = logreg.predict(X_test)
print("The accuracy score of train:", logreg.score(X_train, y_train)*100)
print("The accuracy score of test:", logreg.score(X_test, y_test)*100)

The accuracy score of train: 75.47806524184477
The accuracy score of test: 75.32808398950131


In [17]:
logreg.coef_;

## Precision and recall

In [18]:
Prec_score = metrics.classification_report(y_test, y_pred)
print(Prec_score)

              precision    recall  f1-score   support

           1       0.73      0.76      0.75       127
           2       0.72      0.67      0.70       119
           3       0.80      0.81      0.81       135

   micro avg       0.75      0.75      0.75       381
   macro avg       0.75      0.75      0.75       381
weighted avg       0.75      0.75      0.75       381



## Random Forest Classification

In [24]:
clf = RandomForestClassifier(n_estimators=100, max_depth=2,random_state=42)
clf.fit(X, y)
clf.predict(X_test)
print("The accuracy score of train:", clf.score(X_train, y_train)*100)
print("The accuracy score of test:", clf.score(X_test, y_test)*100)

The accuracy score of train: 71.65354330708661
The accuracy score of test: 72.17847769028872


In [20]:
print(clf.feature_importances_)

[0.0105468  0.19480575 0.03090298 0.04670903 0.08229094 0.02054389
 0.01999971 0.00154971 0.00435975 0.00512396 0.3535372  0.22862125
 0.00100902]


In [21]:
# list of the features and their importance scores
list_features = list(zip(X_train, clf.feature_importances_ ))

In [22]:
list_features

[('new_job_category_Gov_Services', 0.010546796192119769),
 ('new_job_category_Information Technology', 0.19480575259113656),
 ('new_job_category_Others', 0.030902981040653102),
 ('new_job_category_Sales & Marketing', 0.046709029547904776),
 ('new_job_title_data analyst', 0.08229094039575054),
 ('new_job_title_data manager', 0.020543894107030832),
 ('new_job_title_data scientist', 0.019999713233578485),
 ('experience_level_Mid_level', 0.0015497065609627336),
 ('experience_level_Senior_level', 0.00435975173795949),
 ('job_location_All-Sydney-NSW', 0.0051239631446125785),
 ('type_of_work_Contract/Temp', 0.3535372030700916),
 ('type_of_work_Full Time', 0.22862125263146105),
 ('type_of_work_Part Time', 0.0010090157467385027)]

# Observation:

The models I used for classification are Logistic Regression and Random Forest. Below are few observations:
1. Both the models are giving better results.
2. The precision and Recall for 3 says that 80% of the observations are true positives.
3. The top impacters are almost the same as predicted in regression model: 
    - Work Type (Contract/Temp, Full Time)
    - Category(Industry) (Information Technology)
    - Job Title (Data Analyst)