In [1]:
import numpy as np
import pandas as pd
import seaborn as sn
import json
import warnings 
import plotly.express as px
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
plt.style.use('seaborn')
%matplotlib inline

# Case Study-1

# Quick Summary

Goal: To correctly tag for job vacancies.
    
Procedure: Being new to json files, it took me a while to get the data and start 
with the actual analysis. Started with exploratory data analysis to get the data into data frames. 
I followed the intuition of getting  the data into data frames and then apply ML algorithm to get tags for 
each object and finally combining them. Multilabel classification was applied to get the tags. I tried different 
models and settled with KNeighborsClassifier as it is the most simplistic approach. Based on the limited time for 
the project only this model was tested on the data.


# Exploratory data analysis

In [2]:
#Looking into the sample submission file
sam=pd.read_json('sample_submission.json')
sam

Unnamed: 0,workplaces,positions,subjects
0,[Secondary],[Cover Supervisor],[]
1,[Secondary],[Other Support Positions],[]
2,[Secondary],[Teacher],[Humanities]
3,[Secondary],[Teacher],[English]
4,[Adult Education],[Tutor],[Art and Design]
5,[Secondary],[Teacher],[Design and Technology]
6,[Special Education],"[Teaching Assistant, Learning Support]",[Education]
7,[Secondary],[Teacher],[Modern Languages]
8,[Secondary],[Teacher],[Art and Design]
9,[Secondary],[Teacher],[Mathematics]


In [3]:
#Loading json data
df=pd.read_json('train.json')
df=pd.DataFrame(df)
df

Unnamed: 0,title,description,positions,subjects,workplaces
0,Senior Assistant Director of Science,<h3><strong>Senior Assistant Director of Scien...,"[{'id': '70024', 'description': 'Deputy/Assist...","[{'id': '10549', 'description': 'Science', 'ca...","[{'id': '10510', 'description': 'Secondary', '..."
1,Teacher of Business Studies and HSC,<p>Temporary Teacher of Business and Health an...,"[{'id': '10804', 'description': 'Teacher', 'ca...","[{'id': '10520', 'description': 'Business Stud...","[{'id': '10510', 'description': 'Secondary', '..."
2,School Business Manager,<p><strong>Position: &nbsp;&nbsp;Business Mana...,"[{'id': '10803', 'description': 'Business Mana...",[],"[{'id': '110510', 'description': 'Secondary', ..."
3,EYFS Classroom Teacher,<p><strong>We require an EYFS teacher for Sept...,"[{'id': '10804', 'description': 'Teacher', 'ca...",[],"[{'id': '110509', 'description': 'Primary', 'c..."
4,Teacher,<p>The Federation of Sacred Heart and St Mary’...,"[{'id': '10804', 'description': 'Teacher', 'ca...",[],"[{'id': '110509', 'description': 'Primary', 'c..."
...,...,...,...,...,...
90402,Higher Education Coordinator,"<p>William Morris Sixth Form, situated in Hamm...","[{'id': '70041', 'description': 'Advisor/Consu...",[],"[{'id': '10499', 'description': 'Further Educa..."
90403,Pastoral Support Co-ordinator,"<p>Aspire Academy Bexley, Endeavour Academy Be...","[{'id': '70048', 'description': 'Other Pastora...",[],"[{'id': '110510', 'description': 'Secondary', ..."
90404,Science Teacher,<p>Al Rayan is a leading Management Company ca...,"[{'id': '10804', 'description': 'Teacher', 'ca...","[{'id': '10519', 'description': 'Biology', 'ca...","[{'id': '10502', 'description': 'Independent s..."
90405,Director of Maths (Secondary),<p>Brooke Weston Trust is offering an exciting...,"[{'id': '70031', 'description': 'Subject Leade...","[{'id': '10538', 'description': 'Mathematics',...","[{'id': '110510', 'description': 'Secondary', ..."


In [4]:
df.describe().transpose()

Unnamed: 0,count,unique,top,freq
title,90407,32333,Teacher of English,2700
description,90407,85354,<p>Ever growing in Academies and opportunities...,138
positions,90407,2609,"[{'id': '10804', 'description': 'Teacher', 'ca...",48497
subjects,90407,1790,[],32857
workplaces,90407,1052,"[{'id': '110510', 'description': 'Secondary', ...",24773


In [5]:
pos=pd.concat([pd.DataFrame(pd.json_normalize(x)) for x in df['positions']],ignore_index=True)
#pos.columns=['pos_id','pos_description','pos_categoryType','pos_parentId','pos_isPromo'] #renaming position columns to make them distinct as id is common in all 3 objects

In [7]:
pos[pos.id=='10809'] #Wanted to understand relationship with id' for more visual analysis

Unnamed: 0,id,description,categoryType,isPromo,parentId
29,10809,Head of Department,position,,
37,10809,Head of Department,position,,
40,10809,Head of Department,position,,
51,10809,Head of Department,position,,
61,10809,Head of Department,position,,
...,...,...,...,...,...
103561,10809,Head of Department,position,,
103579,10809,Head of Department,position,,
103582,10809,Head of Department,position,,
103599,10809,Head of Department,position,,


In [8]:
pos

Unnamed: 0,id,description,categoryType,isPromo,parentId
0,70024,Deputy/Assistant of Department/Faculty,position,True,
1,10804,Teacher,position,,
2,10803,Business Manager/Bursar,position,,
3,10804,Teacher,position,,
4,10804,Teacher,position,,
...,...,...,...,...,...
103633,70041,Advisor/Consultant,position,,
103634,70048,Other Pastoral/Welfare Positions,position,,
103635,10804,Teacher,position,,
103636,70031,Subject Leadership/Excellence Teacher,position,,


In [9]:
pos.describe()

Unnamed: 0,id,description,categoryType,isPromo,parentId
count,103638,103638,103638,925,2824
unique,54,55,2,1,3
top,10804,Teacher,position,True,2
freq,52365,52365,103481,925,1478


In [10]:
sub=pd.concat([pd.DataFrame(pd.json_normalize(y)) for y in df['subjects']],ignore_index=True)
#sub.columns=['sub_id','sub_description','sub_categoryType','sub_isPromo']

In [11]:
wok=pd.concat([pd.DataFrame(pd.json_normalize(x)) for x in df['workplaces']],ignore_index=True)
#wok.columns=['wok_id','wok_description','wok_categoryType','wok_isPromo']

# Building a model for position

In [12]:
pos_d=pd.get_dummies(pos.description, prefix='pos_').iloc[:,1:]#adding dummie variables and no ncluding all to avoid collinearity
pos_d

Unnamed: 0,pos__Advanced Skills Teacher,pos__Advisor/Consultant,pos__Assistant Director/Principal,pos__Assistant Headteacher,pos__Behaviour Manager/Specialist,pos__Business Manager/Bursar,pos__Careers Advisor,pos__Chaplain/Priest,pos__Cover Supervisor,pos__Data Manager/Analyst,...,pos__Second in Charge,pos__Secretary/Receptionist,pos__Specialist,pos__Subject Leadership/Excellence Teacher,pos__Support Manager,pos__Teacher,pos__Teaching Assistant,pos__Technician,pos__Trainer/Assessor/Verifier,pos__Tutor
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103633,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
103634,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
103635,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
103636,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [13]:
pos_new = pd.concat([pos, pos_d], axis=1)
pos_new

Unnamed: 0,id,description,categoryType,isPromo,parentId,pos__Advanced Skills Teacher,pos__Advisor/Consultant,pos__Assistant Director/Principal,pos__Assistant Headteacher,pos__Behaviour Manager/Specialist,...,pos__Second in Charge,pos__Secretary/Receptionist,pos__Specialist,pos__Subject Leadership/Excellence Teacher,pos__Support Manager,pos__Teacher,pos__Teaching Assistant,pos__Technician,pos__Trainer/Assessor/Verifier,pos__Tutor
0,70024,Deputy/Assistant of Department/Faculty,position,True,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10804,Teacher,position,,,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,10803,Business Manager/Bursar,position,,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,10804,Teacher,position,,,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,10804,Teacher,position,,,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103633,70041,Advisor/Consultant,position,,,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
103634,70048,Other Pastoral/Welfare Positions,position,,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
103635,10804,Teacher,position,,,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
103636,70031,Subject Leadership/Excellence Teacher,position,,,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [15]:
X_pos=pos_new.iloc[:,0] #train 
y_pos=pos_new.iloc[:,5:] #test
y_pos

Unnamed: 0,pos__Advanced Skills Teacher,pos__Advisor/Consultant,pos__Assistant Director/Principal,pos__Assistant Headteacher,pos__Behaviour Manager/Specialist,pos__Business Manager/Bursar,pos__Careers Advisor,pos__Chaplain/Priest,pos__Cover Supervisor,pos__Data Manager/Analyst,...,pos__Second in Charge,pos__Secretary/Receptionist,pos__Specialist,pos__Subject Leadership/Excellence Teacher,pos__Support Manager,pos__Teacher,pos__Teaching Assistant,pos__Technician,pos__Trainer/Assessor/Verifier,pos__Tutor
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103633,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
103634,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
103635,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
103636,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [16]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X_pos,y_pos,test_size=0.3, random_state=42) #Split is 70-30
X_train=pd.DataFrame(X_train)
X_test=pd.DataFrame(X_test)

In [17]:
from sklearn.metrics import mean_squared_error, r2_score,accuracy_score,multilabel_confusion_matrix,classification_report
from sklearn.neighbors import KNeighborsClassifier
from scipy.sparse import csr_matrix, lil_matrix

classifier_pos = KNeighborsClassifier()

# train
classifier_pos.fit(X_train, y_train)

# predict
predictions_pos = classifier_pos.predict(X_test)
accuracy_score(y_test,predictions_pos)

0.9998070243149363

In [18]:
pd.DataFrame(predictions_pos).shape

(31092, 54)

In [19]:
multilabel_confusion_matrix(y_test,predictions_pos)

array([[[30952,     0],
        [    0,   140]],

       [[31001,     0],
        [    0,    91]],

       [[31000,     0],
        [    0,    92]],

       [[30795,     0],
        [    0,   297]],

       [[31009,     0],
        [    0,    83]],

       [[30933,     0],
        [    0,   159]],

       [[31044,     0],
        [    0,    48]],

       [[31079,     0],
        [    0,    13]],

       [[30697,     0],
        [    0,   395]],

       [[31007,     0],
        [    0,    85]],

       [[31061,     0],
        [    0,    31]],

       [[31035,     0],
        [    0,    57]],

       [[30801,     0],
        [    0,   291]],

       [[30868,     0],
        [    0,   224]],

       [[30863,     0],
        [    0,   229]],

       [[30978,     0],
        [    0,   114]],

       [[31068,     0],
        [    0,    24]],

       [[31033,     0],
        [    0,    59]],

       [[31009,     0],
        [    0,    83]],

       [[29280,     0],
        [    0,  1812]],



In [21]:
print("Accuracy :",accuracy_score(y_test,predictions_pos))
print("Classification Report :\n",classification_report(y_test,predictions_pos))

Accuracy : 0.9998070243149363
Classification Report :
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       140
           1       1.00      1.00      1.00        91
           2       1.00      1.00      1.00        92
           3       1.00      1.00      1.00       297
           4       1.00      1.00      1.00        83
           5       1.00      1.00      1.00       159
           6       1.00      1.00      1.00        48
           7       1.00      1.00      1.00        13
           8       1.00      1.00      1.00       395
           9       1.00      1.00      1.00        85
          10       1.00      1.00      1.00        31
          11       1.00      1.00      1.00        57
          12       1.00      1.00      1.00       291
          13       1.00      1.00      1.00       224
          14       1.00      1.00      1.00       229
          15       1.00      1.00      1.00       114
          16       1.00   

# Building a model for Subjects

In [22]:
sub_d=pd.get_dummies(sub.description, prefix='sub_').iloc[:,1:]

In [23]:
sub_new = pd.concat([sub, sub_d], axis=1)
sub_new

Unnamed: 0,id,description,categoryType,isPromo,sub__Accounting and Finance,sub__Aerospace Studies,sub__Agriculture/Rural Science,sub__Animal Studies,sub__Archaeology and Anthropology,sub__Architecture and Planning,...,sub__Quran / Islamic Studies,sub__Religious Education,sub__Science,sub__Skills for Life,sub__Social Sciences,sub__Social and Political Studies,sub__Sociology,sub__Special Needs,sub__Teacher Training,sub__Vocational Studies
0,10549,Science,subject,,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,10520,Business Studies and Economics,subject,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,10532,Geography,subject,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,10549,Science,subject,,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,10519,Biology,subject,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69580,10519,Biology,subject,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69581,10522,Chemistry,subject,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69582,10546,Physics,subject,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69583,10538,Mathematics,subject,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
X_sub=sub_new.iloc[:,0]
y_sub=sub_new.iloc[:,4:]
y_sub

Unnamed: 0,sub__Accounting and Finance,sub__Aerospace Studies,sub__Agriculture/Rural Science,sub__Animal Studies,sub__Archaeology and Anthropology,sub__Architecture and Planning,sub__Art and Design,sub__Biology,sub__Business Studies and Economics,sub__Careers,...,sub__Quran / Islamic Studies,sub__Religious Education,sub__Science,sub__Skills for Life,sub__Social Sciences,sub__Social and Political Studies,sub__Sociology,sub__Special Needs,sub__Teacher Training,sub__Vocational Studies
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69580,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
69581,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69582,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69583,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
from sklearn.model_selection import train_test_split
X_train_sub,X_test_sub,y_train_sub,y_test_sub=train_test_split(X_sub,y_sub,test_size=0.3, random_state=42) #Split is 70-30
X_train_sub=pd.DataFrame(X_train_sub)
X_test_sub=pd.DataFrame(X_test_sub)

In [26]:
from sklearn.metrics import mean_squared_error, r2_score,accuracy_score,multilabel_confusion_matrix,classification_report
from sklearn.neighbors import KNeighborsClassifier
from scipy.sparse import csr_matrix, lil_matrix

classifier_sub = KNeighborsClassifier()
# train
classifier_sub.fit(X_train_sub, y_train_sub)

# predict
predictions_sub = classifier_sub.predict(X_test_sub)
accuracy_score(y_test_sub,predictions_sub)

1.0

In [27]:
#Confusion matrix
multilabel_confusion_matrix(y_test_sub,predictions_sub)

array([[[20770,     0],
        [    0,   106]],

       [[20862,     0],
        [    0,    14]],

       [[20855,     0],
        [    0,    21]],

       [[20848,     0],
        [    0,    28]],

       [[20874,     0],
        [    0,     2]],

       [[20872,     0],
        [    0,     4]],

       [[20301,     0],
        [    0,   575]],

       [[20239,     0],
        [    0,   637]],

       [[20278,     0],
        [    0,   598]],

       [[20832,     0],
        [    0,    44]],

       [[20194,     0],
        [    0,   682]],

       [[20816,     0],
        [    0,    60]],

       [[20876,     0],
        [    0,     0]],

       [[20848,     0],
        [    0,    28]],

       [[20814,     0],
        [    0,    62]],

       [[20630,     0],
        [    0,   246]],

       [[20257,     0],
        [    0,   619]],

       [[20567,     0],
        [    0,   309]],

       [[20759,     0],
        [    0,   117]],

       [[20707,     0],
        [    0,   169]],



In [28]:
#Printing performance metrics
print("Accuracy :",accuracy_score(y_test_sub,predictions_sub))
print("Classification Report :\n",classification_report(y_test_sub,predictions_sub))

Accuracy : 1.0
Classification Report :
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       106
           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        21
           3       1.00      1.00      1.00        28
           4       1.00      1.00      1.00         2
           5       1.00      1.00      1.00         4
           6       1.00      1.00      1.00       575
           7       1.00      1.00      1.00       637
           8       1.00      1.00      1.00       598
           9       1.00      1.00      1.00        44
          10       1.00      1.00      1.00       682
          11       1.00      1.00      1.00        60
          12       0.00      0.00      0.00         0
          13       1.00      1.00      1.00        28
          14       1.00      1.00      1.00        62
          15       1.00      1.00      1.00       246
          16       1.00      1.00      1.

# Building a model for workplace

In [29]:
wok_d=pd.get_dummies(wok.description, prefix='wok_').iloc[:,1:]
wok_d

Unnamed: 0,wok__Charity,wok__Children's Services,wok__Early Childhood,wok__Educational Body,wok__Further Education,wok__Higher Education,wok__Independent Pre-Prep,wok__Independent Preparatory,wok__Independent Primary,wok__Independent Secondary,...,wok__Outdoor Education,wok__Primary,wok__Pupil Referral Unit (PRU),wok__Secondary,wok__Secure Unit,wok__Special Education,wok__Special education,wok__Summer School,wok__Technical and Further Education,wok__Youth and Community Services
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103210,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
103211,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
103212,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
103213,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [30]:
wok_new = pd.concat([wok, wok_d], axis=1)
wok_new

Unnamed: 0,id,description,categoryType,isPromo,wok__Charity,wok__Children's Services,wok__Early Childhood,wok__Educational Body,wok__Further Education,wok__Higher Education,...,wok__Outdoor Education,wok__Primary,wok__Pupil Referral Unit (PRU),wok__Secondary,wok__Secure Unit,wok__Special Education,wok__Special education,wok__Summer School,wok__Technical and Further Education,wok__Youth and Community Services
0,10510,Secondary,workplace,,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,10510,Secondary,workplace,,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,110510,Secondary,workplace,,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,110509,Primary,workplace,,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,110509,Primary,workplace,,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103210,10502,Independent senior,workplace,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
103211,10510,Secondary,workplace,,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
103212,10500,Higher Education,workplace,,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
103213,110510,Secondary,workplace,,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [31]:
X_wok=wok_new.iloc[:,0] #train
y_wok=wok_new.iloc[:,4:] #test
y_wok

Unnamed: 0,wok__Charity,wok__Children's Services,wok__Early Childhood,wok__Educational Body,wok__Further Education,wok__Higher Education,wok__Independent Pre-Prep,wok__Independent Preparatory,wok__Independent Primary,wok__Independent Secondary,...,wok__Outdoor Education,wok__Primary,wok__Pupil Referral Unit (PRU),wok__Secondary,wok__Secure Unit,wok__Special Education,wok__Special education,wok__Summer School,wok__Technical and Further Education,wok__Youth and Community Services
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103210,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
103211,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
103212,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
103213,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [32]:
from sklearn.model_selection import train_test_split
X_train_wok,X_test_wok,y_train_wok,y_test_wok=train_test_split(X_wok,y_wok,test_size=0.3, random_state=42) #Split is 70-30
X_train_wok=pd.DataFrame(X_train_wok)
X_test_wok=pd.DataFrame(X_test_wok)

In [33]:
from sklearn.metrics import mean_squared_error, r2_score,accuracy_score,multilabel_confusion_matrix,classification_report
from sklearn.neighbors import KNeighborsClassifier
from scipy.sparse import csr_matrix, lil_matrix

classifier_wok = KNeighborsClassifier()
# train
classifier_wok.fit(X_train_wok, y_train_wok)

# predict
predictions_wok = classifier_wok.predict(X_test_wok)

In [34]:
#Confusion Matrix
multilabel_confusion_matrix(y_test_wok,predictions_wok)

array([[[30838,     0],
        [    0,   127]],

       [[30942,     0],
        [    0,    23]],

       [[30959,     0],
        [    0,     6]],

       [[30832,     1],
        [    0,   132]],

       [[28867,     0],
        [    0,  2098]],

       [[30764,     1],
        [    0,   200]],

       [[30928,     0],
        [   37,     0]],

       [[30418,   134],
        [    0,   413]],

       [[30948,     0],
        [    0,    17]],

       [[30850,     0],
        [    0,   115]],

       [[29780,   242],
        [    0,   943]],

       [[30752,    37],
        [    0,   176]],

       [[30123,     0],
        [  134,   708]],

       [[29051,     0],
        [  242,  1672]],

       [[30959,     0],
        [    1,     5]],

       [[30865,     0],
        [    0,   100]],

       [[30785,     0],
        [    0,   180]],

       [[30731,     0],
        [    0,   234]],

       [[30760,     1],
        [    0,   204]],

       [[30781,     0],
        [    0,   184]],



In [35]:
#Performance matrix
print("Accuracy :",accuracy_score(y_test_wok,predictions_wok))
print("Classification Report :\n",classification_report(y_test_wok,predictions_wok))

Accuracy : 0.986339415469078
Classification Report :
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       127
           1       1.00      1.00      1.00        23
           2       1.00      1.00      1.00         6
           3       0.99      1.00      1.00       132
           4       1.00      1.00      1.00      2098
           5       1.00      1.00      1.00       200
           6       0.00      0.00      0.00        37
           7       0.76      1.00      0.86       413
           8       1.00      1.00      1.00        17
           9       1.00      1.00      1.00       115
          10       0.80      1.00      0.89       943
          11       0.83      1.00      0.90       176
          12       1.00      0.84      0.91       842
          13       1.00      0.87      0.93      1914
          14       1.00      0.83      0.91         6
          15       1.00      1.00      1.00       100
          16       1.00    

# Writing a function to get test data

In [36]:
df_test=pd.read_json('test.json')
df_test=pd.DataFrame(df)
df_test

Unnamed: 0,title,description,positions,subjects,workplaces
0,Senior Assistant Director of Science,<h3><strong>Senior Assistant Director of Scien...,"[{'id': '70024', 'description': 'Deputy/Assist...","[{'id': '10549', 'description': 'Science', 'ca...","[{'id': '10510', 'description': 'Secondary', '..."
1,Teacher of Business Studies and HSC,<p>Temporary Teacher of Business and Health an...,"[{'id': '10804', 'description': 'Teacher', 'ca...","[{'id': '10520', 'description': 'Business Stud...","[{'id': '10510', 'description': 'Secondary', '..."
2,School Business Manager,<p><strong>Position: &nbsp;&nbsp;Business Mana...,"[{'id': '10803', 'description': 'Business Mana...",[],"[{'id': '110510', 'description': 'Secondary', ..."
3,EYFS Classroom Teacher,<p><strong>We require an EYFS teacher for Sept...,"[{'id': '10804', 'description': 'Teacher', 'ca...",[],"[{'id': '110509', 'description': 'Primary', 'c..."
4,Teacher,<p>The Federation of Sacred Heart and St Mary’...,"[{'id': '10804', 'description': 'Teacher', 'ca...",[],"[{'id': '110509', 'description': 'Primary', 'c..."
...,...,...,...,...,...
90402,Higher Education Coordinator,"<p>William Morris Sixth Form, situated in Hamm...","[{'id': '70041', 'description': 'Advisor/Consu...",[],"[{'id': '10499', 'description': 'Further Educa..."
90403,Pastoral Support Co-ordinator,"<p>Aspire Academy Bexley, Endeavour Academy Be...","[{'id': '70048', 'description': 'Other Pastora...",[],"[{'id': '110510', 'description': 'Secondary', ..."
90404,Science Teacher,<p>Al Rayan is a leading Management Company ca...,"[{'id': '10804', 'description': 'Teacher', 'ca...","[{'id': '10519', 'description': 'Biology', 'ca...","[{'id': '10502', 'description': 'Independent s..."
90405,Director of Maths (Secondary),<p>Brooke Weston Trust is offering an exciting...,"[{'id': '70031', 'description': 'Subject Leade...","[{'id': '10538', 'description': 'Mathematics',...","[{'id': '110510', 'description': 'Secondary', ..."


In [37]:
#Test objects
pos_test=pd.concat([pd.DataFrame(pd.json_normalize(x)) for x in df_test['positions']],ignore_index=True)
wok_test=pd.concat([pd.DataFrame(pd.json_normalize(x)) for x in df_test['workplaces']],ignore_index=True)
sub_test=pd.concat([pd.DataFrame(pd.json_normalize(x)) for x in df_test['subjects']],ignore_index=True)
    

In [38]:
#Creating a function to get a dataframe for the submission
def get_tags():
    
##Positions   
    dummies1=pd.get_dummies(pos_test.description, prefix='pos_').iloc[:,1:]
    data_new1= pd.concat([pos_test, dummies1], axis=1)
    X1=pd.DataFrame(data_new1.iloc[:,0])
    y1=data_new1.iloc[:,5:]

    #Model predict
    pred_test_pos=pd.DataFrame(classifier_pos.predict(X1))
    pred_test_pos.columns=['pos__Advanced Skills Teacher', 'pos__Advisor/Consultant',
       'pos__Assistant Director/Principal', 'pos__Assistant Headteacher',
       'pos__Behaviour Manager/Specialist', 'pos__Business Manager/Bursar',
       'pos__Careers Advisor', 'pos__Chaplain/Priest', 'pos__Cover Supervisor',
       'pos__Data Manager/Analyst', 'pos__Data Services/Timetabling',
       'pos__Deputy Director/Vice Principal', 'pos__Deputy Headteacher',
       'pos__Deputy/Assistant of Department/Faculty',
       'pos__Director of Faculty/Department', 'pos__Director/Principal',
       'pos__Educational Psychologist', 'pos__Examiner',
       'pos__Graduate Teaching Programme', 'pos__Head of Department',
       'pos__Head of Year', 'pos__Headteacher',
       'pos__Housemaster/Housemistress', 'pos__Human Resources',
       'pos__Instructor', 'pos__Instructor ', 'pos__Leader/Co-ordinator',
       'pos__Leadership/Excellence Teacher', 'pos__Learning Manager/Mentor',
       'pos__Learning Support', 'pos__Lecturer', 'pos__Librarian',
       'pos__Literacy Worker', 'pos__Manager', 'pos__Matron/Nurse',
       'pos__Office Manager', 'pos__Other Facilities Positions',
       'pos__Other Office and Administrative Positions',
       'pos__Other Pastoral/Welfare Positions', 'pos__Other Support Positions',
       'pos__Personal Assistant to Headteacher',
       'pos__Premises Manager/Housekeeper', 'pos__Principal Teacher',
       'pos__Research/Fellowship Post', 'pos__Second in Charge',
       'pos__Secretary/Receptionist', 'pos__Specialist',
       'pos__Subject Leadership/Excellence Teacher', 'pos__Support Manager',
       'pos__Teacher', 'pos__Teaching Assistant', 'pos__Technician',
       'pos__Trainer/Assessor/Verifier', 'pos__Tutor']
    res_pos = pd.DataFrame(pd.Series(pred_test_pos.columns[np.where(dummies1!=0)[1]]))
    res_pos.columns=['Positions']
    
##Subjects   

    dummies2=pd.get_dummies(sub_test.description, prefix='sub_').iloc[:,1:]
    data_new2 = pd.concat([sub_test, dummies2], axis=1)
    X2=pd.DataFrame(data_new2.iloc[:,0])
    y2=data_new2.iloc[:,4:]
        
    #Model predict
    pred_test_sub= pd.DataFrame(classifier_sub.predict(X2))
    pred_test_sub.columns=['sub__Accounting and Finance', 'sub__Aerospace Studies',
       'sub__Agriculture/Rural Science', 'sub__Animal Studies',
       'sub__Archaeology and Anthropology', 'sub__Architecture and Planning',
       'sub__Art and Design', 'sub__Biology',
       'sub__Business Studies and Economics', 'sub__Careers', 'sub__Chemistry',
       'sub__Child Development', 'sub__Christian Studies', 'sub__Citizenship',
       'sub__Classics', 'sub__Construction and Engineering Trades',
       'sub__Design and Technology', 'sub__Drama',
       'sub__EAL/English as an Additional Language',
       'sub__Early Learning/Child Care', 'sub__Education', 'sub__English',
       'sub__English as a Foreign Language', 'sub__Environmental Studies',
       'sub__Food Technology/Hospitality and Catering', 'sub__Geography',
       'sub__Graphics', 'sub__Hair and Beauty',
       'sub__Health, Personal and Social Development', 'sub__History',
       'sub__Humanities', 'sub__Information Technology', 'sub__Jewish Studies',
       'sub__Law', 'sub__Leisure and Travel', 'sub__Logistics and Transport',
       'sub__Mathematics', 'sub__Media Studies', 'sub__Medical',
       'sub__Modern Languages', 'sub__Modern Studies', 'sub__Music',
       'sub__Other Subjects', 'sub__Pastoral/Curriculum Management',
       'sub__Performing Arts', 'sub__Philosophy', 'sub__Physical Education',
       'sub__Physics', 'sub__Politics', 'sub__Psychology',
       'sub__Public Services', 'sub__Quran / Islamic Studies',
       'sub__Religious Education', 'sub__Science', 'sub__Skills for Life',
       'sub__Social Sciences', 'sub__Social and Political Studies',
       'sub__Sociology', 'sub__Special Needs', 'sub__Teacher Training',
       'sub__Vocational Studies']
    res_sub = pd.DataFrame(pd.Series(pred_test_sub.columns[np.where(dummies2!=0)[1]]))
    res_sub.columns=['Subjects']
        
##Workspaces

    dummies3=pd.get_dummies(wok_test.description, prefix='wok_').iloc[:,1:]
    data_new3 = pd.concat([wok_test, dummies3], axis=1)
    X3=pd.DataFrame(data_new3.iloc[:,0])
    y3=data_new3.iloc[:,4:]
        
    #Model predict
    pred_test_wok= pd.DataFrame(classifier_wok.predict(X3))
    pred_test_wok.columns=['wok_Charity', 'wok_Childrens Services', 'wok__Early Childhood',
       'wok__Educational Body', 'wok__Further Education',
       'wok__Higher Education', 'wok__Independent Pre-Prep',
       'wok__Independent Preparatory', 'wok__Independent Primary',
       'wok__Independent Secondary', 'wok__Independent Senior',
       'wok__Independent pre-prep', 'wok__Independent preparatory',
       'wok__Independent senior', 'wok__Library', 'wok__Local Authority',
       'wok__Middle', 'wok__Nursery', 'wok__Other Organisation',
       'wok__Other Workplaces', 'wok__Outdoor Education', 'wok__Primary',
       'wok__Pupil Referral Unit (PRU)', 'wok__Secondary', 'wok__Secure Unit',
       'wok__Special Education', 'wok__Special education',
       'wok__Summer School', 'wok__Technical and Further Education',
       'wok__Youth and Community Services']
    res_wok = pd.DataFrame(pd.Series(pred_test_wok.columns[np.where(dummies3!=0)[1]]))
    res_wok.columns=['Workspaces']    
    
##Final Result
    result=pd.concat([res_pos,res_sub,res_wok], axis=1)
    result['Positions']=result['Positions'].str.replace('pos__', '')
    result['Subjects']=result['Subjects'].str.replace('sub__', '')
    result['Workspaces']=result['Workspaces'].str.replace('wok__', '')
    
    return result
        

In [39]:
Submission=get_tags()

In [40]:
Submission.head(50)

Unnamed: 0,Positions,Subjects,Workspaces
0,Deputy/Assistant of Department/Faculty,Science,Secondary
1,Teacher,Business Studies and Economics,Secondary
2,Business Manager/Bursar,Geography,Secondary
3,Teacher,Science,Primary
4,Teacher,Biology,Primary
5,Data Manager/Analyst,History,Independent Senior
6,Teacher,Religious Education,Secondary
7,Teacher,Mathematics,Secondary
8,Teacher,English,Secondary
9,Teacher,Music,Primary


In [63]:
result = Submission.to_json('Final Submission.json',orient="records")