### Import required packages

In [None]:
import json
import random
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

### Reading json data

In [None]:
def get_data(file_path):
    """Read the UTF-8 encoded JSON file at ``file_path`` and return its parsed content."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


### Paths to the input files

In [None]:

# Relative path of the JSON dataset that is read by get_data() below.
file = "data/full_dataset.json"

#### Read files

In [None]:
# Parse the JSON file at `file`; the result is iterated record by record in json2DataFrame().
json_data = get_data(file)


### Separate the records and convert them to a DataFrame

In [None]:
def json2DataFrame(json_data):
    """Convert an iterable of job-posting records into a pandas DataFrame.

    Each record must provide the keys ``text``, ``full_time_required``,
    ``driving_license_required``, ``experience_required`` and
    ``education_required``. The text is lower-cased; the four label values
    are copied unchanged. Any other keys in a record are ignored.

    Returns a DataFrame with one row per record and the five columns above,
    in that order.
    """
    label_fields = [
        "full_time_required",
        "driving_license_required",
        "experience_required",
        "education_required",
    ]
    rows = [
        (entry["text"].lower(), *(entry[field] for field in label_fields))
        for entry in json_data
    ]
    return pd.DataFrame(rows, columns=["text", *label_fields])

In [None]:
# Build the DataFrame from the loaded records and preview its first rows.
data_ = json2DataFrame(json_data)
data_.head()

### Creating the class and prediction functions that look words up in the lexicons and classify input text

In [None]:
class ClassificationLex:
    """Lexicon-based binary classifier for four job-posting attributes.

    For each task (full time, driving licence, experience, education)
    ``fit`` builds two word-count dictionaries: one from the texts labelled
    true and one from the texts labelled false. The most frequent words of
    every dictionary are then dropped as stop words. Prediction counts how
    many words of a text are present in each of the two dictionaries and
    picks the side with more hits.
    """

    # (lexicon attribute for label true, lexicon attribute for label false),
    # listed in the order in which the label columns follow the text column
    # in the records passed to fit().
    _LEXICON_PAIRS = (
        ("job_type_fullTime", "job_type_partTime"),
        ("drv_lice_req_true", "drv_lice_req_false"),
        ("exp_req_true", "exp_req_false"),
        ("education_req", "education_not_req"),
    )

    def __init__(self):
        self._reset_lexicons()
        self.ready = None  # set to True once fit() has completed
        self.top_k_stop_words = 80  # number of most frequent words dropped per lexicon

    def _reset_lexicons(self):
        """Replace all eight lexicons with empty dictionaries."""
        self.job_type_fullTime = {}
        self.job_type_partTime = {}
        self.drv_lice_req_true = {}
        self.drv_lice_req_false = {}
        self.exp_req_true = {}
        self.exp_req_false = {}
        self.education_req = {}
        self.education_not_req = {}

    def fit(self, data_df):
        """Populate the lexicons from labelled training data.

        Args:
            data_df: object whose ``.values`` yields rows of the form
                (text, full_time_required, driving_license_required,
                experience_required, education_required).

        Every whitespace-separated word of a text is counted in the "true"
        or "false" lexicon of each task according to that task's label; a
        label equal to neither True nor False contributes to neither.
        Afterwards the ``top_k_stop_words`` most frequent words are removed
        from every lexicon. The lexicons are rebuilt from scratch on every
        call, so fitting again never mixes in counts from an earlier fit.
        """
        self._reset_lexicons()

        for record in data_df.values:
            words = record[0].split()
            labels = (record[1], record[2], record[3], record[4])
            for (true_name, false_name), label in zip(self._LEXICON_PAIRS, labels):
                # Equality (not identity) on purpose: labels may be plain
                # bools or bool-like scalars coming out of the array.
                if label == True:  # noqa: E712
                    lexicon = getattr(self, true_name)
                elif label == False:  # noqa: E712
                    lexicon = getattr(self, false_name)
                else:
                    continue
                for word in words:
                    lexicon[word] = lexicon.get(word, 0) + 1

        # Drop the most frequent words of every lexicon. sorted() is stable,
        # so words with equal counts keep their first-seen order.
        for pair in self._LEXICON_PAIRS:
            for name in pair:
                ranked = sorted(getattr(self, name).items(),
                                key=lambda kv: kv[1], reverse=True)
                setattr(self, name, dict(ranked[self.top_k_stop_words:]))

        print("model is ready for predicions")
        self.ready = True

    def _predict_lookup(self, text, true_lexicon, false_lexicon, tie_result):
        """Classify each text by comparing word hits in two lexicons.

        Args:
            text: iterable of input strings.
            true_lexicon: lexicon whose hits vote for True.
            false_lexicon: lexicon whose hits vote for False.
            tie_result: value returned when both sides have equal hits.

        Returns:
            A list with one bool per input text, or None (after printing a
            hint) when the model has not been fitted yet.
        """
        if self.ready is None:
            print("please fit the model before you perform inference")
            return None

        result_set = []
        for text_entry in text:
            words = text_entry.lower().split()
            # Every occurrence of a known word counts as one hit; the stored
            # frequency of the word is not used for scoring.
            true_hits = sum(1 for word in words if word in true_lexicon)
            false_hits = sum(1 for word in words if word in false_lexicon)
            if true_hits == false_hits:
                result_set.append(tie_result)
            else:
                result_set.append(true_hits > false_hits)
        return result_set

    def predict_job_type_lookup(self, text):
        """Predict ``full_time_required`` for each text; ties yield False.

        input: list of input text to be predicted
        """
        return self._predict_lookup(
            text, self.job_type_fullTime, self.job_type_partTime, False)

    def predict_driving_license_lookup(self, text):
        """Predict ``driving_license_required`` for each text; ties yield False.

        input: list of input text to be predicted
        """
        return self._predict_lookup(
            text, self.drv_lice_req_true, self.drv_lice_req_false, False)

    def predict_experience_lookup(self, text):
        """Predict ``experience_required`` for each text; ties yield False.

        input: list of input text to be predicted
        """
        return self._predict_lookup(
            text, self.exp_req_true, self.exp_req_false, False)

    def predict_education_lookup(self, text):
        """Predict ``education_required`` for each text; ties yield True.

        input: list of input text to be predicted
        """
        # Unlike the other three tasks, an equal score resolves to True here.
        return self._predict_lookup(
            text, self.education_req, self.education_not_req, True)

    def _accuracy(self, test, label_column, predict):
        """Return the accuracy of ``predict`` on ``test[label_column]``.

        Labels and predictions are compared as lower-cased strings, so a
        label stored as the bool True or as the string "True" both compare
        equal to a True prediction.
        """
        predicted = [str(i).lower() for i in predict(test["text"])]
        expected = [str(i).lower() for i in test[label_column]]
        return accuracy_score(expected, predicted)

    def evaluate_all(self, test):
        """Return a dict with the accuracy of all four tasks on ``test``."""
        acc_job = self.evaluate_job(test)
        acc_drv = self.evaluate_drv_licence(test)
        acc_exp = self.evaluate_exp(test)
        acc_edu = self.evaluate_edu(test)
        return {"job_type": acc_job, "driving_lic_req": acc_drv,
                "exp_req": acc_exp, "edu_req": acc_edu}

    def evaluate_job(self, test):
        """Return the accuracy of the job-type task on ``test``."""
        return self._accuracy(
            test, "full_time_required", self.predict_job_type_lookup)

    def evaluate_drv_licence(self, test):
        """Return the accuracy of the driving-licence task on ``test``."""
        return self._accuracy(
            test, "driving_license_required", self.predict_driving_license_lookup)

    def evaluate_exp(self, test):
        """Return the accuracy of the experience task on ``test``."""
        return self._accuracy(
            test, "experience_required", self.predict_experience_lookup)

    def evaluate_edu(self, test):
        """Return the accuracy of the education task on ``test``."""
        return self._accuracy(
            test, "education_required", self.predict_education_lookup)
    

### Split the data into Training and Test data

In [None]:
# Text column and the four label columns as separate Series.
X = data_['text']

y_job = data_['full_time_required']
y_drv_lic = data_['driving_license_required']
y_exp = data_['experience_required']
y_edu = data_['education_required']

# Split row numbers instead of the data itself, so one split serves all four
# label columns.
X_idx = list(range(len(data_)))
y_idx = list(range(len(data_)))

X_train, X_test, _, _ = train_test_split(
    X_idx, y_idx, test_size=0.20, random_state=42
)

# NOTE: .loc selects by index label, so this assumes data_ still carries the
# default 0..n-1 index, where labels coincide with row numbers.
train = data_.loc[X_train]

test = data_.loc[X_test]


#### Initialize Model Object

In [None]:
# Create an unfitted classifier; its lexicons are empty until fit() is called.
model = ClassificationLex() 

#### Fit model on input training data

In [None]:
# Build the word lexicons from the training split only.
model.fit(train)

### Evaluating Model for each task

### 1. Job type

In [None]:
# Texts of the test split; reused as the input of every prediction call below.
text_list = test["text"]

In [None]:
# Predict the full-time label for each test text.
pred_job_type = model.predict_job_type_lookup(text_list)

In [None]:
# Convert predictions to lower-cased strings (e.g. True -> "true") so they
# can be compared with the stringified labels.
pred_job_type = [str(i).lower() for i in pred_job_type]

In [None]:
# Ground-truth job-type labels, stringified and lower-cased the same way as
# the predictions.
lbl_jb_type = [str(i).lower() for i in test["full_time_required"]] 

#### Compute Accuracy and Error rate

In [None]:
# Count matches and mismatches between job-type labels and predictions.
correct = 0
mistakes = 0
total = 0
# The prediction variable must not be called `pd`: that name is the pandas
# alias imported at the top of the notebook, and rebinding it here would
# break every later `pd.` call in the session.
for _text, truth, predicted in zip(text_list, lbl_jb_type, pred_job_type):
    if truth == predicted:
        correct += 1
    else:
        mistakes += 1
    total += 1
print("correct/total:", correct, "/", total)
print("acc:", correct / total)
print("error/total", mistakes, "/", total)
print("error rate:", mistakes/total)

#### Classification Report for Job_type

In [None]:
# Print scikit-learn's classification report for the job-type task
# (ground truth first, predictions second).
print(classification_report(lbl_jb_type,  pred_job_type))

#### Compute Precision and Recall for Jobtype

In [None]:
# Precision and recall for job type. Labels and predictions are lower-cased
# strings at this point, so the positive class is the string "true".
pre_job = precision_score(lbl_jb_type,pred_job_type,pos_label="true")
print(pre_job)
re_job = recall_score(lbl_jb_type,pred_job_type,pos_label="true")
print(re_job)

### 2. Driving licence

In [None]:
# Predict the driving-licence label for each test text, then stringify and
# lower-case the predictions for comparison with the labels.
lbl_pred_drv = model.predict_driving_license_lookup(text_list)
lbl_pred_drv = [str(i).lower() for i in lbl_pred_drv]

In [None]:
# Ground-truth driving-licence labels as lower-cased strings.
lbl_drv_lice = [str(i).lower() for i in test["driving_license_required"]]

#### Classification Report for Driving Licence

In [None]:
# Print scikit-learn's classification report for the driving-licence task
# (ground truth first, predictions second).
print(classification_report(lbl_drv_lice,  lbl_pred_drv))

#### Compute Precision and Recall for Driving Licence

In [None]:
# Precision and recall for driving licence; the positive class is the
# lower-cased string "true".
pre_dri = precision_score(lbl_drv_lice,lbl_pred_drv,pos_label="true")
print(pre_dri)
re_dri = recall_score(lbl_drv_lice,lbl_pred_drv,pos_label="true")
print(re_dri)

### 3. Experience required

In [None]:
# Ground-truth experience labels as lower-cased strings.
lbl_exp_req = [str(i).lower() for i in test["experience_required"]]

In [None]:
# Predict the experience label for each test text, then stringify and
# lower-case the predictions for comparison with the labels.
exp_req_pred = model.predict_experience_lookup(text_list)
exp_req_pred = [str(i).lower() for i in exp_req_pred]

#### Classification Report for Experience

In [None]:
# Print scikit-learn's classification report for the experience task
# (ground truth first, predictions second).
print(classification_report(lbl_exp_req,  exp_req_pred))

#### Compute Precision and Recall for Experience

In [None]:
# Precision and recall for experience; the positive class is the
# lower-cased string "true".
pre_exp = precision_score(lbl_exp_req,exp_req_pred,pos_label="true")
print(pre_exp)
re_exp = recall_score(lbl_exp_req,exp_req_pred,pos_label="true")
print(re_exp)

### 4. Education

In [None]:
# Predict the education label for each test text, then stringify and
# lower-case the predictions for comparison with the labels.
lbl_education_pred = model.predict_education_lookup(text_list)
lbl_education_pred = [str(i).lower() for i in lbl_education_pred]

In [None]:
# Ground-truth education labels as lower-cased strings.
lbl_education = [str(i).lower() for i in test["education_required"]]

#### Classification Report for Education

In [None]:
# Print scikit-learn's classification report for the education task
# (ground truth first, predictions second).
print(classification_report(lbl_education,  lbl_education_pred))

#### Compute Precision and Recall for Education

In [None]:
# Precision and recall for education; the positive class is the
# lower-cased string "true".
pre_edu = precision_score(lbl_education,lbl_education_pred,pos_label="true")
re_edu = recall_score(lbl_education,lbl_education_pred,pos_label="true")
print(pre_edu)
print(re_edu)

#### Accuracy Results for all four tasks

In [None]:
# Accuracy of all four tasks on the test split, as a dict keyed by
# "job_type", "driving_lic_req", "exp_req" and "edu_req".
results = model.evaluate_all(test)

print(f"accuracy for job type:        {results['job_type']}")
print(f"accuracy for driving licence: {results['driving_lic_req']}")
print(f"accuracy for experince:       {results['exp_req']}")
print(f"accuracy for education:       {results['edu_req']}")