<h4>$$ Job\ matching\ skills $$</h4>

#### Importing required libraries

In [0]:
import pandas as pd
import numpy as np
import ast

In [0]:
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas / scikit-learn,
# which makes API removals harder to spot — consider narrowing the filter.
warnings.filterwarnings("ignore")

#### Creating the widgets

In [0]:
# Clear any widgets already attached to this notebook so the definitions below start clean.
dbutils.widgets.removeAll()

In [0]:
# Text widgets carrying the job requirements. The default values are Python-literal
# strings (a list of skill names, a list of years of experience) because the values
# are parsed with ast.literal_eval further down.
dbutils.widgets.text("skill_requirement", "['boiler marker']", "skill_requirement")
dbutils.widgets.text("Experience", "[2]", "Experience")

In [0]:
# Parse the widget text into Python objects. ast.literal_eval only evaluates literals
# (lists, strings, numbers, ...), so arbitrary expressions typed into a widget are rejected.
skill_requirement = ast.literal_eval(dbutils.widgets.get("skill_requirement"))
Experience = ast.literal_eval(dbutils.widgets.get("Experience"))

In [0]:
# Echo the parsed requirements as a quick sanity check.
print(skill_requirement, Experience)

#### Reading the data

In [0]:
# Read the worker CSV into a Spark DataFrame: the first row is used as the header
# and column types are inferred from the data (inferSchema=True).
worker_data = spark.read.option("header", True).csv("/mnt/tf-abfss/data/ds/job_matching/Job matching dataset.csv", inferSchema=True)

In [0]:
# Convert the Spark DataFrame to a pandas DataFrame (toPandas collects all rows onto the driver).
worker_pdf = worker_data.toPandas()
# Preview the first five rows.
worker_pdf.head()

Unnamed: 0,Unique ID,First Name,Last Name,Place of residence,Years of Exp,Gender,Skill set,Preferred cities
0,B00000337-P2,JOSEPH,BOTTA,BROOKLYN,6,men,Ground worker,BROOKLYN
1,B00003646-I1,SOHEL,AHMED,RICHMONDHILL,1,men,Carpenter,BROOKLYN
2,B00000388-P3,SHAWN,STILES,WASHINGTON,6,men,Steel Fixer,BROOKLYN
3,B00000511-I1,WAYNE,NORBECK,NEW YORK,3,women,Concrete worker,BROOKLYN
4,B00001693-I1,PAUL,BAILEY,NEW YORK,1,men,Electrician,BROOKLYN


In [0]:
# Rename the "Years of Exp" column to "Experience" (modifies worker_pdf in place).
worker_pdf.rename(columns = {"Years of Exp":"Experience"}, inplace=True)

In [0]:
# Number of rows for each distinct value of "Skill set".
worker_pdf['Skill set'].value_counts()

In [0]:
# Number of rows for each distinct value of "Preferred cities".
worker_pdf['Preferred cities'].value_counts()

##### Function to convert the values in the dataframe to lower case.

In [0]:
# Lower-case every text (object-dtype) column of a DataFrame.
def lower_case_values(df):
    """Lower-case the values of all object-dtype columns of ``df``.

    The frame is modified in place: each object column is replaced by the
    result of its ``.str.lower()``. Columns of any other dtype are left alone.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object columns should be lower-cased.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, after modification.
    """
    text_columns = list(df.select_dtypes(include="object").columns)
    for column_name in text_columns:
        lowered = df[column_name].str.lower()
        df[column_name] = lowered
    return df

In [0]:
#lower_case_values(company_pdf)

In [0]:
# Lower-case the text columns of worker_pdf. lower_case_values modifies the frame
# in place and returns it, so the returned frame is shown as this cell's output.
lower_case_values(worker_pdf)

Unnamed: 0,Unique ID,First Name,Last Name,Place of residence,Experience,Gender,Skill set,Preferred cities
0,b00000337-p2,joseph,botta,brooklyn,6,men,ground worker,brooklyn
1,b00003646-i1,sohel,ahmed,richmondhill,1,men,carpenter,brooklyn
2,b00000388-p3,shawn,stiles,washington,6,men,steel fixer,brooklyn
3,b00000511-i1,wayne,norbeck,new york,3,women,concrete worker,brooklyn
4,b00001693-i1,paul,bailey,new york,1,men,electrician,brooklyn
...,...,...,...,...,...,...,...,...
1494,q00039795-i1,leon,skvirsky,huntingdon valley,6,men,steel fixer,queens
1495,q00039801-i1,xiaohong,zhao,flushing,6,men,ground worker,queens
1496,q00039803-i1,leon,skvirsky,huntingdon valley,17,women,electrician,queens
1497,q00039809-i1,xiaohong,zhao,flushing,1,men,steel fixer,queens


#### Matching the records from worker data using company requirement.

In [0]:
# Filter the worker data down to rows matching the required skills and experience.
def filtering_records(dataframe, required_skill, required_experience):
    """Return the rows of ``dataframe`` that satisfy both requirements.

    A row is kept when its "Skill set" value is one of ``required_skill`` AND
    its "Experience" value is one of ``required_experience``. The original row
    labels of ``dataframe`` are preserved in the result.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns "Skill set" and "Experience".
    required_skill : list-like or scalar
        Accepted skill value(s). A single value is treated as a one-item list.
    required_experience : list-like or scalar
        Accepted experience value(s). A single value is treated as a one-item list.

    Returns
    -------
    pandas.DataFrame
        The matching subset of ``dataframe``.
    """
    # Use the arguments that were passed in rather than notebook-level globals,
    # so the function gives the same answer regardless of kernel state.
    if not pd.api.types.is_list_like(required_skill):
        required_skill = [required_skill]
    if not pd.api.types.is_list_like(required_experience):
        required_experience = [required_experience]

    skill_mask = dataframe["Skill set"].isin(required_skill)
    experience_mask = dataframe["Experience"].isin(required_experience)
    filtered_records = dataframe[skill_mask & experience_mask]
    return filtered_records

In [0]:
# Filter worker_pdf using the skill and experience requirements parsed from the widgets.
filtered_records = filtering_records(worker_pdf, skill_requirement, Experience)

In [0]:
# Display the filtered frame (row labels are those of worker_pdf).
filtered_records

Unnamed: 0,Unique ID,First Name,Last Name,Place of residence,Experience,Gender,Skill set,Preferred cities
9,q00003779-i1,raymond,chan,flushing,2,men,steel fixer,queens
13,q00004061-i1,daniel,odigie,hempstead,2,men,boiler marker,queens
62,q00005042-i1,leon,skvirsky,huntingdon valley,2,men,boiler marker,queens
63,q00005043-i1,leon,skvirsky,huntingdon valley,3,men,steel fixer,queens
75,q00005258-i1,anthony,depasquale,garden city,2,men,steel fixer,queens
...,...,...,...,...,...,...,...,...
1413,q00038412-i1,kojo,simpson,hollis,3,men,steel fixer,queens
1417,q00038481-i1,anastasios,tzakas,baldwin,3,non-binary gendered employees,steel fixer,queens
1457,q00039394-i1,mohammad,alauddin,jamaica,3,men,steel fixer,queens
1482,q00039642-i1,alfredo,fredericks,east elmhurst,2,women,steel fixer,queens


#### TF-IDF vectorizer

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
# Convert the text column "Skill set" to numeric features with a TF-IDF vectorizer.
# ngram_range=(1,2) produces both single words and two-word phrases as features;
# English stop words are dropped.
worker_v_1 = TfidfVectorizer(ngram_range=(1,2),stop_words="english")
# Fit on the filtered skills and transform them in one step; the result is converted
# to a dense array with .toarray() in a later cell.
tranform_df = worker_v_1.fit_transform(filtered_records['Skill set'].to_list())
# vocabulary_ maps each learned term to its column index in the transformed matrix.
print(worker_v_1.vocabulary_)

In [0]:
# Assign the feature names learned by the fitted vectorizer to a variable.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so use
# get_feature_names_out() when it exists and fall back to the old method only on
# older releases. Either way feature_names_1 ends up as a plain list of strings.
if hasattr(worker_v_1, "get_feature_names_out"):
    feature_names_1 = worker_v_1.get_feature_names_out().tolist()
else:
    feature_names_1 = worker_v_1.get_feature_names()

# Print each term with its inverse document frequency (idf) value; vocabulary_ gives
# the position of the term inside the idf_ array.
for word in feature_names_1:
    index = worker_v_1.vocabulary_.get(word)
    print(f"{word} {worker_v_1.idf_[index]}")

In [0]:
# Dense TF-IDF matrix as a DataFrame, one column per vocabulary term.
trns_df = pd.DataFrame(data=tranform_df.toarray(), columns=feature_names_1)
# Single-column frame of the experience values, re-labelled 0..n-1 so that it lines up
# row-for-row with trns_df.
featured_experience = filtered_records[['Experience']].reset_index(drop=True)
trns_df.head()

Unnamed: 0,boiler,boiler marker,fixer,marker,steel,steel fixer
0,0.0,0.0,0.57735,0.0,0.57735,0.57735
1,0.57735,0.57735,0.0,0.57735,0.0,0.0
2,0.57735,0.57735,0.0,0.57735,0.0,0.0
3,0.0,0.0,0.57735,0.0,0.57735,0.57735
4,0.0,0.0,0.57735,0.0,0.57735,0.57735


In [0]:
# Concatenate the TF-IDF feature columns with the experience column, side by side.
new_df = pd.concat([trns_df, featured_experience], axis=1)
# Neither input frame has an "indexs" column, so set_index("indexs") cannot work.
# Label the rows with the original row labels carried by filtered_records instead
# (both frames were built from filtered_records in the same row order), and name
# that index "indexs".
new_df.index = filtered_records.index.rename("indexs")

#### Defining the independent and dependent variables

In [0]:
# Dependent variable: the experience column.
target = new_df['Experience']
# Independent variables: every remaining column of new_df.
inputs = new_df.drop(columns=['Experience'])

##### Splitting the data into train and test sets

In [0]:
# Split the data: 80% for training and 20% for testing.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size=0.20, random_state=100)

In [0]:
# Show the shapes of the train and test partitions (features, then target).
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [0]:
import mlflow
# Turn on MLflow autologging for scikit-learn estimators.
# NOTE(review): the registration cell later expects an artifact at "<run>/model",
# which presumably comes from this autologging — confirm.
mlflow.sklearn.autolog()

In [0]:
# KNN classifier: fit on the training split inside an MLflow run and log test accuracy.
from sklearn.neighbors import KNeighborsClassifier

with mlflow.start_run(run_name="matching_classifier"):
    # Keep the run id; it is needed after the run closes to build the model URI.
    run_id = mlflow.active_run().info.run_id

    # KNN classifier model using the 7 nearest neighbours.
    knn = KNeighborsClassifier(n_neighbors=7)
    knn.fit(X_train, y_train)

    # Predict the test split once; the same predictions were previously computed twice.
    y_pred = knn.predict(X_test)
    # Alias kept so any later cell that refers to y_test_pred still works.
    y_test_pred = y_pred
    score = accuracy_score(y_test, y_pred)

    mlflow.log_metric("accuracy_score", score)

#### Registering the model

In [0]:
# Name under which the model is registered.
model_name = "Matching_skills"

# URI of the "model" artifact of the training run (run_id was captured inside the run).
model_uri = f"runs:/{ run_id }/model"
# Register that artifact under model_name; the returned object describes the registered version.
registered_model_version = mlflow.register_model(model_uri, model_name)