# Classifying Job Postings
## Building the Model

This file contains the old logistic regression model that I created. However, the accuracy on this model differed greatly depending on the rows that were selected when I balance the dataset. Therefore, I decided to create a new text classifier model in `model.py`.

In [3]:
# Importing packages
import numpy as np
import pandas as pd
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

In [4]:
# Reading csv
df = pd.read_csv("fake_job_postings.csv")

# Cleaning/modifying data
split_location = df['location'].str.split(",", n = 2, expand = True)
df['country'] = split_location[0]
df['state/province'] = split_location[1]
df['city'] = split_location[2]
df['has_company_profile'] = np.where(pd.isna(df['company_profile']), 0, 1)
# df['description_count'] = 
df.drop(columns =['location', 'company_profile'], inplace = True)

In [5]:
df = pd.get_dummies(df, columns = ['employment_type', 'required_experience', 'required_education'], drop_first=True)

In [6]:
# Creating balanced dataframe
fraudulent = df[df['fraudulent'] == 1]
allReal = df[df['fraudulent'] == 0]
real = allReal.sample(len(fraudulent))
balancedDf = pd.concat([fraudulent, real])

# Separating the independent and dependent variables
x, y = balancedDf.drop(['job_id', 'title', 'department', 'salary_range', 'description', 'requirements', 'benefits', 'industry', 'function', 'country', 'state/province', 'city', 'fraudulent', 'title'], axis=1), balancedDf['fraudulent']

# Splitting training, testing, validation datasets
x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.30)
x_validation, x_test, y_validation, y_test = train_test_split(x_temp, y_temp, test_size=0.50)

In [17]:
lr = LogisticRegression(penalty='none', max_iter=1000)
lr.fit(x_train, y_train)

y_test_pred = lr.predict(x_test)
y_test_prob = lr.predict_proba(x_test)

print("% match is " ,sum(y_test == y_test_pred)/len(y_test))

print(lr.coef_, lr.intercept_)

def metrics(y_true, y_pred):
    print('Confusion matrix:\n', confusion_matrix(y_true, y_pred))
    print('\nReport:\n', classification_report(y_true, y_pred))
    
y_pred = lr.predict(x_validation) 
metrics(y_validation, y_pred)

% match is  0.7076923076923077
[[-1.45996965e-01 -1.27573625e+00 -3.85991334e-01 -1.88239559e+00
   3.26243674e-03 -1.26523682e-01  6.35400526e-01 -1.24128088e+00
   1.89186698e+00 -9.10152825e-02  1.83063329e+00 -1.37282591e+00
   5.84669802e-01 -5.39153723e-01  2.02813225e-01  2.66053264e+00
   9.52795530e+00  1.91350763e+00  9.08942138e-01  3.64359389e-01
   1.97554965e-01  1.88011521e+00  4.65837955e-02 -1.18021437e+01
   0.00000000e+00  0.00000000e+00]] [1.46815766]
Confusion matrix:
 [[ 97  28]
 [ 33 102]]

Report:
               precision    recall  f1-score   support

           0       0.75      0.78      0.76       125
           1       0.78      0.76      0.77       135

    accuracy                           0.77       260
   macro avg       0.77      0.77      0.77       260
weighted avg       0.77      0.77      0.77       260





In [18]:
# Example of predicting with the model
query_df = pd.DataFrame([{'telecommuting' : 0, 
                          'has_company_logo' : 1, 
                          'has_questions' : 1, 
                          'has_company_profile' : 1, 
                          'employment_type_Full-time' : 1,
                          'employment_type_Other' : 0, 
                          'employment_type_Part-time' : 0,
                          'employment_type_Temporary': 0,
                          'required_experience_Director' : 0,
                          'required_experience_Entry level' : 0,
                          'required_experience_Executive' : 0,
                          'required_experience_Internship' : 0,
                          'required_experience_Mid-Senior level' : 1,
                          'required_experience_Not Applicable' : 0,
                          "required_education_Bachelor's Degree" : 0,
                          'required_education_Certification' : 0, 
                          'required_education_Doctorate' : 0,
                          'required_education_High School or equivalent' : 1,
                          "required_education_Master's Degree" : 0,
                          'required_education_Professional' : 0,
                          'required_education_Some College Coursework Completed' : 0,
                          'required_education_Some High School Coursework' : 0,
                          'required_education_Unspecified' : 0,
                          'required_education_Vocational' : 0,
                          'required_education_Vocational - Degree' : 0,
                          'required_education_Vocational - HS Diploma' : 0}])
pred = lr.predict(query_df)
pred_prob = lr.predict_proba(query_df)
print(pred, pred_prob)

[1] [[0.39520753 0.60479247]]


## Exporting Model

In [None]:
import joblib
# modify the file path to where you want to save the model
joblib.dump(lr, 'app/model.pkl')