<a href="https://colab.research.google.com/github/dmika1234/ml_uwr_22/blob/Project/Project/tfidvec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Models
from sklearn.linear_model import LogisticRegression

In [2]:
# Importing functions
## Locally 
#from funs import *

# Colab
import httpimport
with httpimport.github_repo(
     "dmika1234", "ml_uwr_22", module="Project", branch="Project"
):
     from Project.funs import *

## Loading the data

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Locally
# data_url = 'https://github.com/dmika1234/ml_uwr_22/blob/Project/Project/data/fake_job_postings.csv'
# data_path = 'data/fake_job_postings.csv'
# raw_data = pd.read_csv(data_path)


# For colab
data_url = '/content/drive/MyDrive/fake_job_postings.csv'
raw_data = pd.read_csv(data_url)


## Preprocessing with nltk

In [5]:
text_colnames = ['company_profile', 'description', 'requirements', 'benefits']
DataPrep = DataPreprocessor()

# text_data_ls = DataPrep.preprocess_data(text_data=raw_data, column_names=text_colnames, vectorize_fun=list)
text_data_np = DataPrep.preprocess_data(text_data=raw_data, column_names=text_colnames, vectorize_fun=np.array)
text_data_str = DataPrep.preprocess_data(text_data=raw_data, column_names=text_colnames, vectorize_fun=join_fun)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [6]:
working_df = raw_data.copy()

Spliting location into country, state, city

In [7]:
working_df[['country', 'state', 'city']] = working_df['location'].str.split(',', expand=True).iloc[:,0:3]

Splitting salary range into min, max salary

In [8]:
working_df[['salary_min', 'salary_max']] = working_df['salary_range'].str.split('-', expand=True)
working_df[['salary_min', 'salary_max']] = working_df[['salary_min', 'salary_max']].apply(pd.to_numeric, errors='coerce').fillna(0)

In [9]:
target_colname = 'fraudulent'
# Getting numerical colnames and deleting not useful
numerical_colnames = list(working_df.select_dtypes(include='int64').columns)
numerical_colnames = list(set(numerical_colnames) - set(['job_id', target_colname]))
numerical_colnames = numerical_colnames + ['salary_min', 'salary_max']
# Getting other text colnames and deleting not useful
other_text_colnames = list(set(working_df.select_dtypes(include='object').columns) - set(text_colnames))
other_text_colnames = list(set(other_text_colnames) - set(['location', 'salary_range']))
print(numerical_colnames)
print(text_colnames)
print(other_text_colnames)

['has_company_logo', 'has_questions', 'telecommuting', 'salary_min', 'salary_max']
['company_profile', 'description', 'requirements', 'benefits']
['employment_type', 'state', 'function', 'city', 'country', 'title', 'department', 'required_education', 'required_experience', 'industry']


Filling missing values

In [10]:
working_df[text_colnames + other_text_colnames] = working_df[text_colnames + other_text_colnames].fillna('')
working_df[numerical_colnames] = working_df[numerical_colnames].fillna(0)

In [11]:
working_df[other_text_colnames].apply(lambda x: np.unique(x).shape[0]).sort_values()

employment_type            6
required_experience        8
required_education        14
function                  38
country                   91
industry                 132
state                    326
department              1338
city                    2336
title                  11231
dtype: int64

We will only use those with not so much levels(<50 for start)

In [12]:
final_other_text_colnames = ['employment_type', 'required_experience', 'required_education', 'function']

In [13]:
X = pd.get_dummies(working_df[final_other_text_colnames], columns=final_other_text_colnames)
X[numerical_colnames] = working_df[numerical_colnames]
y = working_df[target_colname]

Splitting the data

In [14]:
train_indcs, test_indcs = get_train_test_indcs(raw_data, raw_data['fraudulent'],
 test_size=.1, random_state=42, stratify=raw_data['fraudulent'])

## BoW 

In [15]:
bow = BagofWords(text_data_np, text_colnames)
X_bow = bow.encode_onehot(1000)
X_bow_final = pd.concat((X, X_bow), axis=1)

## Using TfidVectorizer to change text to numerical vectors

In [16]:
TfTrans = TfidTranformer(50)
X_tfdif = TfTrans.vectorize_transform(text_data_str, train_indcs, text_colnames)
X_tfdif_final = pd.concat((X, X_tfdif), axis=1)

company_profile data successfuly transformed!
description data successfuly transformed!
requirements data successfuly transformed!
benefits data successfuly transformed!


# Modeling

### Train-test split

In [17]:
X_tf_train, X_tf_test, y_train, y_test = X_tfdif_final.iloc[train_indcs], X_tfdif_final.iloc[test_indcs], y[train_indcs], y[test_indcs]
X_bow_train, X_bow_test, y_train, y_test = X_bow_final.iloc[train_indcs], X_bow_final.iloc[test_indcs], y[train_indcs], y[test_indcs]

## Decision Tree

In [18]:
from sklearn.tree import DecisionTreeClassifier

clf_tree = DecisionTreeClassifier(random_state = 2137,
                                  class_weight = None)

clf_tree = clf_tree.fit(X_bow_train, y_train)
y_pred_tree = clf_tree.predict(X_bow_test)

In [19]:
res_tree = evaluate_performance(y_test, y_pred_tree) #None
res_tree

{'detection_percentage': 0.7471,
 'precision': 0.7558,
 'accuracy': 0.976,
 'f1_score': 0.7514,
 'auc_roc': 0.8674}

In [20]:
clf_tree2 = DecisionTreeClassifier(random_state = 2137,
                                  class_weight = "balanced")

clf_tree2 = clf_tree2.fit(X_bow_train, y_train)
y_pred_tree2 = clf_tree2.predict(X_bow_test)

In [21]:
res_tree2 = evaluate_performance(y_test, y_pred_tree2)
res_tree2

{'detection_percentage': 0.7471,
 'precision': 0.7222,
 'accuracy': 0.9737,
 'f1_score': 0.7344,
 'auc_roc': 0.8662}

In [22]:
[clf_tree.get_depth(), clf_tree.get_n_leaves(), clf_tree.get_params()]

[57,
 289,
 {'ccp_alpha': 0.0,
  'class_weight': None,
  'criterion': 'gini',
  'max_depth': None,
  'max_features': None,
  'max_leaf_nodes': None,
  'min_impurity_decrease': 0.0,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'min_weight_fraction_leaf': 0.0,
  'random_state': 2137,
  'splitter': 'best'}]

## Cross-validation

In [23]:
from sklearn.model_selection import cross_val_score

clf_tree_cv = DecisionTreeClassifier(random_state = 2137,
                                  class_weight = None)

cross_val_score(clf_tree_cv, X, y, cv=10)

array([0.95357942, 0.96308725, 0.95805369, 0.95190157, 0.9647651 ,
       0.95749441, 0.95861298, 0.96364653, 0.9647651 , 0.9647651 ])

# Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier

clf_forest = RandomForestClassifier(random_state = 2137, 
                                    n_estimators = 200,
                                    class_weight = None)

clf_forest = clf_forest.fit(X_bow_train,y_train)
y_pred_forest = clf_forest.predict(X_bow_test)

In [25]:
res_forest = evaluate_performance(y_test, y_pred_forest) #None
res_forest

{'detection_percentage': 0.5977,
 'precision': 1.0,
 'accuracy': 0.9804,
 'f1_score': 0.7482,
 'auc_roc': 0.7989}

In [26]:
clf_forest = RandomForestClassifier(random_state = 2137, 
                                    n_estimators = 200,
                                    class_weight = "balanced_subsample")

clf_forest = clf_forest.fit(X_bow_train,y_train)
y_pred_forest = clf_forest.predict(X_bow_test)

In [27]:
res_forest = evaluate_performance(y_test, y_pred_forest) #balanced_subsample
res_forest

{'detection_percentage': 0.6667,
 'precision': 0.9831,
 'accuracy': 0.9832,
 'f1_score': 0.7946,
 'auc_roc': 0.833}

In [28]:
clf_forest = RandomForestClassifier(random_state = 2137, 
                                    n_estimators = 200,
                                    class_weight = "balanced")

clf_forest = clf_forest.fit(X_bow_train,y_train)
y_pred_forest = clf_forest.predict(X_bow_test)

In [29]:
res_forest = evaluate_performance(y_test, y_pred_forest) #balanced
res_forest

{'detection_percentage': 0.6437,
 'precision': 0.9825,
 'accuracy': 0.9821,
 'f1_score': 0.7778,
 'auc_roc': 0.8215}