### Imports

In [1]:
import re
import string
import pandas as pd
import numpy as np
from numpy import absolute

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

import xgboost as xgb
from xgboost import XGBRegressor

In [2]:
import nltk
try:
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)
except FileExistsError:
    pass

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

### Importing and analyzing the dataset

In [3]:
train = pd.read_csv("../datasets/Train_rev1.csv")

In [4]:
train.head()

Unnamed: 0,Id,Title,FullDescription,LocationRaw,LocationNormalized,ContractType,ContractTime,Company,Category,SalaryRaw,SalaryNormalized,SourceName
0,12612628,Engineering Systems Analyst,Engineering Systems Analyst Dorking Surrey Sal...,"Dorking, Surrey, Surrey",Dorking,,permanent,Gregory Martin International,Engineering Jobs,20000 - 30000/annum 20-30K,25000,cv-library.co.uk
1,12612830,Stress Engineer Glasgow,Stress Engineer Glasgow Salary **** to **** We...,"Glasgow, Scotland, Scotland",Glasgow,,permanent,Gregory Martin International,Engineering Jobs,25000 - 35000/annum 25-35K,30000,cv-library.co.uk
2,12612844,Modelling and simulation analyst,Mathematical Modeller / Simulation Analyst / O...,"Hampshire, South East, South East",Hampshire,,permanent,Gregory Martin International,Engineering Jobs,20000 - 40000/annum 20-40K,30000,cv-library.co.uk
3,12613049,Engineering Systems Analyst / Mathematical Mod...,Engineering Systems Analyst / Mathematical Mod...,"Surrey, South East, South East",Surrey,,permanent,Gregory Martin International,Engineering Jobs,25000 - 30000/annum 25K-30K negotiable,27500,cv-library.co.uk
4,12613647,"Pioneer, Miser Engineering Systems Analyst","Pioneer, Miser Engineering Systems Analyst Do...","Surrey, South East, South East",Surrey,,permanent,Gregory Martin International,Engineering Jobs,20000 - 30000/annum 20-30K,25000,cv-library.co.uk


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244768 entries, 0 to 244767
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Id                  244768 non-null  int64 
 1   Title               244767 non-null  object
 2   FullDescription     244768 non-null  object
 3   LocationRaw         244768 non-null  object
 4   LocationNormalized  244768 non-null  object
 5   ContractType        65442 non-null   object
 6   ContractTime        180863 non-null  object
 7   Company             212338 non-null  object
 8   Category            244768 non-null  object
 9   SalaryRaw           244768 non-null  object
 10  SalaryNormalized    244768 non-null  int64 
 11  SourceName          244767 non-null  object
dtypes: int64(2), object(10)
memory usage: 22.4+ MB


In [6]:
train.SalaryNormalized.describe()

count    244768.000000
mean      34122.577576
std       17640.543124
min        5000.000000
25%       21500.000000
50%       30000.000000
75%       42500.000000
max      200000.000000
Name: SalaryNormalized, dtype: float64

### Preprocessing the data

In [7]:
train = train.sample(frac=1)

In [19]:
X = train[["Title", "FullDescription", "LocationNormalized", "ContractTime", "Company", "Category", "SourceName"]].fillna("missing value")

y = train.SalaryNormalized.fillna(0)

In [9]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244768 entries, 136883 to 86921
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Title               244768 non-null  object
 1   FullDescription     244768 non-null  object
 2   LocationNormalized  244768 non-null  object
 3   ContractTime        244768 non-null  object
 4   Company             244768 non-null  object
 5   Category            244768 non-null  object
 6   SourceName          244768 non-null  object
dtypes: object(7)
memory usage: 14.9+ MB


In [9]:
categorized_X = pd.concat([
        X.select_dtypes([], ['object']),
        X.select_dtypes(['object']).apply(pd.Series.astype, dtype='category')
        ], axis=1).reindex(X.columns, axis=1)

In [10]:
categorized_X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244768 entries, 119737 to 52640
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   Title               244768 non-null  category
 1   FullDescription     244768 non-null  category
 2   LocationNormalized  244768 non-null  category
 3   ContractTime        244768 non-null  category
 4   Company             244768 non-null  category
 5   Category            244768 non-null  category
 6   SourceName          244768 non-null  category
dtypes: category(7)
memory usage: 24.4 MB


In [11]:
categorized_X.head()

Unnamed: 0,Title,FullDescription,LocationNormalized,ContractTime,Company,Category,SourceName
119737,Behaviour Support Teaching Assistant,Hays Education are working in partnership with...,Leicester,contract,0,Teaching Jobs,hays.co.uk
46313,Car Sales Executive Car Jobs in Essex,Car Sales Executive Motor Trade Jobs in Essex...,Chelmsford,permanent,London4Jobs,Sales Jobs,london4jobs.co.uk
162046,Mobile Application Developer,Win Technologies has a brand new opportunity f...,London,permanent,Win Technologies Ltd,IT Jobs,jobsite.co.uk
234364,Business Development Manager London,Job Title &ndash Business Development Manager ...,London,permanent,h2 Recruit Ltd,Sales Jobs,strike-jobs.co.uk
138993,Headhunting Role for Competitive Grad/Second J...,Maiden Marc Associates are currently working e...,London,permanent,MAIDEN MARC ASSOC LTD,Sales Jobs,jobs.guardian.co.uk


In [50]:
X.loc[:, X.dtypes == 'object'] = X.select_dtypes(['object']).apply(lambda x: x.astype('category'))

In [13]:
X.info()

Unnamed: 0,Title,FullDescription,LocationNormalized,ContractTime,Company,Category,SourceName
119737,Behaviour Support Teaching Assistant,Hays Education are working in partnership with...,Leicester,contract,0,Teaching Jobs,hays.co.uk
46313,Car Sales Executive Car Jobs in Essex,Car Sales Executive Motor Trade Jobs in Essex...,Chelmsford,permanent,London4Jobs,Sales Jobs,london4jobs.co.uk
162046,Mobile Application Developer,Win Technologies has a brand new opportunity f...,London,permanent,Win Technologies Ltd,IT Jobs,jobsite.co.uk
234364,Business Development Manager London,Job Title &ndash Business Development Manager ...,London,permanent,h2 Recruit Ltd,Sales Jobs,strike-jobs.co.uk
138993,Headhunting Role for Competitive Grad/Second J...,Maiden Marc Associates are currently working e...,London,permanent,MAIDEN MARC ASSOC LTD,Sales Jobs,jobs.guardian.co.uk


In [10]:
X['concat_features'] = X.astype(str).apply(' '.join, axis=1)

In [11]:
X.concat_features.head()

136883    IOS lead developer iOS Lead Developer, London ...
171180    General Manager Our Engineering client based i...
238174    Registered General Nurse – Tunbridge Wells Sta...
210347    Ophthalmic clinical nurse / technician  Readin...
183559    Estimator  Interior and Exterior Refurbishment...
Name: concat_features, dtype: object

In [20]:
"""
Text Sanitazation
"""

def text_sanitazation(content):
    stop_words = set(stopwords.words("english"))
    
    # Making its content lower case
    content = content.lower()

    # Removing HTML Tags
    html_removal_code = re.compile('<.*?>') 
    content = re.sub(html_removal_code, '', content)

    # Removing ponctuation
    content = content.translate(str.maketrans("", "", string.punctuation))

    # Removing white spaces
    content = content.strip()
    
    # Removing stop words
    word_tokens = word_tokenize(content)
    filtered_text = ''
    for word in word_tokens:
        if word not in stop_words:
            filtered_text = filtered_text + word + " "
    content = filtered_text.strip()

    return content

In [13]:
X.concat_features = X.concat_features.apply(text_sanitazation)

In [14]:
X.concat_features.head()

136883    ios lead developer ios lead developer london c...
171180    general manager engineering client based rochd...
238174    registered general nurse – tunbridge wells sta...
210347    ophthalmic clinical nurse technician reading j...
183559    estimator interior exterior refurbishment fit ...
Name: concat_features, dtype: object

In [18]:
# X["cleaned_description"] = X.FullDescription.apply(text_sanitazation)
# X.head()

In [None]:
"""
Enconding all columns
"""

In [21]:
X.Title = X.Title.apply(text_sanitazation)
X.FullDescription = X.FullDescription.apply(text_sanitazation)
X.LocationNormalized = X.LocationNormalized.apply(text_sanitazation)
X.ContractTime = X.ContractTime.apply(text_sanitazation)
X.Company = X.Company.apply(text_sanitazation)
X.Category = X.Category.apply(text_sanitazation)
X.SourceName = X.SourceName.apply(text_sanitazation)

In [22]:
le = LabelEncoder()

X_copy = X.copy()

X_copy['Title'] = le.fit_transform(X_copy['Title'])
X_copy['FullDescription'] = le.fit_transform(X_copy['FullDescription'])
X_copy['LocationNormalized'] = le.fit_transform(X_copy['LocationNormalized'])
X_copy['ContractTime'] = le.fit_transform(X_copy['ContractTime'])
X_copy['Company'] = le.fit_transform(X_copy['Company'])
X_copy['Category'] = le.fit_transform(X_copy['Category'])
X_copy['SourceName'] = le.fit_transform(X_copy['SourceName'])

In [23]:
X_copy.head()

Unnamed: 0,Title,FullDescription,LocationNormalized,ContractTime,Company,Category,SourceName
136883,56434,114907,1491,2,11564,13,150
171180,44022,75743,1993,2,1772,8,25
238174,89543,213971,1303,1,916,10,117
210347,73810,122573,1964,1,9593,10,146
183559,37383,56017,563,2,18246,27,162


In [17]:
"""
Train test splitting
"""

# whole dataset
X_train, X_test, y_train, y_test = train_test_split(X.concat_features, y, test_size=0.33, random_state=67)

# cutted dataset
# X_train, X_test, y_train, y_test = train_test_split(X.concat_features.iloc[:100000], y.iloc[:100000], test_size=0.25, random_state=67)

# only full description
# X_train, X_test, y_train, y_test = train_test_split(X.cleaned_description, y, test_size=0.33, random_state=67)

### Text Vectorization process

In [32]:
"""
Text Vectorizing using TFIDF-VECTORIZER
"""
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(2, 2),
    norm='l2',
    min_df=0,
    smooth_idf=False,
#     max_features=15000)
    max_features=7000)

# word_vectorizer.fit(X.concat_features)
word_vectorizer.fit(X_train)

# Xtr_vec = word_vectorizer.transform(X.concat_features)
Xtr_vec = word_vectorizer.transform(X_train)
Xte_vec = word_vectorizer.transform(X_test)

In [26]:
"""
Using Bag of Words
"""
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(stop_words='english', analyzer='word')

X_bow = count_vect.fit(X_train)

Xtr_vec = X_bow.transform(X_train)
Xte_vec = X_bow.transform(X_test)

### XGBoost Regressor 

https://xgboost.readthedocs.io/en/stable/python/python_api.html


Usando bigrama com TDIDF e removendo stopword:<br>
MAE: 7691.859941605192 | MSE: 122899306.2126854 | RMSE: 11085.99595041805

In [26]:
xgbr = XGBRegressor(
    random_state=12, 
    n_estimators=150,
    max_depth=8, 
    learning_rate=0.4,
    verbosity=1
)

In [23]:
# Kfold

In [24]:
cv = KFold(n_splits=5)
scores = cross_val_score(xgbr, Xtr_vec, y_train, scoring='neg_mean_absolute_error', cv=cv)

In [25]:
scores = absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()) )

Mean MAE: 6988.755 (34.745)


In [26]:
# Holdout

In [34]:
xgbr.fit(Xtr_vec, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.4, max_delta_step=0,
             max_depth=8, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=150, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=12,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=1)

In [35]:
score = xgbr.score(Xtr_vec, y_train)  
print("Training score: ", score)

Training score:  0.7415196773176778


In [36]:
y_pred = xgbr.predict(Xte_vec)

In [37]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f"MAE: {mae} | MSE: {mse} | RMSE: {rmse}")

MAE: 7691.859941605192 | MSE: 122899306.2126854 | RMSE: 11085.99595041805


In [38]:
# Holdout (using the whole dataset label enconded)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X_copy, y, test_size=0.33, random_state=67)

In [27]:
xgbr.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.4, max_delta_step=0,
             max_depth=8, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=150, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=12,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=1)

In [28]:
score = xgbr.score(X_train, y_train)  
print("Training score: ", score)

Training score:  0.7403527259847542


In [29]:
y_pred = xgbr.predict(X_test)

In [30]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f"MAE: {mae} | MSE: {mse} | RMSE: {rmse}")

MAE: 7950.230200998999 | MSE: 132088104.93746936 | RMSE: 11492.958928729771


In [31]:
"""
RandomForestRegressor (test)
"""

''

In [32]:
from sklearn.ensemble import RandomForestRegressor

In [33]:
model = RandomForestRegressor(n_jobs=-1)

In [34]:
model.fit(X_train,y_train)

RandomForestRegressor(n_jobs=-1)

In [35]:
y_pred = model.predict(X_test)

In [36]:
mae_rand_forest = mean_absolute_error(y_test,y_pred)

In [37]:
mae_rand_forest

8610.384372032237