In [7]:
import re
import string
import pandas as pd
import numpy as np
from numpy import absolute

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

import xgboost as xgb
from xgboost import XGBRegressor

In [9]:
# import nltk
# try:
#     nltk.download('stopwords', quiet=True)
#     nltk.download('punkt', quiet=True)
# except FileExistsError:
#     pass

# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize

In [60]:
train = pd.read_csv("../datasets/Train_rev1.csv")
# test  = pd.read_csv("Test_rev1.csv")
# valid = pd.read_csv("Valid_rev1.csv")

In [61]:
train.head(3)

Unnamed: 0,Id,Title,FullDescription,LocationRaw,LocationNormalized,ContractType,ContractTime,Company,Category,SalaryRaw,SalaryNormalized,SourceName
0,12612628,Engineering Systems Analyst,Engineering Systems Analyst Dorking Surrey Sal...,"Dorking, Surrey, Surrey",Dorking,,permanent,Gregory Martin International,Engineering Jobs,20000 - 30000/annum 20-30K,25000,cv-library.co.uk
1,12612830,Stress Engineer Glasgow,Stress Engineer Glasgow Salary **** to **** We...,"Glasgow, Scotland, Scotland",Glasgow,,permanent,Gregory Martin International,Engineering Jobs,25000 - 35000/annum 25-35K,30000,cv-library.co.uk
2,12612844,Modelling and simulation analyst,Mathematical Modeller / Simulation Analyst / O...,"Hampshire, South East, South East",Hampshire,,permanent,Gregory Martin International,Engineering Jobs,20000 - 40000/annum 20-40K,30000,cv-library.co.uk


In [62]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244768 entries, 0 to 244767
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Id                  244768 non-null  int64 
 1   Title               244767 non-null  object
 2   FullDescription     244768 non-null  object
 3   LocationRaw         244768 non-null  object
 4   LocationNormalized  244768 non-null  object
 5   ContractType        65442 non-null   object
 6   ContractTime        180863 non-null  object
 7   Company             212338 non-null  object
 8   Category            244768 non-null  object
 9   SalaryRaw           244768 non-null  object
 10  SalaryNormalized    244768 non-null  int64 
 11  SourceName          244767 non-null  object
dtypes: int64(2), object(10)
memory usage: 22.4+ MB


In [63]:
train.SalaryNormalized.describe()

count    244768.000000
mean      34122.577576
std       17640.543124
min        5000.000000
25%       21500.000000
50%       30000.000000
75%       42500.000000
max      200000.000000
Name: SalaryNormalized, dtype: float64

In [64]:
train = train.sample(frac=1)

In [65]:
X = train[["Title", "FullDescription", "LocationNormalized", "ContractTime", "Company", "Category", "SourceName"]].fillna(0)

y = train.SalaryNormalized.fillna(0)

In [66]:
X.head()

Unnamed: 0,Title,FullDescription,LocationNormalized,ContractTime,Company,Category,SourceName
149456,Laminator,My Walsall Based Client is looking for a lamin...,Walsall,contract,Brookstreet UK,Manufacturing Jobs,cv-library.co.uk
185900,Technical Support Engineer (Voice and Networking),Job Title Technical Support Engineer (Voice a...,London,permanent,Bluefire Consulting,Other/General Jobs,totaljobs.com
199036,Marketing Administrator 6 month contract,Established company based between Gloucester a...,Cheltenham,contract,Adecco Group,"PR, Advertising & Marketing Jobs",totaljobs.com
220843,Solution Architect/Business Consultant Retail...,Solution Architect Business Consultant Retai...,Gipsy Hill,permanent,Spargonet Consulting,IT Jobs,jobg8.com
158402,Account Executive Leading PR Agency Consumer...,The Company. An incredible opportunity to join...,Central London,permanent,Reuben Sinclair,"PR, Advertising & Marketing Jobs",gorkanajobs.co.uk


In [67]:
X['concat_features'] = X.astype(str).apply(' '.join, axis=1)

In [68]:
X.concat_features

149456    Laminator My Walsall Based Client is looking f...
185900    Technical Support Engineer (Voice and Networki...
199036    Marketing Administrator  6 month contract Esta...
220843    Solution Architect/Business Consultant  Retail...
158402    Account Executive  Leading PR Agency  Consumer...
                                ...                        
15741     STRATEGIC LEADER OF ICT Aspiration and Achieve...
103504    Mechanical Maintenance Engineer Mechanical Mai...
172246    OD And Talent HRBP FTC Our client, an internat...
174458    Liability Insurance Claims Advisor Background ...
187803    Yard amp; Logistics Supervisor Company Overvie...
Name: concat_features, Length: 244768, dtype: object

In [69]:
"""
Text Sanitazation
"""

def text_sanitazation(content):
    stop_words = set(stopwords.words("english"))

    # Making its content lower case
    content = content.lower()

    # Removing HTML Tags
    html_removal_code = re.compile('<.*?>') 
    content = re.sub(html_removal_code, '', content)

    # Removing ponctuation
    content = content.translate(str.maketrans("", "", string.punctuation))

    # Removing white spaces
    content = content.strip()

    return content

In [70]:
X.concat_features = X.concat_features.apply(text_sanitazation)

In [71]:
X.concat_features

149456    laminator my walsall based client is looking f...
185900    technical support engineer voice and networkin...
199036    marketing administrator  6 month contract esta...
220843    solution architectbusiness consultant  retailg...
158402    account executive  leading pr agency  consumer...
                                ...                        
15741     strategic leader of ict aspiration and achieve...
103504    mechanical maintenance engineer mechanical mai...
172246    od and talent hrbp ftc our client an internati...
174458    liability insurance claims advisor background ...
187803    yard amp logistics supervisor company overview...
Name: concat_features, Length: 244768, dtype: object

In [72]:
"""
Train test splitting
"""

X_train, X_test, y_train, y_test = train_test_split(X.concat_features, y, test_size=0.33, random_state=67)
# X_train, X_test, y_train, y_test = train_test_split(X.concat_features.iloc[:50000], y.iloc[:50000], test_size=0.33, random_state=67)

In [73]:
"""
Text Vectorizing using TFIDF-VECTORIZER
"""
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    norm='l2',
    min_df=0,
    smooth_idf=False,
#     max_features=15000)
    max_features=7000)

# word_vectorizer.fit(X.concat_features)
word_vectorizer.fit(X_train)

# Xtr_vec = word_vectorizer.transform(X.concat_features)
Xtr_vec = word_vectorizer.transform(X_train)
Xte_vec = word_vectorizer.transform(X_test)

In [74]:
Xtr_vec

<33500x7000 sparse matrix of type '<class 'numpy.float64'>'
	with 4707945 stored elements in Compressed Sparse Row format>

In [75]:
Xte_vec

<16500x7000 sparse matrix of type '<class 'numpy.float64'>'
	with 2322153 stored elements in Compressed Sparse Row format>

### Linear Regression 

In [25]:
reg = LinearRegression(normalize=True)

In [26]:
"""
Kfold
"""

'\nKfold\n'

In [27]:
cv = KFold(n_splits=5)
scores = cross_val_score(reg, Xtr_vec, y_train, scoring='neg_mean_absolute_error', cv=cv)

In [28]:
scores = absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()) )

Mean MAE: 8817.498 (22.769)


In [None]:
"""
Holdout
"""

In [29]:
reg.fit(Xtr_vec, y_train)

LinearRegression(normalize=True)

In [30]:
y_pred = reg.predict(Xte_vec)

In [31]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f"MAE: {mae} | MSE: {mse} | RMSE: {rmse}")

MAE: 8454.812514771824 | MSE: 141284293.05195296 | RMSE: 11886.306955987337


### XGBoost Regressor 

In [41]:
xgbr = XGBRegressor(random_state=12)

In [42]:
"""
Kfold
"""

'\nKfold\n'

In [43]:
cv = KFold(n_splits=5)
scores = cross_val_score(xgbr, Xtr_vec, y_train, scoring='neg_mean_absolute_error', cv=cv)

In [44]:
scores = absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()) )

Mean MAE: 7896.183 (66.573)


In [45]:
"""
Holdout
"""

'\nHoldout\n'

In [46]:
xgbr.fit(Xtr_vec, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=12,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [47]:
score = xgbr.score(Xtr_vec, y_train)  
print("Training score: ", score)

Training score:  0.8092366976985482


In [48]:
y_pred = xgbr.predict(Xte_vec)

In [49]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f"MAE: {mae} | MSE: {mse} | RMSE: {rmse}")

MAE: 7943.081751657197 | MSE: 136057376.2495602 | RMSE: 11664.363516693065


### SVR

In [76]:
svr = SVR(kernel='rbf', C=1.0, epsilon=0.2)

In [77]:
"""
Kfold
"""

'\nKfold\n'

In [None]:
cv = KFold(n_splits=5)
scores = cross_val_score(svr, Xtr_vec, y_train, scoring='neg_mean_absolute_error', cv=cv)

In [None]:
scores = absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()) )

In [54]:
"""
Holdout
"""

'\nHoldout\n'

In [55]:
svr.fit(Xtr_vec, y_train)

SVR(epsilon=0.2, kernel='linear')

In [56]:
y_pred = svr.predict(Xte_vec)

In [57]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f"MAE: {mae} | MSE: {mse} | RMSE: {rmse}")

MAE: 12934.77046820659 | MSE: 330628498.31186867 | RMSE: 18183.19274252651
