In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.keras as keras
import sklearn
%matplotlib inline

In [2]:
data = pd.read_csv("./Train_rev1.csv", index_col=None)
data.shape

(244768, 12)

In [9]:
data.sample(3)

Unnamed: 0,Id,Title,FullDescription,LocationRaw,LocationNormalized,ContractType,ContractTime,Company,Category,SalaryRaw,SalaryNormalized,SourceName,log_salary,log_salary_cat,Category_temp
169171,71337200,Senior Practitioner Family Support & Protection,Essex Permanent Full Time Salary up to **** pe...,"South East, Essex",UK,,,Blue Care,Social work Jobs,"upto 42,000 + generous benefits package",42000,jobs.communitycare.co.uk,10.645425,3.0,Social work Jobs
232634,72479394,Cardiac Physiologist South West England,Mediplacements are looking for an experienced ...,"South West,Not Specified",UK,,contract,Jobsite Jobs,Scientific & QA Jobs,40.00 GBP Hourly + Top rates paid + benefits,76800,jobsite.co.uk,11.24896,4.0,Scientific & QA Jobs
162297,71199437,C / ASPnet Developer,C/ASP.NET Developer Adam would like to meeta t...,Greater Manchester,Manchester,,permanent,Adam Recruitment Limited,"PR, Advertising & Marketing Jobs",25k - 35k pa + Dependent on Expience,30000,jobsite.co.uk,10.308953,2.0,"PR, Advertising & Marketing Jobs"


In [7]:
from sklearn.preprocessing import KBinsDiscretizer
data["log_salary"] = np.log(data["SalaryNormalized"])
kb = KBinsDiscretizer(
    n_bins=5, 
    encode="ordinal", 
    strategy="quantile"
).fit(data[["log_salary"]])
data["log_salary_cat"] = kb.transform(data[["log_salary"]]).ravel()
data["Category_temp"] = data["Category"]
data["Category_temp"].replace({"Domestic help & Cleaning Jobs" : "Other", "Part time Jobs": "Other"}, inplace=True)



In [8]:
from sklearn.model_selection import train_test_split
y = data["log_salary"]
X = data.drop(columns=["log_salary", "log_salary_cat", "Category_temp", "SalaryRaw", "SalaryNormalized"])
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42,
    stratify=data[["Category_temp", "log_salary_cat"]]
)

In [25]:
X_train["Title"].fillna("", inplace=True)
X_train["FullDescription"].fillna("", inplace=True)

In [54]:
X_train["text"] = X_train[["Title", "FullDescription"]].apply(lambda t: t["Title"] + t["FullDescription"], axis=1)

In [55]:
X_train[["text", "Title", "FullDescription"]]

Unnamed: 0,text,Title,FullDescription
133212,Senior Sales ExecutiveSenior Sales Executive ...,Senior Sales Executive,Senior Sales Executive Umbrella and Payroll S...
105742,Digital Account Manager Leading Global Consum...,Digital Account Manager Leading Global Consum...,As Digital Account Manager you will be the int...
67276,GPST**** General Medicine LASGPST**** General ...,GPST**** General Medicine LAS,GPST**** General Medicine Locum Appointment fo...
217942,"Software Support Engineer, DorsetA leading har...","Software Support Engineer, Dorset",A leading hardware and software company servic...
139748,Design Engineer Catia SPMMechanical Design E...,Design Engineer Catia SPM,Mechanical Design Engineer –Swindon– Catia Hux...
...,...,...,...
36568,Area Sales ManagerArea Sales Manager Arts and...,Area Sales Manager,Area Sales Manager Arts and Crafts Products/E...
228391,M&E SupervisorThe role will be overseeing site...,M&E Supervisor,The role will be overseeing site activity on a...
188559,"REGIONAL MANAGER IRELAND, AMAZING LADIES FASHI...","REGIONAL MANAGER IRELAND, AMAZING LADIES FASHI...",We are SEEKING a AREA MANAGERS/GROUP MANAGERS ...
110453,Web Platform AdministratorAn exciting opportun...,Web Platform Administrator,An exciting opportunity to work in part of a B...


In [59]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer

text_prep = Pipeline([
    ("enc", TfidfVectorizer(max_features=1000, stop_words="english", dtype="float32")),
    ("unsparse", FunctionTransformer(lambda x: x.toarray())),
    ("embed", PCA(n_components=100)),
])

prep_pipeline = ColumnTransformer([
    ("text", text_prep, "text"),
    ("cat", OneHotEncoder(
        drop="first", handle_unknown="ignore", min_frequency=0.1
    ), ["Category", "LocationNormalized", "ContractType", "ContractTime", "Company"]),
])

In [60]:
from sklearn.linear_model import LinearRegression

model_pipeline = Pipeline([
    ("prep", prep_pipeline),
    ("mod", LinearRegression())
])
model_pipeline.get_params()

{'memory': None,
 'steps': [('prep', ColumnTransformer(transformers=[('text',
                                    Pipeline(steps=[('enc',
                                                     TfidfVectorizer(dtype='float32',
                                                                     max_features=1000,
                                                                     stop_words='english')),
                                                    ('unsparse',
                                                     FunctionTransformer(func=<function <lambda> at 0x000001B56E4236D0>)),
                                                    ('embed',
                                                     PCA(n_components=100))]),
                                    'text'),
                                   ('cat',
                                    OneHotEncoder(drop='first',
                                                  handle_unknown='ignore',
                                       

In [61]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

main_params = {
    "prep__text__embed": [PCA(n_components=100), PCA(n_components=300), "passthrough"],
    'prep__text__enc__max_features': [500, 1000, 3000],
}

boost_params = [
    {
        "mod": [GradientBoostingRegressor()],  
        "mod__loss": ["squared_error", "absolute_error"],
        "mod__max_depth": [2, 3, 6, 10],
        "mod__learning_rate": [0.01, 0.01, 0.1],
        "mod__n_estimators": [1000],
        "mod__subsample": [0.5, 1],
        "mod__max_features": ["sqrt", 1],
        **main_params
    }, 
    {
        "mod": [GradientBoostingRegressor()],  
        "mod__loss": ["huber"],
        "mod__alpha": [0.9, 0.7, 0.5, 0.3, 0.1],
        "mod__max_depth": [2, 3, 6, 10],
        "mod__learning_rate": [0.01, 0.01, 0.1, 0.2, 0.3, 0.5],
        "mod__n_estimators": [1000],
        "mod__subsample": [0.5, 1],
        "mod__max_features": ["sqrt", 1],
        **main_params
    }, 
]
forest_params = [
    {
        "mod": [RandomForestRegressor()],  
        "mod__max_depth": [5, 10, None],
        "mod__min_samples_split": [5, 10, 20, 50],
        "mod__n_estimators": [1000],
        "mod__max_features": ["sqrt", 1],
        "mod__random_state": [42],
        **main_params
    },
]

In [None]:
from sklearn.model_selection import RandomizedSearchCV

gs_params = {
    "random_state": 42,
    "n_iter": 100,
    "cv": 5,
    "verbose": 2,
    "error_score": 'raise',
    "scoring": ["r2", "neg_root_mean_squared_error", "neg_mean_absolute_error"],
    "refit": "neg_root_mean_squared_error",
    "n_jobs": 4,
}

boost_gs = RandomizedSearchCV(
    model_pipeline, boost_params, 
    **gs_params
).fit(X_train, y_train)

forest_gs = RandomizedSearchCV(
    model_pipeline, forest_params, 
    **gs_params
).fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [37]:
model_pipeline.score(X_train, y_train)

0.5738945575910028

In [23]:
X_train

Unnamed: 0,Id,Title,FullDescription,LocationRaw,LocationNormalized,ContractType,ContractTime,Company,Category,SourceName
133212,70239186,Senior Sales Executive,Senior Sales Executive Umbrella and Payroll S...,"North West England, Cheshire, Lancashire",UK,,,BMS Sales Specialists LLP,Other/General Jobs,theladders.co.uk
105742,69585249,Digital Account Manager Leading Global Consum...,As Digital Account Manager you will be the int...,London,London,full_time,permanent,Hanson Search Ltd,"PR, Advertising & Marketing Jobs",gorkanajobs.co.uk
67276,68718305,GPST**** General Medicine LAS,GPST**** General Medicine Locum Appointment fo...,WesternIsles,Western Isles,,,Western Isles NHS Board,Healthcare & Nursing Jobs,jobs.scot.nhs.uk
217942,72295170,"Software Support Engineer, Dorset",A leading hardware and software company servic...,Dorset Dorset England,Dorset,,,System Recruitment,IT Jobs,gojobsearch.co.uk
139748,70533706,Design Engineer Catia SPM,Mechanical Design Engineer –Swindon– Catia Hux...,Swindon Wiltshire England,Swindon,,,Huxley Associates,Engineering Jobs,gojobsearch.co.uk
...,...,...,...,...,...,...,...,...,...,...
36568,68258411,Area Sales Manager,Area Sales Manager Arts and Crafts Products/E...,South Yorkshire Yorkshire,South Yorkshire,,,BMS Sales Specialists LLP,Sales Jobs,salestarget.co.uk
228391,72446715,M&E Supervisor,The role will be overseeing site activity on a...,North London London South East,North Lambeth,,contract,Linear Recruitment,Trade & Construction Jobs,totaljobs.com
188559,71693104,"REGIONAL MANAGER IRELAND, AMAZING LADIES FASHI...",We are SEEKING a AREA MANAGERS/GROUP MANAGERS ...,UK,UK,,permanent,Elite Associates,Retail Jobs,retailchoice.com
110453,69670601,Web Platform Administrator,An exciting opportunity to work in part of a B...,Exeter,Exeter,,permanent,Landmark Information Group Limited,IT Jobs,jobsite.co.uk
