In [2]:
import pandas as pd

# Import prepared test file
search = pd.read_csv('search.csv')

In [3]:
# Only use numeric fields.
search_numeric = search.select_dtypes(['number'])

In [4]:
# Drop id related columns.
search_numeric = search_numeric.drop(['Unnamed: 0', 'id', 'product_uid'], axis=1)

In [5]:
search_numeric.head()

Unnamed: 0,relevance,desc_pct,title_pct,att_pct,desc_pct_prod,title_pct_prod,att_pct_prod,desc_phrase,title_phrase,att_phrase,search_split,title_split,desc_split
0,3.0,0.5,0.5,0.5,0.007752,0.25,0.014925,0,0,0,2,4,129
1,2.5,0.5,0.5,0.5,0.007752,0.25,0.014925,0,0,0,2,4,129
2,3.0,0.5,0.5,1.0,0.005988,0.090909,0.013514,0,0,0,2,11,167
3,2.33,0.333333,0.333333,0.666667,0.009615,0.076923,0.019802,0,0,0,3,13,104
4,2.67,1.0,1.0,1.0,0.028846,0.230769,0.029703,0,1,0,3,13,104


In [6]:
# Separate dependent and independent variables.
X_search = search_numeric.drop(['relevance'], axis=1)
y_search = search_numeric['relevance']

In [7]:
from sklearn.model_selection import train_test_split

# Split into train and test sets. k
X_train, X_test, y_train, y_test = train_test_split(X_search, y_search, test_size=0.3, random_state=44)

In [8]:
# Create dictionary that will store results of models.
rsme = {}

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from math import sqrt

# Initiate scaler.
scale = StandardScaler()

# Initiate regressor.
lr = LinearRegression()

# Create pipeline.
pl = Pipeline(steps=[('scale', scale), ('linreg', lr)])

# Create grid of parameters for tuning.
parameters_lr = {'linreg__fit_intercept':[True,False], 'linreg__normalize':[True,False], 'linreg__copy_X':[True, False]}

# Tune parameters.
grid_lr = GridSearchCV(pl, parameters_lr, cv=10)

# Fit model to training data.
grid_lr.fit(X_train, y_train)

# Generate predictions using the test set.
preds_lr = grid_lr.predict(X_test)

rsme['Linear Regression'] = sqrt(mean_squared_error(y_test, preds_lr))

In [10]:
# Lasso Regression

from sklearn.linear_model import Lasso

scale = StandardScaler()
lasso = Lasso()

pl = Pipeline(steps=[('scale', scale), ('lasso', lasso)])

parameters_lasso = {'lasso__alpha': [0.001, 0.01, 0.1], 'lasso__normalize':[True, False]}
grid_lasso = GridSearchCV(pl, parameters_lasso, cv=10)

grid_lasso.fit(X_train, y_train)

preds_lasso = grid_lasso.predict(X_test)

rsme['Lasso Regression'] = sqrt(mean_squared_error(y_test, preds_lasso))

In [11]:
# Ridge Regression

from sklearn.linear_model import Ridge

scale = StandardScaler()
ridge = Ridge()

pl = Pipeline(steps=[('scale', scale), ('ridge', ridge)])

parameters_ridge = {'ridge__alpha': [0.001, 0.01, 0.1], 'ridge__normalize':[True, False]}
grid_ridge = GridSearchCV(pl, parameters_ridge, cv=10)

grid_ridge.fit(X_train, y_train)

preds_ridge = grid_ridge.predict(X_test)

rsme['Ridge Regression'] = sqrt(mean_squared_error(y_test, preds_ridge))

In [12]:
# Random Forest Regression

from sklearn.ensemble import RandomForestRegressor

scale = StandardScaler()
rf = RandomForestRegressor(random_state=0)

pl = Pipeline(steps=[('scale', scale), ('rf', rf)])

parameters_rf = {'rf__n_estimators':[10, 15, 20], 'rf__max_depth':[4, 6, 8]}
grid_rf = GridSearchCV(pl, parameters_rf, cv=10)

grid_rf.fit(X_train, y_train)

preds_rf = grid_rf.predict(X_test)

rsme['Random Forest Regression'] = sqrt(mean_squared_error(y_test, preds_rf))

In [13]:
# SVM Regression (Grid Search omitted due to lenghty runtime)
from sklearn.svm import SVR

scale = StandardScaler()
svm = SVR()

pl = Pipeline(steps=[('scale', scale), ('svm', svm)])

pl.fit(X_train, y_train)

preds_svm = pl.predict(X_test)

rsme['SVM Regression'] = sqrt(mean_squared_error(y_test, preds_svm))

In [14]:
rsme

{'Lasso Regression': 0.49168357756907066,
 'Linear Regression': 0.4916733522802432,
 'Random Forest Regression': 0.48237982767047655,
 'Ridge Regression': 0.49168492960201243,
 'SVM Regression': 0.48984574477012166}

In [21]:
import operator

# Find model with the lowest rsme value.
best_model = min(rsme.items(), key=operator.itemgetter(1))[0]

print(f'The model with the lowest RSME value was {best_model}.')
print(f'RSME: {rsme[best_model]:.3f}')

The model with the lowest RSME value was Random Forest Regression.
RSME: 0.482


In [20]:
# Try to improve model through bagging.
from sklearn.ensemble import BaggingRegressor

bag = BaggingRegressor(grid_rf, n_estimators=50, max_samples=0.1)

bag.fit(X_train, y_train)

preds = bag.predict(X_test)

sqrt(mean_squared_error(y_test, preds))

0.48291548566140874

The Bagging Regressor did not improve the results.

In [26]:
# Import full test set.
test = pd.read_csv('search_test.csv')

In [28]:
# Perform same cleaning steps as above.
test_numeric = test.select_dtypes(['number'])
test_numeric = test_numeric.drop(['Unnamed: 0', 'id', 'product_uid'], axis=1)

In [31]:
# Fit chosen model to provided test set.
scale = StandardScaler()
rf = RandomForestRegressor(random_state=0)

pl = Pipeline(steps=[('scale', scale), ('rf', rf)])

parameters_rf = {'rf__n_estimators':[10, 15, 20], 'rf__max_depth':[4, 6, 8]}
grid_rf = GridSearchCV(pl, parameters_rf, cv=10)

grid_rf.fit(X_search, y_search)

preds_rf = grid_rf.predict(test_numeric)

In [33]:
# Prepare submission file for Kaggle.
submit = pd.DataFrame({'id':test['id'], 'relevance':preds_rf})
submit = submit_rf.drop_duplicates(['id'])
submit.to_csv(r'submission.csv', index=False)

The Kaggle submission gives a score of .48724.  This is consistent with the results above and the model translates well to unseen data.