## Source Code

## Imports

In [1]:
# Load the autoreload extension and set it to mode 2, so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
import os
import pickle
from time import perf_counter
from pathlib import Path
from dotenv import find_dotenv, load_dotenv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pandas_profiling import ProfileReport
from sklearn.linear_model import Ridge, SGDRegressor, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

load_dotenv(find_dotenv())

# Make the project package `src` importable even when the notebook is started
# from a directory where it is not already on sys.path.
try:
    from src.config import COLUMNS
except ModuleNotFoundError as import_error:
    import sys
    ROOT_DIR = os.getenv('ROOT_DIR')
    if not ROOT_DIR:
        # Nothing usable to add to sys.path: appending None would be ignored by
        # the import system and the retry below would fail without explanation.
        raise ModuleNotFoundError(
            "Could not import 'src.config' and ROOT_DIR is not set; "
            "set ROOT_DIR (e.g. in .env) to the project root directory."
        ) from import_error
    if ROOT_DIR not in sys.path:
        sys.path.append(ROOT_DIR)

# Retry now that ROOT_DIR is on sys.path (a no-op if the first import succeeded).
from src.config import COLUMNS
from src.Data import DataParser, DataProcessor
from src.utils.plotters import (plot_scatter_and_histograms, 
    plot_correlations)
from src.Model import train_models, evaluate_model, process_test_data

np.random.seed(12)

In [3]:
# Locations come from the environment (load_dotenv has already been called above).
# DATASET_DIR is mandatory: index os.environ directly so that a missing variable
# fails with a KeyError naming it, instead of an opaque TypeError from Path(None).
DATASET_DIR = Path(os.environ['DATASET_DIR'])
MEDIA_DIR = os.getenv('MEDIA_DIR')    # None when the variable is unset
PICKLE_DIR = os.getenv('PICKLE_DIR')  # None when the variable is unset
DATASET_RAW = DATASET_DIR / 'data.csv'
DATASET_PARSED = DATASET_DIR / 'data_parsed.csv'

## Data Parsing

In [4]:
# Parse the raw dataset only when the parsed file is missing, and cache it on
# disk. When the cache exists this cell does nothing, so re-runs stay cheap while
# a fresh checkout can still run the notebook from top to bottom.
if not DATASET_PARSED.exists():
    parser = DataParser(data_path=DATASET_RAW)
    data_parsed = parser.parse(
        relevance=COLUMNS.RELEVANCE_COL,
        query_id=COLUMNS.QUERY_COL,
        document_id=COLUMNS.DOCUMENT_COL,
        feature_list=COLUMNS.FEATURE_LIST,
    )
    # Space-separated, no header row and no index column.
    data_parsed.to_csv(DATASET_PARSED, index=False, header=False, sep=' ')

In [5]:
# Load the parsed dataset (space-separated, no header row).
data = pd.read_csv(DATASET_PARSED, sep=' ', header=None)
# Check the width BEFORE attaching names: assigning a name list whose length does
# not match the frame raises pandas' own length-mismatch error first, so a check
# placed after the assignment cannot report a file with the wrong column count.
assert data.shape[1] == 49, 'Data does not have 49 columns.'
data.columns = [COLUMNS.RELEVANCE_COL, COLUMNS.QUERY_COL, *COLUMNS.FEATURE_LIST, COLUMNS.DOCUMENT_COL]

## Data Preview 

In [6]:
data.head()

Unnamed: 0,Relevance,Query id,TF(Term frequency) of body,TF of anchor,TF of title,TF of URL,TF of whole document,IDF(Inverse document frequency) of body,IDF of anchor,IDF of title,...,LMIR.JM of title,LMIR.JM of URL,LMIR.JM of whole document,PageRank,Inlink number,Outlink number,Number of slash in URL,Length of URL,Number of child page,Document_id
0,0,10,0.0,0.0,0.0,0.0,0.0,7.240045,23.625574,22.686609,...,,,,0.15,0.0,3.0,1.0,17.0,0.0,GX000-00-0000000
1,1,10,115.0,4.0,2.0,1.0,122.0,7.240045,23.625574,22.686609,...,-17.417649,-19.775857,-17.402933,0.395218,12.0,15.0,3.0,56.0,0.0,GX000-24-12369390
2,1,10,289.0,1.0,2.0,2.0,294.0,7.240045,23.625574,22.686609,...,-18.658086,-19.455287,-19.00691,0.156656,1.0,13.0,6.0,76.0,0.0,GX000-62-7863450
3,1,10,70.0,6.0,4.0,3.0,83.0,7.240045,23.625574,22.686609,...,-15.31405,-18.511568,-15.730176,0.160588,2.0,7.0,6.0,95.0,0.0,GX016-48-5543459
4,0,10,145.0,0.0,3.0,1.0,149.0,7.240045,23.625574,22.686609,...,-18.763536,-20.16401,-18.930452,0.196897,1.0,6.0,3.0,36.0,0.0,GX037-87-3082362


In [7]:
# Count of missing values per column.
null_count = data.isnull().sum(axis=0)
# null_count  # uncomment to display the per-column counts

In [9]:
# Correlation heatmap of the raw data. Some columns are highly correlated;
# they are handled in the following section.
heatmap = plot_correlations(
    data,
    save=False,
)
# heatmap.show()

In [8]:
# Scatter plot of every feature against Relevance, plus a histogram per feature.
# The query and document identifier columns are dropped from the plotted frame.
scatter_hist_unprocessed = plot_scatter_and_histograms(
    data.drop(columns=[COLUMNS.QUERY_COL, COLUMNS.DOCUMENT_COL]),
    save=False,
)
# scatter_hist_unprocessed.show()

## Comments on Unprocessed Data

 - `Document id` has ~65k unique values, which is too many for it to be used as a target.
 - `Relevance` is highly imbalanced in favour of 0, so it should not be used as a classification target without over- or under-sampling.
 - Each `Query id` has a different number of associated documents.
 - Certain columns are highly correlated (see the heatmap). This can be harmful for linear models, so it needs to be handled.
 - Certain columns consist mostly of zeros (see the histograms), for example `Number of child page`. They may be handled depending on the feature importance.
 - Columns starting with `LMIR` have a high missing-value ratio (see `null_count`). They will be either dropped or filled, depending on the ratio of null values.
 - Columns starting with `TF` or `TF*IDF` are highly skewed (see the histograms). They will be transformed to make them closer to normal, depending on the skewness value.
 - Some columns could be good indicators of the relevance score (see the scatter plots). `Length of URL` or `Number of slash in URL` could help predict a relevance of 0, for example.
 - The feature values are not on the same scale. They will be scaled before model training.

The `DataProcessor` class takes the unprocessed data and performs the following steps in order:
- Drop columns whose null ratio is larger than the threshold (0.3). The remaining columns with null values will be filled with their mean.
- Apply PowerTransform to the columns whose skewness is larger than the threshold (1.0).
- Calculate correlations between columns and drop one column of each highly correlated pair.
- Apply StandardScaler to the features before training.

In [9]:
# Build the processor on the raw frame, naming the relevance, query-id and
# document-id columns, then run it and preview the first rows of the result.
data_processor = DataProcessor(dataframe=data, relevance=COLUMNS.RELEVANCE_COL,
                               query_id=COLUMNS.QUERY_COL, document_id=COLUMNS.DOCUMENT_COL)
data_processed = data_processor.process()
data_processed.head(5)

Unnamed: 0,Relevance,Query id,TF of whole document,IDF of whole document,TF*IDF of whole document,DL of anchor,DL of title,DL of whole document,BM25 of whole document,LMIR.ABS of URL,LMIR.DIR of title,LMIR.JM of anchor,LMIR.JM of whole document,PageRank,Inlink number,Outlink number,Number of slash in URL,Length of URL,Document_id
0,0,10,-2.534214,-1.209678,-0.28214,-1.620282,0.097315,-1.507657,-0.762059,-0.866915,-0.444454,-2.132536,-0.193213,-0.791695,-1.711672,0.332982,-2.355576,-3.213291,GX000-00-0000000
1,1,10,0.725382,-1.209678,-0.117994,0.797433,0.097315,0.193416,1.40234,1.249162,1.823825,-0.798217,0.386129,1.001268,1.530104,1.186544,-1.026665,0.386896,GX000-24-12369390
2,1,10,1.250905,-1.209678,0.327201,0.899297,1.607694,0.993202,0.949565,0.757818,1.685471,-0.792164,0.230073,-0.712411,-0.312935,1.120934,0.966702,1.223616,GX000-62-7863450
3,1,10,0.490217,-1.209678,-0.105006,1.280447,1.374502,0.00301,1.419453,1.178066,1.929276,-0.238175,0.560429,-0.666746,0.274834,0.810576,0.966702,1.812487,GX016-48-5543459
4,0,10,0.846284,-1.209678,-0.049336,-0.027473,1.678088,0.537073,-0.762059,0.888596,1.607646,-0.965324,0.237289,-0.282798,-0.312935,0.727232,-1.026665,-0.886862,GX037-87-3082362


In [42]:
# Persist the processor (after `process()` has run) so it can be reused later.
# Building the path with Path instead of an f-string makes an unset PICKLE_DIR
# fail loudly rather than silently target a literal "None/" directory.
pickle_dir = Path(PICKLE_DIR)
pickle_dir.mkdir(parents=True, exist_ok=True)  # the directory may not exist on a first run
with open(pickle_dir / 'data_processor.pkl', 'wb') as processor_file:
    pickle.dump(data_processor, processor_file)

## Comments on modelling

Ranking can be achieved in different ways:
<br><br>
**Pointwise:** Considers every query-document pair as independent and tries to predict the relevance score for that pair. It could be a good option for a starting point. It discards the relation between documents for a query.

| Input             | Target (relevance score) |
| -----------       | -----------              |
| Q1, D1 features   | 2                        |
| Q1, D2 features   | 0                        |
|     ...           |        ...               |


**Pairwise:** Considers the relation between the combinations of documents as well. 

| Input                  | Target (1 if first document is more relevant) |
| -----------            | -----------              |
| Q1, (D1,D3) features   | 1                        |
| Q1, (D2,D7) features   | 0                        |
|     ...                |        ...               |


**Listwise**: Improves on the pairwise methods by considering all the documents for a query together.
 
| Input                   | Target (relevance scores for each document)   |
| -----------             | -----------                                   |
| Q1, (D1,...) features   | (2,0,1,...)                                   |
| Q2, (D2,...) features   | (0,0,1,...)                                   |
|     ...                 |        ...                                    |

*Since I have little prior knowledge of ranking, I will stick with a simpler approach and use the pointwise method. Since the relevance score is imbalanced, I will go with a regression model instead of a classification model.*

## Decisions on modelling

- Since the data has more than a few features, I will use Ridge regression (scikit-learn's `Ridge`) as one of the models to avoid overfitting.
- RandomForestRegressor as an ensemble option.
- I will split the data into 80% train and 20% validation sets, stratified on the relevance score.
- Features will be the input; the relevance score will be the target.

*I tried different models such as SVR, SGDRegressor and Lasso, but they did not improve the NDCG score much.*

## Splitting Data and Training

In [10]:
# Inputs: every processed column listed in FEATURE_LIST. Target: the relevance column.
features = data_processed.loc[:, data_processed.columns.isin(COLUMNS.FEATURE_LIST)]
target = data_processed.loc[:, COLUMNS.RELEVANCE_COL]

# 80/20 hold-out split, stratified on the target, with a fixed seed.
feature_train, feature_val, target_train, target_val = train_test_split(
    features,
    target,
    test_size=0.2,
    stratify=target,
    random_state=12,
)

# Estimators to fit; each key is forwarded to `train_models` as a keyword argument.
models = dict(
    ridge=Ridge(solver='auto'),
    # random_forest=RandomForestRegressor(n_estimators=80, max_depth=10, random_state=12),
)
results = train_models(x_train=feature_train, y_train=target_train, save=False, **models)
results

{'ridge': {'model': Ridge(), 'elapsed': 0.019866}}

## Evaluation

In [11]:
# NDCG for each query in the validation set, as {query: ndcg_score}.
ndcg_val = evaluate_model(
    x_val=feature_val,
    y_val=target_val,
    df_processed=data_processed,
    model=results['ridge']['model'],
)
# Unweighted mean over the queries.
average_ndcg_score = sum(ndcg_val.values()) / len(ndcg_val)
average_ndcg_score

0.5220139178487002

## Predictions

In [40]:
# Process the test file with the processor object built for the training data.
test_data_processed = process_test_data(
    DATASET_DIR.joinpath('test.csv'),
    processor=data_processor,
)
test_data_processed

Unnamed: 0,TF of whole document,IDF of whole document,TF*IDF of whole document,DL of anchor,DL of title,DL of whole document,BM25 of whole document,LMIR.ABS of URL,LMIR.DIR of title,LMIR.JM of anchor,LMIR.JM of whole document,PageRank,Inlink number,Outlink number,Number of slash in URL,Length of URL
0,-2.534214,-1.209678,-0.28214,-1.620282,0.097315,-1.507657,-0.762059,-0.866915,-0.444454,-2.132536,0.404071,-0.791695,-1.711672,0.332982,-2.355576,-3.213291
1,-0.396788,-1.209678,-0.222967,-0.027473,0.466251,-0.878243,1.477344,1.250112,-0.444454,-0.249346,0.573851,-0.778598,-0.312935,-1.044387,-0.362209,0.284867
2,-0.282394,-1.209678,-0.236282,0.68215,0.617935,-0.791327,0.729951,0.427134,1.607646,-1.390566,-0.148873,1.232822,-0.312935,0.810576,-1.026665,-0.576384
3,-0.256776,-1.209678,-0.203476,0.797433,-0.137235,-1.188431,1.364254,1.238516,1.900934,-0.220148,0.507077,1.513542,1.003362,0.626118,1.631158,1.642274
4,-0.208483,-1.209678,-0.188775,0.899297,-0.817451,-1.092953,1.429398,1.314399,1.99231,-0.069867,0.789103,1.680576,1.003362,0.810576,1.631158,1.582598


In [41]:
def predict(test_df, model):
    """Return `model`'s predictions for `test_df`.

    Args:
        test_df: Input passed straight to `model.predict`.
        model: Any object exposing a `predict` method.

    Returns:
        Whatever `model.predict(test_df)` returns, unchanged.
    """
    return model.predict(test_df)

# Score the processed test rows with the trained ridge model.
predict(
    test_df=test_data_processed,
    model=results['ridge']['model'],
)

array([0.02663432, 0.38388373, 0.14187799, 0.28170344, 0.34645533])

# References
- [Intro to Learn to Rank](https://everdark.github.io/k9/notebooks/ml/learning_to_rank/learning_to_rank.html)
- [Learning to Rank with scikit-learn](http://fa.bianp.net/blog/2012/learning-to-rank-with-scikit-learn-the-pairwise-transform/)
- [Ranking methods](https://www.youtube.com/watch?v=yKwTAcsV8K8&ab_channel=ritvikmath)
- [NDCG](https://www.youtube.com/watch?v=6BGCn3h59nA&ab_channel=Databricks)
- [Partially inspected](https://fse.studenttheses.ub.rug.nl/18052/1/AI_BA_2018_KLEMENVONCINA.pdf)