# Queries

- "Apple pie"
- "Chicken" in the African category
- "Easy bread" less than 2h
- "Pasta bolognese"
- "Oatmeal"

Prioritize doing 3 of the queries first and running test models, then add more if possible.

# Scoring

Human classification of top 100 results obtained using the standard system (LTR-less). Scoring is done on a numeric scale from 0-5.

## Criteria

The attribution of a given score is a bit subjective but tries to follow the following guidelines:

0. A document that does not match the query.
1. A document that vaguely matches the query, is very incomplete (missing important fields, like instructions) and has no reviews. Or has very negative reviews.
2. A document that partially matches the query, is incomplete and has no reviews. Or a document with negative reviews.
3. A document that matches the query semantically, is reasonably complete (may miss more than two fields) and has at least one positive review.
4. A document that perfectly or almost perfectly matches the query semantically, is complete or missing just one of the fields and has a good number of positive reviews (5 to 20).
5. A document that perfectly matches the query semantically, is complete (the recipe has a full ingredient list, steps and cook time/nutritional information) and has a lot of positive reviews (more than 20).

In [109]:
import urllib.parse as urlp

URL = "http://localhost:8983/solr/recipes/select"
URL += "?rows=100"
URL += "&q.op=AND"
URL += "&q={q}"
URL += "&qf=" + "Name^5 Description Ingredients^2 Keywords^2 Instructions Reviews^0.5 AuthorName^0.2"
URL += "&wt=json"
URL += "&defType=edismax"
URL += "&fl=id,RecipeId,score,[features]"
URL += "&rq={{!ltr model=myModel reRankDocs=100 efi.text={q}}}"
URL += "&fq={fq}"

query = ["apple pie", "chicken", "easy bread", "pasta bolognese", "oatmeal"]
facet = ["", "Category_Facet:African", "", "", ""]
urls = [URL.format(q=query[i], fq=facet[i]) for i in range(len(query))]


In [7]:
import requests
import simplejson
import pandas as pd

for (idx, url) in enumerate(urls):
    response = requests.request("GET", url)
    json = simplejson.loads(response.text)

    for doc in json["response"]["docs"]:
        doc["URL"] = "http://localhost:3000/recipe/{0}".format(doc["RecipeId"]) 
        doc["query"] = query[idx]
        doc["facet"] = facet[idx]
    
    df = pd.DataFrame(json["response"]["docs"])
    df.to_csv("queries/query{0}_results.csv".format(idx+1), index=False)

# Modelling

Solr's LTR implementation supports two different kinds of models: Linear and Tree Based.

There are various algorithms that may be used in order to create these models. We will demonstrate the use of two, one for each type:

- A Linear Model using Support Vector Machines
- A Neural Network model built using RankNet

We will use SciKit Learn's SVM implementation and a RankNet implementation built into Keras.

In [102]:
import pandas as pd
import requests
import simplejson
import glob

result_files = glob.glob("queries/*_results.csv")
scores_files = glob.glob("queries/*_scores.csv")

result_files.sort()
scores_files.sort()

inputs = pd.concat((pd.read_csv(file) for file in result_files), ignore_index=True)
scores = pd.concat((pd.read_csv(file) for file in scores_files), ignore_index=True)

X = []
Y = [entry.score for entry in scores.itertuples()]


In [103]:
def get_features(entry):
    req_url = "http://localhost:8983/solr/recipes/select?rows=100&q.op=AND&q={q}&qf=Name^5%20Description%20Ingredients^2%20Keywords^2%20Instructions%20Reviews^0.5%20AuthorName^0.2&wt=json&defType=edismax&fl=[features]&fq={fq}&rq={rq}"
    facet = entry.facet
    if pd.isna(facet):
        facet = ""
    response = requests.request("GET", req_url.format(q=entry.query, fq=f"RecipeId:{entry.RecipeId} {facet}", rq=f"{{!ltr model=myModel efi.text='{entry.query}'}}"))
    json = simplejson.loads(response.text)
    return [float(feature.split("=")[1]) for feature in json["response"]["docs"][0]["[features]"].split(",")]


for entry in inputs.itertuples():
    X.append(get_features(entry))


In [87]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

scaler = StandardScaler()

scaler.fit(X)
X = scaler.fit_transform(X)

(train_x,
 test_x,
 train_y,
 test_y) = train_test_split(X, Y, test_size=0.25, random_state=1, stratify=Y)

 

In [104]:
from sklearn import svm, linear_model
from sklearn.metrics import r2_score

linearSVM = svm.LinearSVR()
lienarReg = linear_model.LinearRegression()

linearSVM.fit(train_x, train_y)
lienarReg.fit(train_x, train_y)

pred_svm = linearSVM.predict(test_x)
pred_reg = lienarReg.predict(test_x)

r2_score(test_y, pred_svm)



0.02679156372444591

In [108]:
with open("train_file.dat", 'w') as file:
    for i in range(len(result_files)):
        in_f = pd.read_csv(result_files[i])
        s_f = pd.read_csv(scores_files[i])
        feats = [" ".join([f"{idx+1}:{f}" for idx, f in enumerate(get_features(entry))]) for entry in in_f.itertuples()]
        r_ids = [entry.RecipeId for entry in in_f.itertuples()]
        scores = [entry.score for entry in s_f.itertuples()]
        file.writelines(f"{s} qid:{i} {f} # {id}\n" for f, s, id in zip(feats, scores, r_ids))



In [118]:
for (i, url) in enumerate(urls):
    response = requests.request("GET", url)
    json = simplejson.loads(response.text)

    scores = pd.read_csv(scores_files[i])

    for doc in json["response"]["docs"]:
        doc["score"] = scores.loc[scores["RecipeId"] == doc["RecipeId"]]["score"].values
        if len(doc["score"]) > 0:
            doc["score"] = doc["score"][0]
        else:
            doc["score"] = ""
        
    df = pd.DataFrame(json["response"]["docs"])
    df.to_csv("queries/query{0}_ltr.csv".format(i+1), index=False)
        

In [119]:
def probability_satisfied(grade):
    return (pow(2, grade) - 1) / 32

def probability_not_satisfied(probs):
    pns = 1
    for p in probs:
        pns = pns * (1 - p)
    return pns

def err(grades):
    err = 0
    probs = []

    for i, grade in enumerate(grades):
        k = i + 1
        ps = probability_satisfied(grade)
        pns = probability_not_satisfied(probs)
        
        err = err + (1 / k) * ps * pns

        probs.append(ps)

    return err

print(err([3, 2, 4]))

0.365997314453125


In [124]:
ltr_files = glob.glob("queries/*_ltr.csv")
ltr_files.sort()

for (i, ltr_file) in enumerate(ltr_files):
    og_scores = [entry.score for entry in pd.read_csv(scores_files[i]).itertuples()][:15]
    ltr_scores = [entry.score for entry in pd.read_csv(ltr_file).itertuples()][:15]
    print(f"==== QUERY {i+1} ====")
    print(f"Original: {og_scores}")
    print(f"Original score: {err(og_scores)}")
    print(f"Ltr: {ltr_scores}")
    print(f"Ltr score: {err(ltr_scores)}")
    print()


==== QUERY 1 ====
Original: [4, 2, 3, 3, 2, 2, 2, 5, 3, 3, 3, 2, 3, 3, 3]
Original score: 0.5892181081973864
Ltr: [3.0, 3.0, 4.0, 3.0, 3.0, 3.0, 5.0, 3.0, 4.0, 3.0, 3.0, 3.0, 4.0, 3.0, 5.0]
Ltr score: 0.457488682640339

==== QUERY 2 ====
Original: [4, 4, 2, 3, 3, 5, 4, 4, 4, 3, 4, 2, 2, 2, 2]
Original score: 0.6506136785874781
Ltr: [4, 5, 4, 1, 0, 2, 2, 0, 0, 0, 0, 0, 2, 2, 0]
Ltr score: 0.7290675037691942

==== QUERY 3 ====
Original: [2, 4, 5, 3, 1, 5, 4, 1, 4, 4, 2, 4, 3, 2, 4]
Original score: 0.46439582030465915
Ltr: [3.0, 5.0, 4.0, 4.0, 3.0, 3.0, 3.0, 3.0, 5.0, 5.0, 3.0, 4.0, 5.0, 3.0, 5.0]
Ltr score: 0.6035058210613998

==== QUERY 4 ====
Original: [3, 2, 2, 3, 2, 2, 2, 3, 3, 2, 3, 2, 3, 3, 2]
Original score: 0.364180810561534
Ltr: [3, 3, 2, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3]
Ltr score: 0.4076726652419933

==== QUERY 5 ====
Original: [4, 2, 3, 3, 1, 4, 4, 3, 3, 4, 3, 4, 4, 4, 4]
Original score: 0.5915420940740799
Ltr: [4.0, 5.0, 5.0, 3.0, 3.0, 5.0, 5.0, 4.0, 4.0, 4.0, 5.0, 4.0, 4.