In [1]:
!pip install sentence_transformers

import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings('ignore')

import os
os.chdir('/kaggle/input/')

from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
import torch

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25l- \ done
[?25h  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=aedaef6401ce6558c558c895aed35e84620030a856e13f3f229fbfa4a3e61029
  Stored in directory: /root/.cache/pip/wheels/83/71/2b/40d17d21937fed496fb99145227eca8f20b4891240ff60c86f
Successfully built sentence_transformers
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-2.2.2
[0m

In [2]:
# Number of nearest content items retrieved for every topic.
top_n = 10

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Sentence-Transformers checkpoints that are compared against each other.
model_paths = [
    "all-MiniLM-L6-v2",
    "paraphrase-MiniLM-L3-v2",
    "multi-qa-MiniLM-L6-cos-v1",
    "multi-qa-mpnet-base-dot-v1",
    "multi-qa-mpnet-base-cos-v1",
]

In [3]:
# Read the four competition tables (paths are relative to the current working directory).
topics_df, content_df, corr_df, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "learning-equality-curriculum-recommendations/topics.csv",
        "learning-equality-curriculum-recommendations/content.csv",
        "learning-equality-curriculum-recommendations/correlations.csv",
        "learning-equality-curriculum-recommendations/sample_submission.csv",
    )
)

In [4]:
# Prefix every content column so it cannot clash with topic columns later on.
content_df.columns = "content_" + content_df.columns

# Rename the space-separated id string column to "true_content" and split it into a
# list of content ids per topic.
corr_df = (
    corr_df
    .rename(columns={"content_ids": "true_content"})
    .assign(true_content=lambda frame: frame["true_content"].str.split())
)
corr_df.head()

Unnamed: 0,topic_id,true_content
0,t_00004da3a1b2,"[c_1108dd0c7a5d, c_376c5a8eb028, c_5bc0e1e2cba..."
1,t_00068291e9a4,"[c_639ea2ef9c95, c_89ce9367be10, c_ac1672cdcd2..."
2,t_00069b63a70a,[c_11a1dc0bfb99]
3,t_0006d41a73a8,"[c_0c6473c3480d, c_1c57a1316568, c_5e375cf14c4..."
4,t_0008768bdee6,"[c_34e1424229b4, c_7d1a964d66d5, c_aab93ee667f4]"


In [5]:
# Build a breadcrumb string ("root title >> child title >> ...") for every topic.
# Each channel is processed on its own, walking the hierarchy level by level from the
# root downwards so that a parent's breadcrumb exists before its children need it.
# The per-channel frames are collected in a list and concatenated once at the end
# instead of re-copying a growing frame on every iteration.
channel_frames = []

for channel in tqdm(topics_df["channel"].unique()):
    channel_df = topics_df[(topics_df["channel"] == channel)].reset_index(drop = True)
    for level in sorted(channel_df.level.unique()):

        # Level 0 seeds the "topic_tree" column: a root topic's tree is just its own title.
        if level == 0:
            root_rows = channel_df[channel_df["level"] == level]
            topic_tree = root_rows["title"].astype(str)
            # Pass the ids as a 1-D array, exactly like the per-level frame built below;
            # the previous code handed a 2-D DataFrame slice to the constructor here.
            topic_tree_df = pd.DataFrame([root_rows["id"].values, topic_tree.values]).T
            topic_tree_df.columns = ["child_id","topic_tree"]
            channel_df = channel_df.merge(topic_tree_df, left_on = "id", right_on = "child_id", how = "left").drop(["child_id"], axis = 1)

        # Pair every topic on this level (parent) with its direct children on the next
        # level by joining parent.id == child.parent.
        topic_df_parent = channel_df[channel_df["level"] == level][["id","title","parent","topic_tree"]]
        topic_df_parent.columns = "parent_" + topic_df_parent.columns

        topic_df_child = channel_df[channel_df["level"] == level + 1][["id","title","parent","topic_tree"]]
        topic_df_child.columns = "child_" + topic_df_child.columns

        topic_df_merged = topic_df_parent.merge(topic_df_child, left_on = "parent_id", right_on = "child_parent")[["child_id","parent_id","parent_title","child_title","parent_topic_tree"]]

        # A child's tree is its parent's tree followed by the child's own title.
        topic_tree = topic_df_merged["parent_topic_tree"].astype(str) + " >> " + topic_df_merged["child_title"].astype(str)

        topic_tree_df = pd.DataFrame([topic_df_merged["child_id"].values,topic_tree.values]).T
        topic_tree_df.columns = ["child_id","topic_tree"]

        # Merging adds a second topic_tree column (suffixes _x/_y); keep the value that
        # already existed and fill the gaps with the breadcrumbs computed for this level.
        channel_df = channel_df.merge(topic_tree_df, left_on = "id", right_on = "child_id", how = "left").drop(["child_id"], axis = 1)
        if "topic_tree_y" in list(channel_df.columns):
            channel_df["topic_tree"] = channel_df["topic_tree_x"].combine_first(channel_df["topic_tree_y"])
            channel_df = channel_df.drop(["topic_tree_x","topic_tree_y"], axis = 1)

    channel_frames.append(channel_df)

# pd.concat refuses an empty list, so fall back to an empty frame when there are no channels.
topics_df_topic_tree = pd.concat(channel_frames) if channel_frames else pd.DataFrame()
topics_df_topic_tree = topics_df_topic_tree.reset_index(drop = True)

# Prefix all topic columns with "topic_", but keep the breadcrumb column named "topic_tree".
topics_df_topic_tree.columns = ["topic_"+ column for column in topics_df_topic_tree.columns]
topics_df_topic_tree = topics_df_topic_tree.rename(columns = {"topic_topic_tree":"topic_tree"})

100%|██████████| 171/171 [00:14<00:00, 11.79it/s]


In [6]:
import re
# Ordered (compiled pattern, replacement) pairs applied by clean_text.
# Raw string literals are used so the regex escapes are not treated as (invalid)
# Python string escapes, which newer interpreters warn about.
# NOTE(review): the first and third patterns require an '@' immediately before the
# closing ']' / '>' so they only strip spans ending in "@]" / "@>" - confirm the '@'
# is intentional and not a stray character.
_CLEAN_TEXT_RULES = (
    (re.compile(r'\[.*?@\]'), ''),                  # bracketed spans ending in "@]"
    (re.compile(r'https?://\S+|www\.\S+'), ''),     # URLs
    (re.compile(r'<.*?@>+'), ''),                   # angle-bracket spans ending in "@>"
    (re.compile(r'\n'), ' '),                       # newlines become spaces
    (re.compile(r'\@'), ''),                        # remaining '@' characters
    (re.compile(r'\_'), ''),                        # underscores
    (re.compile(r'\w*\d\w*'), ''),                  # any word containing a digit
)

def clean_text(text):
    """Normalise a piece of text before it is embedded.

    The value is converted with ``str`` (so ``None``/NaN become "none"/"nan"),
    lower-cased, and then each rule in ``_CLEAN_TEXT_RULES`` is applied in order.

    Args:
        text: Any object; it is stringified before cleaning.

    Returns:
        The cleaned, lower-cased string.
    """
    text = str(text).lower()
    for pattern, replacement in _CLEAN_TEXT_RULES:
        text = pattern.sub(replacement, text)
    return text

In [7]:
# Topic text = description followed by the topic's breadcrumb; missing parts become "".
topic_text = (
    topics_df_topic_tree["topic_description"].fillna("")
    + ". This is the topic tree "
    + topics_df_topic_tree["topic_tree"].fillna("")
)
topics_df_topic_tree["topic_description"] = topic_text.progress_apply(clean_text)

# Content text = description, then body text, then title; missing parts become "".
content_text = (
    content_df["content_description"].fillna("")
    + ". "
    + content_df["content_text"].fillna("")
    + ". The title of the content is "
    + content_df["content_title"].fillna("")
)
content_df["content_description"] = content_text.progress_apply(clean_text)

100%|██████████| 76972/76972 [00:02<00:00, 31564.42it/s]
100%|██████████| 154047/154047 [01:46<00:00, 1446.91it/s]


In [8]:
# Keep only topics that have content attached and are in English, then renumber rows.
topics_df_topic_tree = (
    topics_df_topic_tree
    .loc[lambda frame: frame["topic_has_content"]]
    .loc[lambda frame: frame["topic_language"] == "en"]
    .reset_index(drop=True)
)

# Keep only English content and renumber its rows from 0.
content_df = (
    content_df
    .loc[content_df["content_language"].eq("en")]
    .reset_index(drop=True)
)

In [9]:
# Plain Python lists of the ids, in the same row order as the filtered frames.
topic_ids = [*topics_df_topic_tree["topic_id"].values]
content_ids = [*content_df["content_id"].values]

In [10]:
def apk(y_true, y_pred):
    """Average precision of a ranked prediction list.

    Walks ``y_pred`` in rank order (rank starts at 1); every time a relevant item is
    found, the precision at that rank is added to a running sum. The sum is divided
    by the number of distinct relevant items.

    Args:
        y_true: Iterable of relevant (hashable) items.
        y_pred: Ranked iterable of predicted items, best first.

    Returns:
        The average precision as a float in [0, 1]; 0.0 when ``y_true`` is empty
        (previously this raised ZeroDivisionError).
    """
    relevant = set(y_true)  # O(1) membership checks instead of scanning a list
    if not relevant:
        return 0.0

    already_credited = set()
    correct_predictions = 0
    running_sum = 0.0
    for rank, predicted_item in enumerate(y_pred, start=1):
        # Credit each relevant item only once; otherwise a repeated prediction
        # would be counted again and could push the score above 1.
        if predicted_item in relevant and predicted_item not in already_credited:
            already_credited.add(predicted_item)
            correct_predictions += 1
            running_sum += correct_predictions / rank

    return running_sum / len(relevant)

def f2_score(y_true, y_pred):
    """F2 score between a set of true items and a set of predicted items.

    Both inputs are de-duplicated with ``set``. The score is computed directly from
    the confusion counts as ``tp / (tp + 0.2 * fp + 0.8 * fn)``.

    Args:
        y_true: Iterable of relevant (hashable) items.
        y_pred: Iterable of predicted (hashable) items.

    Returns:
        The F2 score as a float in [0, 1]. Returns 0.0 when both inputs are empty
        (the formula's denominator is zero in that case).
    """
    y_true = set(y_true)
    y_pred = set(y_pred)
    tp = len(y_true & y_pred)
    fp = len(y_pred - y_true)
    fn = len(y_true - y_pred)

    # Precision and recall are not needed for the formula below; computing them
    # separately only introduced a ZeroDivisionError for empty predictions or truth.
    denominator = tp + 0.2 * fp + 0.8 * fn
    if denominator == 0:
        return 0.0

    return tp / denominator

In [11]:
# Evaluate every checkpoint in model_paths: embed topics and content, retrieve the
# top_n nearest content items per topic, score them against the true correlations and
# write the per-topic predictions to disk. One summary dict per model is collected.
model_evaluation = []

# These inputs do not depend on the model, so they are prepared once.
content_texts = list(content_df["content_description"].values)
topic_texts = list(topics_df_topic_tree["topic_description"].values)
# kneighbors returns row positions within content_vectors, which are encoded in
# content_df row order, so a positional lookup into this array yields the content ids.
content_id_lookup = content_df["content_id"].to_numpy()

for model_path in tqdm(model_paths):

    print("-"*100)
    print(f"Model Running: {model_path}")

    # Initialize Model
    model = SentenceTransformer(f"sentence-transformers/{model_path}", device = device)

    # Create embeddings for content and topics
    content_vectors = model.encode(content_texts, show_progress_bar = True)
    topic_vectors = model.encode(topic_texts, show_progress_bar = True)

    print("-"*20)
    print(f"UNN Running: {model_path}")

    # Fit nearest neighbors on content and find neighbors in content for topic
    nbrs = NearestNeighbors(n_neighbors = top_n, metric = "cosine").fit(content_vectors)
    dist, nb = nbrs.kneighbors(topic_vectors)

    # Translate neighbour positions into content ids with a single array lookup
    # (one row of top_n ids per topic) instead of one merge per neighbour rank.
    pred_array = content_id_lookup[nb[:, :top_n]]

    # One row per topic; the "predicted_content" cell holds that topic's array of ids.
    pred_content_df = pd.DataFrame([list(pred_array)]).T
    pred_content_df = topics_df_topic_tree[["topic_id"]].merge(pred_content_df, right_index = True, left_index = True)
    pred_content_df.columns = ["topic_id","predicted_content"]

    # Merge to correlation dataframe to get true content ids
    pred_content_df = pred_content_df.merge(corr_df, on = "topic_id")

    # Create Evaluation Metrics
    true_and_predicted = list(zip(pred_content_df["true_content"], pred_content_df["predicted_content"]))
    pred_content_df["AP@k"] = [apk(true_ids, predicted_ids) for true_ids, predicted_ids in true_and_predicted]
    pred_content_df["F2_Score"] = [f2_score(true_ids, predicted_ids) for true_ids, predicted_ids in true_and_predicted]

    # Aggregate once and reuse for both the log output and the summary record.
    mean_ap_pct = round(pred_content_df['AP@k'].mean()*100,2)
    mean_f2_pct = round(pred_content_df['F2_Score'].mean()*100,2)

    print("-"*20)
    print(f"Evaluation Running: {model_path}")

    print(f"Average Precision @k: {mean_ap_pct}%")
    print(f"Average F2 Score:     {mean_f2_pct}%")

    model_evaluation.append({"Model": model_path,
                            "Average F2 Score":mean_f2_pct,
                            "Average Precision @k":mean_ap_pct})

    # Best-scoring topics first in the exported file.
    pred_content_df = pred_content_df.sort_values(["AP@k","F2_Score"], ascending = False)

    print(f"Writing predictions: {model_path}")
    pred_content_df.to_csv(f"/kaggle/working/prediction_df_{model_path}.csv", index = False)

  0%|          | 0/5 [00:00<?, ?it/s]

----------------------------------------------------------------------------------------------------
Model Running: all-MiniLM-L6-v2


Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/2061 [00:00<?, ?it/s]

Batches:   0%|          | 0/877 [00:00<?, ?it/s]

--------------------
UNN Running: all-MiniLM-L6-v2
--------------------
Evaluation Running: all-MiniLM-L6-v2
Average Precision @k: 21.04%
Average F2 Score:     19.72%
Writing predictions: all-MiniLM-L6-v2


 20%|██        | 1/5 [06:42<26:50, 402.65s/it]

----------------------------------------------------------------------------------------------------
Model Running: paraphrase-MiniLM-L3-v2


Downloading (…)d7125/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)90f41d7125/README.md:   0%|          | 0.00/4.01k [00:00<?, ?B/s]

Downloading (…)f41d7125/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)d7125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading (…)90f41d7125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)41d7125/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Batches:   0%|          | 0/2061 [00:00<?, ?it/s]

Batches:   0%|          | 0/877 [00:00<?, ?it/s]

--------------------
UNN Running: paraphrase-MiniLM-L3-v2
--------------------
Evaluation Running: paraphrase-MiniLM-L3-v2
Average Precision @k: 11.93%
Average F2 Score:     12.19%
Writing predictions: paraphrase-MiniLM-L3-v2


 40%|████      | 2/5 [12:06<17:48, 356.26s/it]

----------------------------------------------------------------------------------------------------
Model Running: multi-qa-MiniLM-L6-cos-v1


Downloading (…)5fedf/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)2cb455fedf/README.md:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

Downloading (…)b455fedf/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)edf/data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)5fedf/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading (…)fedf/train_script.py:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

Downloading (…)2cb455fedf/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)455fedf/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/2061 [00:00<?, ?it/s]

Batches:   0%|          | 0/877 [00:00<?, ?it/s]

--------------------
UNN Running: multi-qa-MiniLM-L6-cos-v1
--------------------
Evaluation Running: multi-qa-MiniLM-L6-cos-v1
Average Precision @k: 21.97%
Average F2 Score:     20.45%
Writing predictions: multi-qa-MiniLM-L6-cos-v1


 60%|██████    | 3/5 [20:57<14:32, 436.11s/it]

----------------------------------------------------------------------------------------------------
Model Running: multi-qa-mpnet-base-dot-v1


Downloading (…)16ebc/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b6b5d16ebc/README.md:   0%|          | 0.00/8.65k [00:00<?, ?B/s]

Downloading (…)b5d16ebc/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)ebc/data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)16ebc/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)6ebc/train_script.py:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

Downloading (…)b6b5d16ebc/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5d16ebc/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Batches:   0%|          | 0/2061 [00:00<?, ?it/s]

Batches:   0%|          | 0/877 [00:00<?, ?it/s]

--------------------
UNN Running: multi-qa-mpnet-base-dot-v1
--------------------
Evaluation Running: multi-qa-mpnet-base-dot-v1
Average Precision @k: 25.16%
Average F2 Score:     21.59%
Writing predictions: multi-qa-mpnet-base-dot-v1


 80%|████████  | 4/5 [48:34<15:18, 918.02s/it]

----------------------------------------------------------------------------------------------------
Model Running: multi-qa-mpnet-base-cos-v1


Downloading (…)e891a/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)92a80e891a/README.md:   0%|          | 0.00/9.19k [00:00<?, ?B/s]

Downloading (…)a80e891a/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)91a/data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)e891a/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)891a/train_script.py:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

Downloading (…)92a80e891a/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)80e891a/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/2061 [00:00<?, ?it/s]

Batches:   0%|          | 0/877 [00:00<?, ?it/s]

--------------------
UNN Running: multi-qa-mpnet-base-cos-v1
--------------------
Evaluation Running: multi-qa-mpnet-base-cos-v1
Average Precision @k: 23.9%
Average F2 Score:     20.95%
Writing predictions: multi-qa-mpnet-base-cos-v1


100%|██████████| 5/5 [1:16:11<00:00, 914.29s/it] 


In [12]:
# Show the per-model metrics collected by the evaluation loop as a table.
print('Model Evaluation:\n')
evaluation_summary = pd.DataFrame.from_records(model_evaluation)
evaluation_summary

Model Evaluation:



Unnamed: 0,Model,Average F2 Score,Average Precision @k
0,all-MiniLM-L6-v2,19.72,21.04
1,paraphrase-MiniLM-L3-v2,12.19,11.93
2,multi-qa-MiniLM-L6-cos-v1,20.45,21.97
3,multi-qa-mpnet-base-dot-v1,21.59,25.16
4,multi-qa-mpnet-base-cos-v1,20.95,23.9
