In [16]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import torch
import cupy as cp

from tqdm.auto import tqdm
from sklearn.metrics import pairwise_distances
# from cuml.metrics import pairwise_distances

from transformers import AutoModel, AutoTokenizer

In [42]:
# Import
PATH = 'data'
content = pd.read_csv(f'{PATH}/content.csv')
correlation = pd.read_csv(f'{PATH}/correlations.csv')
topics = pd.read_csv(f'{PATH}/topics.csv')
submission = pd.read_csv(f'{PATH}/sample_submission.csv')

In [51]:
content

Unnamed: 0,id,title,description,kind,text,language,copyright_holder,license
0,c_00002381196d,"Sumar números de varios dígitos: 48,029+233,930","Suma 48,029+233,930 mediante el algoritmo está...",video,,es,,
1,c_000087304a9e,Trovare i fattori di un numero,Sal trova i fattori di 120.\n\n,video,,it,,
2,c_0000ad142ddb,Sumar curvas de demanda,Cómo añadir curvas de demanda\n\n,video,,es,,
3,c_0000c03adc8d,Nado de aproximação,Neste vídeo você vai aprender o nado de aproxi...,document,\nNado de aproximação\nSaber nadar nas ondas ...,pt,Sikana Education,CC BY-NC-ND
4,c_00016694ea2a,geometry-m3-topic-a-overview.pdf,geometry-m3-topic-a-overview.pdf,document,Estándares Comunes del Estado de Nueva York\n\...,es,Engage NY,CC BY-NC-SA
...,...,...,...,...,...,...,...,...
154042,c_fffcbdd4de8b,2. 12: Diffusion,,html5,What will eventually happen to these dyes?\n\n...,en,CSU and Merlot,CC BY-NC-SA
154043,c_fffe15a2d069,Sommare facendo gruppi da 10,Sal somma 5+68 spezzando il 5 in un 2 e un 3.\n\n,video,,it,,
154044,c_fffed7b0d13a,Introdução à subtração,Sal fala sobre o que significa subtrair. Os ex...,video,,pt,,
154045,c_ffff04ba7ac7,SA of a Cone,,video,,en,,


In [52]:
correlation

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...
4,t_0008768bdee6,c_34e1424229b4 c_7d1a964d66d5 c_aab93ee667f4
...,...,...
61512,t_fff830472691,c_61fb63326e5d c_8f224e321c87
61513,t_fff9e5407d13,c_026db653a269 c_0fb048a6412c c_20de77522603 c...
61514,t_fffbe1d5d43c,c_46f852a49c08 c_6659207b25d5
61515,t_fffe14f1be1e,c_cece166bad6a


In [53]:
topics

Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content
0,t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True
1,t_000095e03056,Unit 3.3 Enlargements and Similarities,,b3f329,aligned,2,en,t_aa32fb6252dc,False
2,t_00068291e9a4,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True
3,t_00069b63a70a,Transcripts,,6e3ba4,source,3,en,t_4054df11a74e,True
4,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,000cf7,source,4,bg,t_e2452e21d252,True
...,...,...,...,...,...,...,...,...,...
76967,t_fffb0bf2801d,4.3 Graph of functions,,e77b55,aligned,4,en,t_676e6a1a4dc7,False
76968,t_fffbe1d5d43c,Inscribed shapes problem solving,Use properties of inscribed angles to prove pr...,0c929f,source,4,sw,t_50145b9bab3f,True
76969,t_fffe14f1be1e,Lección 7,,6e90a7,aligned,6,es,t_d448c707984d,True
76970,t_fffe811a6da9,تحديد العلاقة بين الإحداثيّات القطبية والإحداث...,5b9e5ca86571f90499ea987f,9fd860,source,2,ar,t_5b4f3ba4eb7d,True


In [43]:
# Model Import
MODEL = 'sentence-transformers/paraphrase-MiniLM-L12-v2'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = AutoModel.from_pretrained(MODEL)
model.eval()
model.to(device)

tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [44]:

vecs = []
MAX_LEN = 384
# ELEMENT = content['title']

for _,row in tqdm(content.iterrows(), total=len(content)):
  # print(content['title'][row])
  title = row['title']
  if type(title) is float:
    title = row['description']
  if type(title) is float:
    title = row['text']

  tok = tokenizer(title)
  for k,v in tok.items():
    tok[k] = torch.tensor(v[:MAX_LEN]).to(device).unsqueeze(0)
  with torch.no_grad():
    output = model(**tok)

  vec = output.last_hidden_state.squeeze(0).mean(0).cpu()
  vecs.append(vec)

vecs1 = torch.stack(vecs)

 23%|██████████████████▎                                                           | 36139/154047 [05:54<18:29, 106.27it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (17769 > 512). Running this sequence through the model will result in indexing errors
100%|█████████████████████████████████████████████████████████████████████████████| 154047/154047 [24:52<00:00, 103.20it/s]


In [45]:
sub_topics_ids = submission['topic_id'].tolist()
_topics = topics.query(f'id in {sub_topics_ids}')

In [46]:
vecs = []
for _,row in tqdm(_topics.iterrows(), total=len(_topics)):
  # print(content['title'][row])
  title = row['title']
  if type(title) is float:
    title = row['description']
  if type(title) is float:
    title = row['This content contains no text.']

  tok = tokenizer(title)
  for k,v in tok.items():
    tok[k] = torch.tensor(v[:MAX_LEN]).to(device).unsqueeze(0)
  with torch.no_grad():
    output = model(**tok)

  vec = output.last_hidden_state.squeeze(0).mean(0).cpu()
  vecs.append(vec)

vecs2 = torch.stack(vecs)

100%|████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 85.71it/s]


In [47]:
vecs1 = cp.asarray(vecs1)
vecs2 = cp.asarray(vecs2)

# Prediction

In [23]:
# Using CPU takes a lot of time about 25-30 mins atleast

# prediction = []
# for v2 in vecs2:
#     sim = pairwise_distances(v2.reshape(1, len(v2)).get(), vecs1.get(), metric='cosine')
#     predict = " ".join([content.loc[s, 'id'] for s in sim.argsort(1)[0, :5].get()])
#     prediction.append(predict)

In [48]:
# Using Cuda
vecs1_tensor = torch.tensor(vecs1).to('cuda')  # Convert vecs1 to a tensor and move to the GPU
vecs2_tensor = torch.tensor(vecs2).to('cuda')  # Convert vecs2 to a tensor and move to the GPU

prediction = []

for v2 in tqdm(vecs2_tensor, desc="Processing vecs2"):
    sim = torch.cdist(v2.view(1, -1), vecs1_tensor, p=2)  # Compute pairwise cosine distances
    _, indices = torch.topk(sim, 5, largest=False)  # Find the indices of the top 5 smallest distances
    indices = indices.cpu().numpy()  # Move the indices back to the CPU and convert to a numpy array
    
    predict = " ".join([content.loc[s, 'id'] for s in indices[0]])
    prediction.append(predict)

Processing vecs2: 100%|██████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 97.08it/s]


In [49]:
submission['content_ids'] = prediction
submission.head()

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_f035492c1930 c_5501a538f1c2 c_d35077f2c3d3 c...
1,t_00068291e9a4,c_e88be716634d c_70641ff97b45 c_b0818f9ef8f6 c...
2,t_00069b63a70a,c_a6db0765d460 c_430ddb6fbfa4 c_07e16f3aabbf c...
3,t_0006d41a73a8,c_a9199faabb91 c_e70a97d3154a c_1e23a649ef33 c...
4,t_4054df11a74e,c_3695c5dc1df6 c_f2d184a98231 c_80dfc28d7bf2 c...


In [50]:
submission.to_csv('submission.csv', index=None)