In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import torch
from tqdm.auto import tqdm

from transformers import AutoModel, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Import
PATH = 'data'
content = pd.read_csv(f'{PATH}/content_filtered.csv')
correlation = pd.read_csv(f'{PATH}/correlations.csv')
topics = pd.read_csv(f'{PATH}/topics_filtered.csv')

In [5]:
content

Unnamed: 0,id,title,description,kind,text,language,copyright_holder,license
0,c_00019840d110,5.12E: Regulation of the Calvin Cycle,,html5,LEARNING OBJECTIVES\n\nOutline the three major...,en,CSU and Merlot,CC BY-NC-SA
1,c_00027d03ca7d,4.E: Genomes and Chromosomes (Exercises),,html5,4.3\n\n(BPA) Answer the following questions wi...,en,CSU and Merlot,CC BY-NC-SA
2,c_000425df0161,9.3: Van Der Waals Forces between Atoms,,html5,Introduction\n\nThe perfect gas equation of st...,en,CSU and Merlot,CC BY-NC-SA
3,c_00046806ad8a,Compare multi-digit numbers,Use your place value skills to practice compar...,exercise,,en,,
4,c_000751f58836,Tangents of circles problem (example 2),Sal finds a missing angle using the property t...,video,Angle A is a circumscribed angle on circle O. ...,en,Khan Academy,CC BY-NC-SA
...,...,...,...,...,...,...,...,...
65934,c_fffa90b024e3,Congruent Triangle Proof Example,,video,,en,,
65935,c_fffbfc3d60c6,number_lines_mixed_positive_student writes num...,This Number Line Worksheet includes problems f...,document,,en,,
65936,c_fffcbdd4de8b,2. 12: Diffusion,,html5,What will eventually happen to these dyes?\n\n...,en,CSU and Merlot,CC BY-NC-SA
65937,c_ffff04ba7ac7,SA of a Cone,,video,,en,,


In [6]:
correlation

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...
4,t_0008768bdee6,c_34e1424229b4 c_7d1a964d66d5 c_aab93ee667f4
...,...,...
61512,t_fff830472691,c_61fb63326e5d c_8f224e321c87
61513,t_fff9e5407d13,c_026db653a269 c_0fb048a6412c c_20de77522603 c...
61514,t_fffbe1d5d43c,c_46f852a49c08 c_6659207b25d5
61515,t_fffe14f1be1e,c_cece166bad6a


In [7]:
topics

Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content
0,t_000095e03056,Unit 3.3 Enlargements and Similarities,,b3f329,aligned,2,en,t_aa32fb6252dc,False
1,t_00069b63a70a,Transcripts,,6e3ba4,source,3,en,t_4054df11a74e,True
2,t_0008a1bd84ba,12. 20: Bird Reproduction,,ebc86c,supplemental,5,en,t_c44ac9711007,True
3,t_000c0c854f0b,Nepal,,f2e966,source,1,en,t_f9c4ef0d6290,False
4,t_000d1fb3f2f5,2.1.2 - Logarithms,,e77b55,aligned,5,en,t_b897d168db90,True
...,...,...,...,...,...,...,...,...,...
36156,t_fff7782561f4,Introduction,"In certain situations, comparison by division ...",d5fb04,supplemental,3,en,t_2a4dc28b0431,True
36157,t_fff80f4eee89,Polynomial division,"After we have added, subtracted, and multiplie...",6b09a4,supplemental,5,en,t_e93f4fad3893,False
36158,t_fff830472691,Scalar Projections,,fef095,source,4,en,t_c75d6acecf78,True
36159,t_fffb0bf2801d,4.3 Graph of functions,,e77b55,aligned,4,en,t_676e6a1a4dc7,False


In [8]:
# Model Import
MODEL = 'sentence-transformers/paraphrase-MiniLM-L12-v2'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = AutoModel.from_pretrained(MODEL)
model.eval()
model.to(device)

tokenizer = AutoTokenizer.from_pretrained(MODEL)

Downloading (…)lve/main/config.json: 100%|██████████| 631/631 [00:00<?, ?B/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading pytorch_model.bin: 100%|██████████| 134M/134M [00:14<00:00, 9.00MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 316/316 [00:00<?, ?B/s] 
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 2.10MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 4.00MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 13.9kB/s]


In [9]:

vecs = []
MAX_LEN = 384
# ELEMENT = content['title']

for _,row in tqdm(content.iterrows(), total=len(content)):
  # print(content['title'][row])
  title = row['title']
  if type(title) is float:
    title = row['description']
  if type(title) is float:
    title = row['text']

  tok = tokenizer(title)
  for k,v in tok.items():
    tok[k] = torch.tensor(v[:MAX_LEN]).to(device).unsqueeze(0)
  with torch.no_grad():
    output = model(**tok)

  vec = output.last_hidden_state.squeeze(0).mean(0).cpu()
  vecs.append(vec)

vecs1 = torch.stack(vecs)

  2%|▏         | 1448/65939 [00:25<12:19, 87.17it/s] 