In [1]:
import os
import sqlite3 as sqlite
import numpy as np
import pandas as pd
import pickle
import scipy
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV



This notebook works when starting like this:
```bash
cd mag_sample
conda activate science-career-tempenv
env PYTHONPATH=/absolute/path/to/mag_sample/src/dataprep/ jupyter-lab --no-browser
```

It probably also works when run from VS Code.

### Load data

In [2]:
# Show the kernel's current working directory.
os.getcwd()

'/home/flavio/repositories/mag_sample/src/dataprep/main/link'

In [3]:
from main.link.fit_svd_model import make_sparse, run_svd
from helpers.variables import db_file
# Inclusive upper bound on FieldsOfStudy.Level; bound into the temp-table query as `Level <= (?)`.
max_level = 2


In [4]:
# Let sqlite3 bind numpy int64 values by converting them to plain Python ints.
sqlite.register_adapter(np.int64, int)
# isolation_level=None: sqlite3 does not open transactions implicitly (autocommit).
con = sqlite.connect(db_file, isolation_level=None)

In [5]:
# Build a temporary lookup table with the ids of all fields of study whose
# Level is in (0, max_level].
# Dropping a leftover temp table first keeps this cell re-runnable on the same
# connection: CREATE TABLE raises if the table already exists. Dropping the
# table also drops its index, so the CREATE INDEX below cannot collide either.
con.execute("DROP TABLE IF EXISTS temp.fields_to_max_level")
con.execute(
    """CREATE TEMP TABLE fields_to_max_level AS
        SELECT FieldOfStudyId
        FROM FieldsOfStudy
        WHERE Level > 0 AND Level <= (?)
        """,
    (max_level,),
)
# TODO: need to query this without overwriting the table `valid_papers`
con.execute("CREATE UNIQUE INDEX idx_temp1 ON fields_to_max_level(FieldOfStudyId ASC)")

<sqlite3.Cursor at 0x7223db683a40>

In [6]:
# Query (PaperId, FieldOfStudyId, Score) rows for a random sample of at most
# 100000 papers from valid_papers, restricted to fields in fields_to_max_level.
# ORDER BY RANDOM() is not seeded, so every execution draws a different sample.
# Both joins are INNER JOINs: a sampled paper without any matching field row
# does not appear in the result.
sql_papers = """
SELECT PaperId, FieldOfStudyId, Score
FROM PaperFieldsOfStudy
INNER JOIN (
    SElECT PaperId
    FROM valid_papers
    ORDER BY RANDOM()
    LIMIT 100000
)
USING (PaperId)
INNER JOIN fields_to_max_level USING(FieldOfStudyId)
"""

In [7]:
# Run the sampling query; one row per (paper, field of study) with its score.
papers_concepts = pd.read_sql(sql_papers, con)

In [8]:
# Number of distinct papers that came back from the sampling query.
papers_concepts["PaperId"].nunique()

91915

In [9]:
# Every (PaperId, FieldOfStudyId) combination may occur at most once.
assert not papers_concepts.duplicated(subset=["PaperId", "FieldOfStudyId"]).any()

In [10]:
# Load the selected field ids, ordered by FieldOfStudyId.
fields_of_study = pd.read_sql("SELECT * FROM fields_to_max_level ORDER BY FieldOfStudyId", con=con) # TODO: refactor this; add it to the function that loads the data?

In [11]:
# Map every FieldOfStudyId to its position (order of first appearance in fields_of_study).
field_to_index = {
    field_id: position
    for position, field_id in enumerate(fields_of_study["FieldOfStudyId"].unique())
}


### Load models

In [12]:
# Embedding sizes to compare: the powers of two from 2**4 up to 2**10.
emb_dims = [1 << exponent for exponent in range(4, 11)]
emb_dims

[16, 32, 64, 128, 256, 512, 1024]

In [13]:
# Load one pickled SVD model per embedding dimension, keyed by the dimension.
# NOTE: pickle.load can execute arbitrary code from the file; only load model
# files from a trusted location.
models = {}
for ed in emb_dims:
    model_path = f"/mnt/ssd/AcademicGraph/svd_model_{ed}" + ".pkl"
    with open(model_path, "rb") as model_file:
        models[ed] = pickle.load(model_file)

In [14]:
# Confirm that a model was loaded for every embedding dimension.
models.keys()

dict_keys([16, 32, 64, 128, 256, 512, 1024])

### Compute embeddings

In [15]:
# Convert the long-format (PaperId, FieldOfStudyId, Score) frame into a sparse matrix.
# row_to_index is used further down to look up the matrix row that belongs to a PaperId.
# NOTE(review): presumably rows are papers and columns are fields of study — confirm in make_sparse.
papers_concepts_sparse, row_to_index = make_sparse(
    papers_concepts, field_to_index, "PaperId", "FieldOfStudyId", "Score")


(462159,)
<91915x137480 sparse matrix of type '<class 'numpy.float64'>'
	with 462159 stored elements in Compressed Sparse Row format>


In [16]:
# Run every loaded model on the sparse matrix and keep the second value
# returned by run_svd (the embeddings), keyed by dimension.
embeddings = {}
for ed, model in tqdm(models.items()):
    print(f"Processing model dimension {ed}")
    # TODO: need to refactor! n_components not necessary! also don't print out the fitting stats when just transforming
    _unused, reduced = run_svd(papers_concepts_sparse, ed, model)
    embeddings[ed] = reduced

  0%|                                                                            | 0/7 [00:00<?, ?it/s]

Processing model dimension 16
Original matrix shape: (91915, 137480)
Reduced matrix shape: (91915, 16)
Explained variance ratio: 0.0600
Processing model dimension 32


 29%|███████████████████▍                                                | 2/7 [00:00<00:00, 10.38it/s]

Original matrix shape: (91915, 137480)
Reduced matrix shape: (91915, 32)
Explained variance ratio: 0.0967
Processing model dimension 64
Original matrix shape: (91915, 137480)
Reduced matrix shape: (91915, 64)
Explained variance ratio: 0.1519
Processing model dimension 128


 57%|██████████████████████████████████████▊                             | 4/7 [00:00<00:00,  4.00it/s]

Original matrix shape: (91915, 137480)
Reduced matrix shape: (91915, 128)
Explained variance ratio: 0.2223
Processing model dimension 256


 71%|████████████████████████████████████████████████▌                   | 5/7 [00:01<00:00,  2.28it/s]

Original matrix shape: (91915, 137480)
Reduced matrix shape: (91915, 256)
Explained variance ratio: 0.3105
Processing model dimension 512


 86%|██████████████████████████████████████████████████████████▎         | 6/7 [00:03<00:00,  1.26it/s]

Original matrix shape: (91915, 137480)
Reduced matrix shape: (91915, 512)
Explained variance ratio: 0.4208
Processing model dimension 1024


100%|████████████████████████████████████████████████████████████████████| 7/7 [00:06<00:00,  1.05it/s]

Original matrix shape: (91915, 137480)
Reduced matrix shape: (91915, 1024)
Explained variance ratio: 0.5472





In [17]:
# Distinct PaperIds of the sample, in order of first appearance.
# (The former `papers_concepts.head()` line was dropped: it was not the last
# expression of the cell, so its result was never displayed.)
paper_ids = papers_concepts["PaperId"].unique()


### Sanity checks

#### KNN classification check

In [18]:
# One bound value and one "?" placeholder per PaperId for the IN (...) clause.
# NOTE(review): SQLite caps the number of host parameters per statement
# (SQLITE_MAX_VARIABLE_NUMBER, build dependent) — confirm the linked SQLite
# accepts this many.
q_paper_id = list(paper_ids)
qmarks_paper_id = ",".join("?" * len(paper_ids))

In [19]:
# Only the "?" placeholders are interpolated into the SQL text; the PaperId
# values themselves are passed separately as bound parameters.
sql_main_field = f"""
SELECT PaperId, Field0
FROM PaperMainFieldsOfStudy
WHERE PaperId IN ({qmarks_paper_id})"""

In [20]:
# Fetch the Field0 label of every sampled paper found in PaperMainFieldsOfStudy.
papers_main_field = pd.read_sql(sql_main_field, con, params=q_paper_id)

In [21]:
# Peek at the first few (PaperId, Field0) rows.
papers_main_field.head()

Unnamed: 0,PaperId,Field0
0,23624,142362112
1,269105,86803240
2,400980,41008148
3,444879,192562407
4,452971,185592680


In [22]:
# Candidate neighbourhood sizes: 1-4, then 5 to 45 in steps of 5.
param_grid = {"n_neighbors": [*range(1, 5), *range(5, 50, 5)]}
classifier = KNeighborsClassifier()


In [23]:
# Labels and ids come from the same frame, so labels[k] belongs to paper_ids[k].
# paper_ids is rebound here to the papers that actually have a Field0 row.
labels = papers_main_field["Field0"].values
paper_ids = papers_main_field["PaperId"].values

# Gather the feature rows in label order with a single fancy-indexing call.
# This replaces slicing the sparse matrix one row at a time and stacking the
# pieces with scipy.sparse.vstack, which yields the same rows but is much slower.
row_indices = [row_to_index[paperid] for paperid in paper_ids]
x_full = papers_concepts_sparse[row_indices, :]


In [24]:

# fit on full embeddings
# GridSearchCV cross-validates every n_neighbors candidate from param_grid on
# the sparse (non-reduced) feature matrix.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt —
# presumably it is slow at this sample size; confirm before relying on it.
selector_full = GridSearchCV(classifier, param_grid)
selector_full.fit(x_full, labels)
print(f"Best score for full embeddings: {selector_full.best_score_:.{3}f} with {selector_full.best_params_}")


KeyboardInterrupt: 

In [None]:
# The matrix rows of the labelled papers do not depend on the embedding
# dimension, so look them up once instead of once per dimension.
row_indices = [row_to_index[paperid] for paperid in paper_ids]

for dim, current_embeddings in embeddings.items():
    print(f"Embedding dimension: {dim}")

    # Gather all rows with one indexing call (replaces the per-row Python loop).
    # assumes current_embeddings is a dense 2-D numpy array — TODO confirm in run_svd
    x_reduced = current_embeddings[row_indices, :]

    # fit on reduced embeddings
    selector_reduced = GridSearchCV(classifier, param_grid)
    selector_reduced.fit(x_reduced, labels)

    print(f"Best score for reduced embeddings: {selector_reduced.best_score_:.{3}f} with {selector_reduced.best_params_}")
    print("=================================================================")

        

#### Cosine similarity check

In [None]:
def cosine_similarity(m, i, j, eps=0.001):
    """Return the smoothed cosine similarity between rows ``i`` and ``j`` of ``m``.

    The value is ``a . b / sqrt(|a|^2 * |b|^2 + eps)`` with ``a = m[i]`` and
    ``b = m[j]``.

    Parameters
    ----------
    m : 2-D numpy array or SciPy sparse matrix
        Matrix whose rows are the vectors to compare.
    i, j : int
        Row positions of the two vectors.
    eps : float, default 0.001
        Constant added under the square root so that all-zero rows give a
        similarity of 0 instead of a division by zero.

    Returns
    -------
    A scalar for dense input. For sparse-matrix input, a dense array of shape
    (1, 1); callers read the value with ``result[0][0]``.
    """
    a = m[i]
    b = m[j]
    ab = a @ b.T

    # Densify whatever is sparse. `issparse` is the public check and covers
    # every sparse format, unlike an isinstance test against the private
    # `scipy.sparse._csr.csr_matrix` class.
    if scipy.sparse.issparse(ab):
        ab = ab.toarray()
    if scipy.sparse.issparse(a):
        a = a.toarray()
    if scipy.sparse.issparse(b):
        b = b.toarray()

    denominator = np.sqrt(np.power(a, 2).sum() * np.power(b, 2).sum() + eps)
    return ab / denominator


In [None]:
sample_size = 50_000  # number of random row pairs drawn per embedding dimension
similarities = {}  # dim -> {"reduced": ..., "full": ...}; filled by the sampling loop

In [None]:
# draw *same* pairs for both full and reduced
# A seeded generator makes the sampled pairs, and therefore the histograms
# below, reproducible across runs.
rng = np.random.default_rng(0)
n_vectors = papers_concepts_sparse.shape[0]  # same for every draw

for dim, current_embeddings in tqdm(embeddings.items()):
    sim_reduced = []
    sim_full = []
    for _draw in range(sample_size):
        # Two distinct row positions; replace=False rules out i == j without
        # a retry loop and without materialising np.arange(n_vectors) each time.
        i, j = rng.choice(n_vectors, size=2, replace=False)
        # Sparse input returns a (1, 1) array, hence the [0][0].
        sim = cosine_similarity(papers_concepts_sparse, i, j)
        sim_full.append(sim[0][0])
        sim = cosine_similarity(current_embeddings, i, j)
        sim_reduced.append(sim)

    similarities[dim] = {
        "reduced": np.array(sim_reduced),
        "full": np.array(sim_full),
    }


In [None]:
# One figure per embedding dimension, overlaying the similarity histograms of
# the full and the reduced representation.
for dim, data in similarities.items():
    fig, ax = plt.subplots(figsize=(10, 6))
    for series_key, colour, caption in (
        ("full", "blue", "Full"),
        ("reduced", "red", "Reduced"),
    ):
        ax.hist(data[series_key], bins=30, alpha=0.7, color=colour, label=caption)

    # Title, axis labels and legend
    ax.set_title(f'{dim} dimensions')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    ax.legend()

    # Render the figure
    plt.show()
    

In [None]:
# Close the SQLite connection; TEMP tables exist only for the lifetime of the connection.
con.close()