In [58]:
import pandas as pd
import random
from dotenv import load_dotenv
import os
import requests
import torch
# Runs before any os.getenv lookup further down (HUGGING_FACE_TOKEN is read later).
# NOTE(review): presumably this pulls variables from a local .env file - confirm.
load_dotenv()

True

In [4]:
# Goal: build a train/test dataset made of positive and negative tune pairs.
# Later cells read the `tune_id`, `setting_id` and `abc` columns of this table.
df = pd.read_csv(filepath_or_buffer="../raw_data/tunes.csv")

In [None]:
# Create positive pairs: two different settings of the same tune.
# A dedicated, seeded generator makes the sampled pairs identical on every
# "Restart & Run All" and leaves the global `random` state untouched.
positive_rng = random.Random(0)
positive_pairs = []
for tune_id, group in df.groupby('tune_id'):
    settings = group[['setting_id', 'abc']].values.tolist()
    # One pair per tune (a decent enough way to start); a tune with a single
    # setting cannot form a pair, so it is skipped.
    if len(settings) < 2:
        continue
    (setting_a, abc_a), (setting_b, abc_b) = positive_rng.sample(settings, 2)
    # Tuple layout: (tune_a, tune_b, setting_a, setting_b, abc_a, abc_b)
    positive_pairs.append((tune_id, tune_id, setting_a, setting_b, abc_a, abc_b))


In [None]:
# Create negative pairs - just the same amount as in the positive set.
# A negative pair is two rows whose tune ids differ.
abc_list = df[['tune_id', 'setting_id', 'abc']].values.tolist()

# The rejection sampling below can only ever accept a pair when at least two
# distinct tune ids exist; without this check the while-loop would spin forever.
if positive_pairs and len({row[0] for row in abc_list}) < 2:
    raise ValueError("need at least two distinct tune_id values to build negative pairs")

# Dedicated, seeded generator: same negative set on every "Restart & Run All",
# and the global `random` state is left untouched.
negative_rng = random.Random(1)
negative_pairs = []
while len(negative_pairs) < len(positive_pairs):
    first, second = negative_rng.sample(abc_list, 2)
    # Keep the draw only if the two rows belong to different tunes.
    if first[0] != second[0]:
        # Tuple layout: (tune_a, tune_b, setting_a, setting_b, abc_a, abc_b)
        negative_pairs.append((first[0], second[0], first[1], second[1], first[2], second[2]))

In [None]:
# Create dataframes: both pair lists share the same six-field tuple layout,
# so one column list serves the positive and the negative frame.
pair_columns = ['tune_a', 'tune_b', 'setting_a', 'setting_b', 'abc_a', 'abc_b']
positive_df, negative_df = (
    pd.DataFrame(pairs, columns=pair_columns)
    for pairs in (positive_pairs, negative_pairs)
)

In [28]:
# Persist the positive pairs. With pandas' defaults the row index is written
# too, as an unnamed first column.
positive_df.to_csv(path_or_buf="../raw_data/positive_set.csv")

In [29]:
# Persist the negative pairs. With pandas' defaults the row index is written
# too, as an unnamed first column.
negative_df.to_csv(path_or_buf="../raw_data/negative_set.csv")

In [52]:
# Time to try an embedding; start with a Hugging Face hosted model.
# The token is taken from the environment (None when the variable is unset).
model_id = "sentence-transformers/all-MiniLM-L6-v2"
HF_TOKEN = os.environ.get("HUGGING_FACE_TOKEN")

In [53]:
# Feature-extraction endpoint for the selected model.
api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
# Authorization header sent with every request to that endpoint.
headers = dict(Authorization=f"Bearer {HF_TOKEN}")

In [54]:
def query(texts, timeout=120):
    """POST ``texts`` to the feature-extraction endpoint and return the decoded JSON.

    Args:
        texts: The inputs to embed, sent as the ``inputs`` field of the request
            body (the notebook passes a list of strings).
        timeout: Seconds to wait for the server to respond before giving up.
            Without a timeout ``requests`` may block indefinitely. The default
            is generous because ``wait_for_model`` asks the server to hold the
            request while the model loads. Pass ``None`` to wait forever.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    payload = {"inputs": texts, "options": {"wait_for_model": True}}
    response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
    # Fail loudly on an error status instead of handing an error payload to the
    # caller as if it were a list of embeddings.
    response.raise_for_status()
    return response.json()

In [56]:
# Embed the two ABC strings of the first positive pair; they are the last two
# fields of the pair tuple.
*_, first_abc_a, first_abc_b = positive_pairs[0]
results = query([first_abc_a, first_abc_b])

In [57]:
# Show the raw response. The stored output below is a nested list of floats -
# presumably one embedding vector per input text (confirm against the API docs).
results

[[0.010539747774600983,
  -0.039622899144887924,
  -0.023290928453207016,
  -0.037916626781225204,
  -0.04009198397397995,
  0.022174304351210594,
  0.06749163568019867,
  -0.013118957169353962,
  0.024522650986909866,
  0.013274047523736954,
  0.09884174168109894,
  -0.07426027953624725,
  0.052353713661432266,
  -0.04639938473701477,
  -0.010557710193097591,
  0.05662812665104866,
  -0.024795029312372208,
  0.0932144746184349,
  -0.09591826051473618,
  -0.07031131535768509,
  0.05302370339632034,
  0.1039707362651825,
  0.07383810728788376,
  -0.06243801489472389,
  0.023195937275886536,
  0.05059167370200157,
  -0.04786936566233635,
  0.05661751329898834,
  -0.026296110823750496,
  -0.12299317866563797,
  0.011391131207346916,
  0.0523269847035408,
  0.10803442448377609,
  0.062014028429985046,
  0.001643180032260716,
  -0.060002151876688004,
  -0.0021213486324995756,
  0.015373241156339645,
  0.03926711529493332,
  0.06314399093389511,
  -0.030129946768283844,
  -0.0840356424450874

In [60]:
# Tabulate the response: each inner list of `results` becomes one row.
embeddings = pd.DataFrame(data=results)