In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances_chunked


class DataFrameChunks:
    """Iterator that wraps each chunk from ``generator`` in a DataFrame.

    Rows are labelled with a running counter that continues across chunks:
    the first chunk is indexed ``0..len-1``, the next one picks up where the
    previous chunk stopped, and so on.

    Args:
        generator: An iterator (``next`` is called on it directly) yielding
            sized chunks that ``pd.DataFrame`` can consume.
    """

    def __init__(self, generator):
        self.generator = generator
        self.n = 0  # index label the next chunk's first row will receive

    def __iter__(self):
        return self

    def __next__(self) -> pd.DataFrame:
        # StopIteration from the wrapped iterator propagates and ends iteration.
        block = next(self.generator)
        start = self.n
        stop = start + len(block)
        self.n = stop
        return pd.DataFrame(block, index=range(start, stop))

In [2]:
def top_scores(row, row_date, dates, n=10):
    """Return the ``n`` lowest-scoring entries of ``row`` dated before ``row_date``.

    Args:
        row: Series of scores; its ``name`` is written to the ``x_index`` column.
        row_date: Cut-off; only entries whose date is strictly less are kept.
        dates: Dates aligned with ``row`` by index label.
        n: Maximum number of entries to return.

    Returns:
        DataFrame with columns ``y_index``, ``x_index`` and ``score``,
        ordered by ascending score, with a fresh 0-based index.
    """
    candidates = pd.DataFrame({'x_index': row.name, 'score': row, 'y_date': dates})
    is_earlier = candidates['y_date'] < row_date
    ranked = candidates[is_earlier].rename_axis('y_index').sort_values('score')
    # reset_index turns the named index into the leading 'y_index' column.
    return ranked.head(n).drop(columns='y_date').reset_index()  # type: ignore


def distance_generator(df, time_col, n_jobs=-1):
    """Build the chunked pairwise-distance generator for ``df`` minus ``time_col``.

    Args:
        df: Input frame; every column except ``time_col`` is passed on.
        time_col: Column label removed before distances are computed.
        n_jobs: Forwarded to ``pairwise_distances_chunked``.

    Returns:
        Whatever ``pairwise_distances_chunked`` returns for the remaining columns.
    """
    features = df.drop(columns=time_col)
    return pairwise_distances_chunked(features, n_jobs=n_jobs)


def distances(df: pd.DataFrame, time_col: str, top_n=10, n_jobs=-1) -> pd.DataFrame:
    """Collect every per-row result from ``distances_chunked`` into one frame.

    Args:
        df: Input frame, forwarded to ``distances_chunked``.
        time_col: Name of the date column, forwarded to ``distances_chunked``.
        top_n: Number of matches kept per row, forwarded to ``distances_chunked``.
        n_jobs: Parallelism setting, forwarded to ``distances_chunked``.

    Returns:
        All yielded frames stacked with a fresh 0-based index. When nothing is
        yielded, an empty frame with columns ``y_index``, ``x_index``, ``score``.
    """
    # Gather first and concatenate once: calling pd.concat inside the loop
    # re-copies every accumulated row on each iteration, and seeding the result
    # with an empty frame drags the integer index columns to a wider dtype.
    frames = list(distances_chunked(df, time_col, top_n, n_jobs))
    if not frames:
        return pd.DataFrame({'y_index': [], 'x_index': [], 'score': []})
    return pd.concat(frames, ignore_index=True)


def distances_chunked(df: pd.DataFrame, time_col: str, top_n=10, n_jobs=-1):
    """Yield, for each row of ``df``, its ``top_n`` best-scoring earlier rows.

    Distances come from ``distance_generator``; each distance row is handed to
    ``top_scores`` together with that row's date and the full date column.

    Args:
        df: Input frame containing ``time_col`` plus the feature columns.
        time_col: Name of the date column used for the "earlier than" filter.
        top_n: Number of matches to keep per row.
        n_jobs: Forwarded to ``distance_generator``.

    Yields:
        One DataFrame per input row, as produced by ``top_scores``.
    """
    generator = distance_generator(df, time_col, n_jobs=n_jobs)
    # DataFrameChunks labels distance rows 0..n-1 and the distance columns are
    # positional too, so the dates must carry the same positional index;
    # otherwise top_scores would align on df's own labels and misplace rows
    # whenever df does not have a default RangeIndex.
    dates = df[time_col].reset_index(drop=True)
    for chunk in DataFrameChunks(generator):
        for row_i, row in chunk.iterrows():
            # dates.iloc avoids materialising a whole df row just to read one cell.
            yield top_scores(row, dates.iloc[row_i], dates, n=top_n)  # type: ignore

In [11]:
# Promoting 'Unnamed: 0' to the index and then resetting with drop=True
# discards that column and leaves a fresh 0..n-1 index.
df = (
    pd.read_csv('../data/tcc_ceds_music.csv')
    .set_index('Unnamed: 0')
    .reset_index(drop=True)
)
# Remove the descriptive columns (presumably the non-numeric ones, since the
# result feeds a distance computation; verify against the CSV schema).
numbers_df = df.drop(columns=['topic', 'lyrics', 'genre', 'track_name', 'artist_name'])
numbers_df

Unnamed: 0,release_date,len,dating,violence,world/life,night/time,shake the audience,family/gospel,romantic,communication,...,like/girls,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,age
0,1950,95,0.000598,0.063746,0.000598,0.000598,0.000598,0.048857,0.017104,0.263751,...,0.000598,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.137110,1.000000
1,1950,51,0.035537,0.096777,0.443435,0.001284,0.001284,0.027007,0.001284,0.001284,...,0.001284,0.001284,0.001284,0.331745,0.647540,0.954819,0.000002,0.325021,0.263240,1.000000
2,1950,24,0.002770,0.002770,0.002770,0.002770,0.002770,0.002770,0.158564,0.250668,...,0.002770,0.002770,0.225422,0.456298,0.585288,0.840361,0.000000,0.351814,0.139112,1.000000
3,1950,54,0.048249,0.001548,0.001548,0.001548,0.021500,0.001548,0.411536,0.001548,...,0.081132,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.775350,0.743736,1.000000
4,1950,48,0.001350,0.001350,0.417772,0.001350,0.001350,0.001350,0.463430,0.001350,...,0.001350,0.068800,0.001350,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28367,2019,78,0.001350,0.001350,0.001350,0.001350,0.001350,0.001350,0.001350,0.001350,...,0.001350,0.065664,0.001350,0.889527,0.759711,0.062549,0.000000,0.751649,0.695686,0.014286
28368,2019,67,0.001284,0.001284,0.035338,0.001284,0.001284,0.001284,0.066324,0.203889,...,0.040811,0.001284,0.001284,0.662082,0.789580,0.004607,0.000002,0.922712,0.797791,0.014286
28369,2019,77,0.001504,0.154302,0.168988,0.001504,0.039755,0.001504,0.035401,0.001504,...,0.001504,0.001504,0.001504,0.663165,0.726970,0.104417,0.000001,0.838211,0.767761,0.014286
28370,2019,67,0.001196,0.001196,0.001196,0.001196,0.048359,0.001196,0.001196,0.001196,...,0.070867,0.001196,0.001196,0.883028,0.786888,0.007027,0.000503,0.508450,0.885882,0.014286


In [19]:
# Scratch check of the chunk bookkeeping: pull the first two distance chunks by
# hand and label their rows with a running offset.
generator = pairwise_distances_chunked(numbers_df.drop('release_date', axis=1), n_jobs=-1)
index = 0  # label the next chunk's first row will receive
i = 0  # not read anywhere in this cell

chunk1 = next(generator)
count = len(chunk1)
indexes = range(index, index + count)
# Advance by the chunk size. Assigning `index = count` only works for the
# first chunk; from the second chunk on it would drop the accumulated offset.
index += count
results1 = pd.DataFrame(chunk1, index=indexes)

chunk2 = next(generator)
count = len(chunk2)
indexes = range(index, index + count)
index += count
results2 = pd.DataFrame(chunk2, index=indexes)

results2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28362,28363,28364,28365,28366,28367,28368,28369,28370,28371
4730,93.020321,137.009732,164.007915,134.005222,140.009743,90.012379,9.094313,167.008315,158.007765,127.010529,...,130.002362,113.001478,100.001214,79.003457,127.002374,110.002871,121.002387,111.002227,121.001090,105.001993
4731,13.081431,31.026610,58.011688,28.019081,34.021808,16.032897,97.005980,61.013316,52.011278,21.021219,...,24.025793,7.068380,6.108079,27.020890,21.023098,4.120625,15.026513,5.080157,15.037425,1.540116
4732,22.073985,22.043213,49.020612,19.022069,25.031150,25.032665,106.005459,52.018522,43.019005,12.084989,...,15.035227,2.153803,15.032203,36.011463,12.020360,5.079521,6.040071,4.050234,6.067332,10.059119
4733,48.019158,4.151459,23.023967,7.108633,1.653068,51.013132,132.003710,26.024493,17.044986,14.064601,...,11.065662,28.019169,41.019608,62.011467,14.055950,31.021517,20.032385,30.018006,20.029398,36.015912
4734,45.018579,1.451904,26.017840,4.323515,2.333219,48.015945,129.004377,29.020247,20.026265,11.073600,...,8.161674,25.046474,38.037639,59.022194,11.130990,28.044959,17.070304,27.040540,17.075793,33.035178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9455,65.014055,21.022723,6.069957,24.021994,18.027218,68.004425,149.002401,9.053571,0.891539,31.015733,...,28.024456,45.012812,58.015224,79.009443,31.022533,48.013312,37.015519,47.010099,37.020170,53.013311
9456,2.608788,46.022787,73.012691,43.005663,49.019550,1.521896,82.007307,76.013275,67.012325,36.019604,...,39.011927,22.014787,9.051796,12.035809,36.008579,19.019765,30.008299,20.014035,30.014329,14.040086
9457,43.037639,1.773981,28.033351,2.190006,4.247349,46.015355,127.006223,31.035987,22.040612,9.097586,...,6.107702,23.020943,36.018326,57.006836,9.046940,26.014502,15.023152,25.015319,15.037470,31.024686
9458,36.023602,8.049566,35.012311,5.118229,11.038583,39.015081,120.002965,38.009269,29.017286,2.255651,...,1.652575,16.050898,29.037220,50.018294,2.434808,19.042311,8.107128,18.038762,8.105977,24.031999


In [45]:
def get_scores(row, n=10) -> list[dict]:
    """List the ``n`` smallest entries of ``row``, excluding the entry labelled ``row.name``.

    Args:
        row: Series of scores; ``row.name`` must be one of its index labels
            (it is dropped before the top ``n`` are taken).
        n: Maximum number of records to return.

    Returns:
        Records ``{'x_index': row.name, 'y_index': label, 'score': value}``
        in ascending score order.
    """
    source = row.name
    nearest = row.sort_values().drop(source).head(n)
    records = []
    for target, distance in nearest.items():
        records.append({'x_index': source, 'y_index': target, 'score': distance})
    return records


def distances(numbers_df: pd.DataFrame, date_col: str, top_n=10, n_jobs=-1) -> pd.DataFrame:
    """Compute each row's ``top_n`` lowest-distance other rows.

    Pairwise distances are computed chunk by chunk over every column except
    ``date_col``; each distance row is reduced with ``get_scores``.

    NOTE(review): this redefines the ``distances`` function from the earlier
    cell with different parameter names and no date filtering.

    Args:
        numbers_df: Frame holding ``date_col`` plus the feature columns.
        date_col: Column dropped before distances are computed.
        top_n: Number of matches kept per row (previously ignored).
        n_jobs: Forwarded to ``pairwise_distances_chunked`` (previously
            ignored in favour of a hard-coded ``-1``).

    Returns:
        DataFrame built from the ``x_index`` / ``y_index`` / ``score`` records.
    """
    scores = []
    features = numbers_df.drop(date_col, axis=1)
    offset = 0  # label of the first row in the current chunk

    for chunk in pairwise_distances_chunked(features, n_jobs=n_jobs):
        size = len(chunk)
        results = pd.DataFrame(chunk, index=range(offset, offset + size))
        offset += size
        for _, row in results.iterrows():
            scores.extend(get_scores(row, n=top_n))

        # Running total of collected records, printed once per chunk.
        print(len(scores), 'done')
    return pd.DataFrame(scores)
    
# Run over the full frame with the default top_n / n_jobs; the returned frame
# is the cell's displayed value and is not bound to a name.
distances(numbers_df, 'release_date')

47300 done
94600 done
141900 done
189200 done
236500 done
283720 done


Unnamed: 0,x_index,y_index,score
0,0,19210,0.889176
1,0,933,0.903650
2,0,21029,0.997782
3,0,19721,1.010673
4,0,13108,1.029608
...,...,...,...
283715,28371,12191,0.523668
283716,28371,22933,0.538553
283717,28371,27822,0.539979
283718,28371,27909,0.545113


In [41]:
# NOTE(review): `scores` is not assigned at module level in any visible cell
# (it is only a local inside `distances`), so this relies on leftover kernel
# state and would raise NameError on a fresh Restart & Run All.
len(scores)

283720

In [43]:
# NOTE(review): depends on a module-level `scores` that no visible cell
# defines; the result of `distances(...)` would need to be bound to a name
# for this to run on a fresh kernel.
pd.DataFrame(scores)

Unnamed: 0,x_score,y_index,score
0,0,19210,0.889176
1,0,933,0.903650
2,0,21029,0.997782
3,0,19721,1.010673
4,0,13108,1.029608
...,...,...,...
283715,28371,12191,0.523668
283716,28371,22933,0.538553
283717,28371,27822,0.539979
283718,28371,27909,0.545113
