In [None]:
# raw files paths
saving_data_path = "/home/etaylor/code_projects/track2vec/data"
evalrs_events_raw = "/home/etaylor/.cache/evalrs/LFM-1b_LEs.txt"
evalrs_tracks_raw = "/home/etaylor/.cache/evalrs/LFM-1b_tracks.txt"
evalrs_users_raw = "/home/etaylor/.cache/evalrs/LFM-1b_users.txt"

# data headers
events_headers = ['user_id', 'artist_id', 'album_id', 'track_id', 'timestamp']
tracks_headers = ['track_id', 'track_name', 'artist_id']
users_headers = ['user_id', 'country', 'age', 'gender', 'playcount', 'timestamp']

# processed files paths
users_filepath = f"{saving_data_path}/evalrs_users.csv"
tracks_filepath = f"{saving_data_path}/evalrs_tracks.csv"
events_filepath = f"{saving_data_path}/evalrs_events.csv"
output_filepath = f'{saving_data_path}/final_training_dataset.csv'

In [3]:
import pandas as pd

def extract_relevant_events_data(events_filepath, users_filepath, output_filepath, chunksize=500000, verbose=False, max_size_gb=1):
    users_headers = ['user_id', 'country', 'age', 'gender', 'playcount', 'registered']
    events_headers = ['user_id', 'artist_id', 'album_id', 'track_id', 'timestamp']
    
    # Load the users data with the correct headers
    users_df = pd.read_csv(users_filepath, names=users_headers, skiprows=1, sep='\t')
    user_ids = set(users_df['user_id'])

    if verbose:
        print("Loaded users data:")
        print(users_df.head())

    # Initialize an empty DataFrame to accumulate relevant events
    relevant_events_df = pd.DataFrame()

    chunk_counter = 0
    total_size_bytes = 0
    max_size_bytes = max_size_gb * 1024 ** 3  # Convert GB to bytes

    for chunk in pd.read_csv(events_filepath, chunksize=chunksize, sep='\t', names=events_headers, skiprows=1):
        filtered_chunk = chunk[chunk['user_id'].isin(user_ids)]
        total_size_bytes += filtered_chunk.memory_usage(deep=True).sum()
        
        # If the total size exceeds the limit, save the current data and reset
        if total_size_bytes >= max_size_bytes:
            temp_output_path = f"{output_filepath}_part_{chunk_counter}.csv"
            relevant_events_df.to_csv(temp_output_path, index=False, mode='a', header=not os.path.exists(temp_output_path))
            if verbose:
                print(f"Saved partial relevant events data to {temp_output_path} and reset the DataFrame")
            
            # Reset for the next iteration
            relevant_events_df = pd.DataFrame()
            total_size_bytes = 0

        relevant_events_df = pd.concat([relevant_events_df, filtered_chunk])
        
        if verbose:
            print(f"Processed chunk {chunk_counter}.")
        
        chunk_counter += 1

    # Save any remaining data to disk
    if not relevant_events_df.empty:
        temp_output_path = f"{output_filepath}_part_{chunk_counter}.csv"
        relevant_events_df.to_csv(temp_output_path, index=False, mode='a', header=not os.path.exists(temp_output_path))
        if verbose:
            print(f"Saved final part of relevant events data to {temp_output_path}")

    if verbose:
        print("Finished processing all chunks.")

In [7]:
extract_relevant_events_data(evalrs_events_raw, evalrs_users_raw, output_filepath, verbose=True)

Loaded users data:
   user_id country  age gender  playcount  registered
0      384      UK   35      m      42139  1035849600
1     1206     NaN   -1      n      33103  1035849600
2     2622     NaN   -1    NaN       2030  1037404800
3     2732     NaN   -1      n        147  1037577600
4     3653      UK   31      m      18504  1041033600
Processed chunk 0.
Processed chunk 1.
Processed chunk 2.
Processed chunk 3.
Processed chunk 4.
Processed chunk 5.
Processed chunk 6.
Processed chunk 7.
Processed chunk 8.
Processed chunk 9.
Processed chunk 10.
Processed chunk 11.
Processed chunk 12.
Processed chunk 13.
Processed chunk 14.
Processed chunk 15.
Processed chunk 16.
Processed chunk 17.
Processed chunk 18.
Processed chunk 19.
Processed chunk 20.
Processed chunk 21.
Processed chunk 22.
Processed chunk 23.
Processed chunk 24.
Processed chunk 25.
Processed chunk 26.
Processed chunk 27.
Processed chunk 28.
Processed chunk 29.
Processed chunk 30.
Processed chunk 31.
Processed chunk 32.
Process

KeyboardInterrupt: 

In [26]:
import pandas as pd

def create_training_dataset(events_filepath, users_filepath, output_filepath, verbose=False):
    # Load the events data
    events_df = pd.read_csv(events_filepath)
    if verbose:
        print("Events DataFrame head:\n", events_df.head())

    # Aggregate track IDs into a list per user
    user_tracks_list_df = events_df.groupby('user_id')['track_id'].apply(list).reset_index(name='track_id')
    if verbose:
        print("User-Tracks List DataFrame head:\n", user_tracks_list_df.head())

    # Count the distinct number of tracks each user has listened to
    user_track_distinct_count_df = events_df.groupby('user_id')['track_id'].nunique().reset_index(name='distinct_track_count')
    if verbose:
        print("User-Distinct Track Count DataFrame head:\n", user_track_distinct_count_df.head())

    # Merge the track IDs list and distinct track count with the users data
    users_df = pd.read_csv(users_filepath)
    if verbose:
        print("Users DataFrame head:\n", users_df.head())

    # Merge users data with track IDs list
    users_with_tracks_df = users_df.merge(user_tracks_list_df, on='user_id', how='left')
    
    # Merge the above result with the distinct track count
    final_df = users_with_tracks_df.merge(user_track_distinct_count_df, on='user_id', how='left')
    if verbose:
        print("Users with Track IDs and Distinct Counts DataFrame head:\n", final_df.head())

    # Drop duplicates to avoid repetition for each user
    final_df = final_df.drop_duplicates(subset='user_id')

    # Select the columns of interest
    final_df = final_df[['user_id', 'playcount', 'gender', 'distinct_track_count', 'track_id']]
    if verbose:
        print("Final DataFrame head:\n", final_df.head())
        
    # Drop records with missing values if necessary
    final_df = final_df.dropna(axis=0)

    # Save the final dataframe to a CSV file
    final_df.to_csv(output_filepath, index=False)

    return final_df


In [27]:
events_subset_path = "/home/etaylor/code_projects/track2vec/data/events_part_107.csv"

final_dataset = create_training_dataset(events_subset_path, users_filepath, output_filepath, verbose=True)

Events DataFrame head:
     user_id  artist_id  album_id  track_id   timestamp
0  20926447       3983     10756    789533  1353509220
1  20926447       3983     10756     30561  1353509011
2  20926447       3983      8428    789830  1353508616
3  20926447       3983      8428    789384  1353508441
4  20926447       3983      8428    789672  1353508189
User-Tracks List DataFrame head:
    user_id                                           track_id
0    10879  [5011085, 5011086, 5011087, 5011088, 5011089, ...
1  1002693  [3267123, 3267124, 3826841, 850917, 850918, 44...
2  1022229  [4070595, 1989184, 102683, 517342, 5208, 40705...
3  1026493  [5245580, 28025, 952133, 5245581, 5245582, 524...
4  1048016  [637019, 11967, 240455, 1008317, 411879, 10083...
User-Distinct Track Count DataFrame head:
    user_id  distinct_track_count
0    10879                  1454
1  1002693                  8220
2  1022229                  5509
3  1026493                 17157
4  1048016                  6787

In [29]:
print(f"Number of records {len(final_dataset)}")
final_dataset.head()

Number of records 2323


Unnamed: 0,user_id,playcount,gender,distinct_track_count,track_id
13,10879,147520,n,1454.0,"[5011085, 5011086, 5011087, 5011088, 5011089, ..."
33,1002693,69223,n,8220.0,"[3267123, 3267124, 3826841, 850917, 850918, 44..."
73,1022229,91248,m,5509.0,"[4070595, 1989184, 102683, 517342, 5208, 40705..."
85,1026493,13102,m,17157.0,"[5245580, 28025, 952133, 5245581, 5245582, 524..."
149,1048016,22963,m,6787.0,"[637019, 11967, 240455, 1008317, 411879, 10083..."


: 