Read from music dataset and write it into 

In [None]:
!pip install pandas

In [None]:
# make working directory if not present
!mkdir -p raw/datasets/
%cd ./raw/datasets/
# fetch datasource for 
!wget -N https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/review_categories/CDs_and_Vinyl.jsonl.gz
!wget -N https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/meta_categories/meta_CDs_and_Vinyl.jsonl.gz

In [1]:
import pandas as pd
import gzip
import json

def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

# add filter during loading
def getDF(path, limit=-1, filter=None):
  i = 0
  df = {}
  for d in parse(path):
    if(limit >0):
      if(i>=limit):
        break
    if(filter and filter(d)):
      #print(d["title"] + "|"+str(d["average_rating"])+"|"+str(d["rating_number"]))
      continue
    else:
      df[i] = d
      i += 1

  return pd.DataFrame.from_dict(df, orient='index')

In [2]:
def filterByRating(data, average_rating=3, rating_number=20):
    if(data['average_rating'] and  data['rating_number']):
        return data['average_rating']< average_rating or data['rating_number'] < rating_number
    else:
        return False

def filterByASIN(asin_list=[]):
    def filterInList(data):
        if(data['asin']):
            return (data['asin'] not in asin_list)
        else:
            return True
    return filterInList

def filterByParentASIN(asin_list=[]):
    def filterInList(data):
        if(data['parent_asin']):
            return (data['parent_asin'] not in asin_list)
        else:
            return True
    return filterInList
  

In [3]:
FIELD_CATEGORY = 'categories' #for dataset older than 2023, use 'category'
FIELD_JOIN_KEY = 'parent_asin' #for dataset older than 2023, use 'asin'

RATINGS_FILE_NAME = "ratings.dat"
CONTENT_FILE_NAME = "music.dat"

SEPERATOR = ";"

# making working directory for music
!mkdir -p music
%pwd

'c:\\Users\\yash_pc\\Desktop\\Laurier\\Spring Term\\Android app programming\\Project\\Practice\\CP670-Project_updated\\CP670-Project\\ml\\data'

In [5]:
import re
# Step 1: Read the json file into a DataFrame
input_file = 'meta_CDs_and_Vinyl.jsonl.gz'
df = getDF(input_file, 1000, filterByRating) #enabled for 2023 data
# set music_id according to the since the model requires sequence for music_id
# df["music_id"] = df["parent_asin"].apply(hashToInt)
df.sort_values(by="rating_number")
df["music_id"] = df.reindex().index +1
# Step 2: 
# Filter out default genres
genres_to_be_filtered = ["CDs & Vinyl"]
df[FIELD_CATEGORY] = df[FIELD_CATEGORY].apply(lambda row: [ele for ele in row if ele not in genres_to_be_filtered])
# Filter rows where 'category' is non-empty or an empty array
filtered_content = df[df[FIELD_CATEGORY].apply(lambda row: isinstance(row, list) and len(row) > 0)]
# Show the filtered DataFrame
print(filtered_content[FIELD_CATEGORY].count())

# tokenize the list in 'category' and concat them with '|' as a sepa
filtered_content['genres'] = filtered_content[FIELD_CATEGORY].apply(lambda row: '|'.join(map(str, (row + [item.strip() for sublist in [re.split(r" & | ",ele) for ele in row] for item in sublist]))))

# Step 3: Pick necessary columns and save to csv file
selected_content = ["music_id", "title", "genres"]#enabled for 2023 data
df_content = filtered_content[selected_content]
content_file = 'music/{file_name}'.format(file_name = CONTENT_FILE_NAME)
df_content.to_csv(content_file, sep=SEPERATOR, index=False, header=False)

1000


In [6]:
# # Step 1: Read the json file into a DataFrame
input_file = 'CDs_and_Vinyl.jsonl.gz'

# step 2: Preprocess the data 
asin_list=filtered_content[FIELD_JOIN_KEY].to_list()#enabled for 2023 data
rating_sum = filtered_content["rating_number"].sum()
print(f"Fetch comments for  {len(asin_list)} products and supposed ratings to be recieved= {rating_sum}")
df = getDF(input_file,rating_sum, filterByParentASIN(asin_list))#enabled for 2023 data
# TODO: sum the ratings by parent_asin to see the different in rating_number and actually ratings counted

df = df.set_index(FIELD_JOIN_KEY).join(filtered_content.set_index(FIELD_JOIN_KEY), rsuffix="meta_")

# Step 3: Pick necessary columns and save to csv 
selected_rating = ["user_id", "music_id", "rating","timestamp"] #enabled for 2023 data
df_rating = df[selected_rating]
rating_file = 'music/{file_name}'.format(file_name = RATINGS_FILE_NAME)
df_rating.to_csv(rating_file, sep=SEPERATOR, index=False, header=False)

Fetch comments for  1000 products and supposed ratings to be recieved= 244726


In [7]:
import os
RATINGS_DATA_COLUMNS = ["UserID", "MovieID", "Rating", "Timestamp"]
MOVIES_DATA_COLUMNS = ["MovieID", "Title", "Genres"]

print("First few lines for ratings and movies:")

# read the data 
ratings_df = pd.read_csv(
    os.path.join(os.curdir, "music",RATINGS_FILE_NAME),
    sep=SEPERATOR,
    names=RATINGS_DATA_COLUMNS,
    encoding="unicode_escape")

print(ratings_df.head())

movies_df = pd.read_csv(
    os.path.join(os.curdir, "music", CONTENT_FILE_NAME),
    sep=SEPERATOR,
    names=MOVIES_DATA_COLUMNS,
    encoding="unicode_escape")  # May contain unicode. Need to escape.

print(movies_df.head())

First few lines for ratings and movies:
                         UserID  MovieID  Rating      Timestamp
0  AEVQ3KP55X4XECXWMHN6DHIDBYFQ      179     5.0  1406805139000
1  AFAIJYOUO3NAWLBDIKTQSC3DASWA      541     5.0  1149955013000
2  AHB5CGLYN3Y6NIPHNQLYFJT2W2PQ      981     5.0  1524455984589
3  AE7BV6IMNPZ3F266H7PXMH3BZQNQ      266     5.0  1637093229001
4  AGEFGQNUBPAGS4JYBD664PSZKDNQ      728     5.0  1351747347000
   MovieID                     Title  \
0        1      Release Some Tension   
1        2         Somewhere in Time   
2        3  Heavy Hearts, Hard Fists   
3        4      Malice in Wonderland   
4        5                Elvis Live   

                                              Genres  
0    Dance & Electronic|House|Dance|Electronic|House  
1  Soundtracks|Movie Scores|Soundtracks|Movie|Scores  
2                                            Pop|Pop  
3  Rock|Progressive|Progressive Rock|Rock|Progres...  
4                              Pop|Oldies|Pop|Oldies  
