# Amazon Dataset Subsets (10 smallest)

source:
https://cseweb.ucsd.edu/~jmcauley/datasets/amazon/links.html

The code below processes ten subsets of the "Amazon Review 2014" dataset and prepares them for use with RecBole, a recommender system framework. For each subset it reads the ratings CSV, names the columns in the format RecBole requires (`user_id:token`, `item_id:token`, `rating:float`, `timestamp:float`), and saves the data into three tab-separated files: an interaction file (`*.inter`), a user file (`*.user`), and an item file (`*.item`). Each dataset's output files are stored in their own directory (`dataset/<dataset_name>/`), and after saving, the code prints the first few rows of the data written to each file as a quick sanity check.

In [None]:
import os
import pandas as pd

# Ratings CSVs to convert; every file follows the 'ratings_<category>.csv' naming scheme
files = [
    f'ratings_{category}.csv'
    for category in (
        'Amazon_Instant_Video',
        'Musical_Instruments',
        'Digital_Music',
        'Baby',
        'Patio_Lawn_and_Garden',
        'Office_Products',
        'Grocery_and_Gourmet_Food',
        'Pet_Supplies',
        'Automotive',
        'Apps_for_Android',
    )
]

# Function to process each file
def process_file(file_name: str) -> None:
    """Convert one headerless ratings CSV into RecBole-style atomic files.

    The CSV is read without a header row and its four columns are labelled
    ``user_id:token``, ``item_id:token``, ``rating:float`` and
    ``timestamp:float``. Three tab-separated files are then written to
    ``dataset/<dataset_name>/``, where ``<dataset_name>`` is the CSV's file
    name without directory or extension:

    * ``<dataset_name>.inter`` - every row of the input,
    * ``<dataset_name>.user``  - the unique user ids, in first-seen order,
    * ``<dataset_name>.item``  - the unique item ids, in first-seen order.

    The first rows of each table are printed as a quick sanity check.

    Args:
        file_name: Path to the input CSV. It may include directories; only
            the base name is used to name the output directory and files.

    Raises:
        FileNotFoundError: If ``file_name`` does not exist.
        ValueError: If the CSV does not have exactly four columns.
    """
    # Load the dataset (the raw file carries no header row)
    df = pd.read_csv(file_name, header=None)

    # Rename columns to match RecBole's requirements
    df.columns = ['user_id:token', 'item_id:token', 'rating:float', 'timestamp:float']

    # Create the directory structure. Use only the base name so that a path
    # such as 'raw/ratings_Baby.csv' still maps to 'dataset/ratings_Baby/'
    # instead of leaking 'raw/' into the output directory and file names.
    dataset_name = os.path.splitext(os.path.basename(file_name))[0]
    output_dir = f'dataset/{dataset_name}'
    os.makedirs(output_dir, exist_ok=True)

    # Save the interaction file
    output_file_inter = f'{output_dir}/{dataset_name}.inter'
    df.to_csv(output_file_inter, index=False, sep='\t')

    # Create and save the user file
    user_df = pd.DataFrame(df['user_id:token'].unique(), columns=['user_id:token'])
    output_file_user = f'{output_dir}/{dataset_name}.user'
    user_df.to_csv(output_file_user, index=False, sep='\t')

    # Create and save the item file
    item_df = pd.DataFrame(df['item_id:token'].unique(), columns=['item_id:token'])
    output_file_item = f'{output_dir}/{dataset_name}.item'
    item_df.to_csv(output_file_item, index=False, sep='\t')

    # Print out the first few rows of each table for confirmation
    print(f"First few rows of the interaction file for {dataset_name}:")
    print(df.head())
    print(f"Interaction file saved as '{output_file_inter}'.")

    print(f"First few lines of the user file for {dataset_name}:")
    print(user_df.head())
    print(f"User file saved as '{output_file_user}'.")

    print(f"First few lines of the item file for {dataset_name}:")
    print(item_df.head())
    print(f"Item file saved as '{output_file_item}'.")
    print("-" * 30)

# Convert every listed ratings CSV, one dataset at a time
for csv_name in files:
    process_file(csv_name)

First few rows of the interaction file for ratings_Amazon_Instant_Video:
    user_id:token item_id:token  rating:float  timestamp:float
0  A1EE2E3N7PW666    B000GFDAUG           5.0       1202256000
1   AGZ8SM1BGK3CK    B000GFDAUG           5.0       1198195200
2  A2VHZ21245KBT7    B000GIOPK2           4.0       1215388800
3   ACX8YW2D5EGP6    B000GIOPK2           4.0       1185840000
4   A9RNMO9MUSMTJ    B000GIOPK2           2.0       1281052800
Interaction file saved as 'dataset/ratings_Amazon_Instant_Video/ratings_Amazon_Instant_Video.inter'.
First few lines of the user file for ratings_Amazon_Instant_Video:
    user_id:token
0  A1EE2E3N7PW666
1   AGZ8SM1BGK3CK
2  A2VHZ21245KBT7
3   ACX8YW2D5EGP6
4   A9RNMO9MUSMTJ
User file saved as 'dataset/ratings_Amazon_Instant_Video/ratings_Amazon_Instant_Video.user'.
First few lines of the item file for ratings_Amazon_Instant_Video:
  item_id:token
0    B000GFDAUG
1    B000GIOPK2
2    B000GIPKWY
3    B000GJUQ7M
4    B000GK0NBK
Item file saved a

# Google Restaurants 108K dataset

source:
https://cseweb.ucsd.edu/~jmcauley/datasets.html#google_restaurants
https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/googlelocal_restaurants/

## Original Dataset Information

- **Dataset Name**: google_restaurants_108K.json
- **File Size**: 112 MB
- **Features** (in the 'train' part of the data):
  - `user_id`
  - `business_id`
  - `rating`

## Edited Dataset Information

### Interaction File (`*.inter`)

- **Features**:
  - `user_id:token` (corresponds to `user_id`)
  - `item_id:token` (corresponds to `business_id`)
  - `rating:float` (corresponds to `rating`)

### User File (`*.user`)

- **Features**:
  - `user_id:token` (list of unique user IDs from `user_id`)

### Item File (`*.item`)

- **Features**:
  - `item_id:token` (list of unique item IDs from `business_id`)

  




In [None]:
import os
import pandas as pd
import json

# Load the dataset
file_path = 'google_restaurants_108K.json'
# Decode explicitly as UTF-8 (the JSON interchange encoding) rather than the
# platform default, so the load behaves the same on every OS/locale.
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten the 'train' part of the data
df_train = pd.json_normalize(data['train'])

# Extract only the necessary columns for RecBole and rename them by name
# (not by position) to RecBole's expected '<field>:<type>' format.
df_inter = df_train[['user_id', 'business_id', 'rating']].rename(columns={
    'user_id': 'user_id:token',
    'business_id': 'item_id:token',
    'rating': 'rating:float',
})

# Create the directory structure
output_dir = 'dataset/google_restaurants_108K'
os.makedirs(output_dir, exist_ok=True)

# Save the interaction file without a timestamp
output_file_inter = f'{output_dir}/google_restaurants_108K.inter'
df_inter.to_csv(output_file_inter, index=False, sep='\t')

# Create and save the user file (unique user ids, first-seen order)
user_df = pd.DataFrame(df_inter['user_id:token'].unique(), columns=['user_id:token'])
output_file_user = f'{output_dir}/google_restaurants_108K.user'
user_df.to_csv(output_file_user, index=False, sep='\t')

# Create and save the item file (unique item ids, first-seen order)
item_df = pd.DataFrame(df_inter['item_id:token'].unique(), columns=['item_id:token'])
output_file_item = f'{output_dir}/google_restaurants_108K.item'
item_df.to_csv(output_file_item, index=False, sep='\t')

# Read each file back from disk and print its first rows for verification
print("First few rows of the interaction file:")
print(pd.read_csv(output_file_inter, sep='\t').head())

print("First few rows of the user file:")
print(pd.read_csv(output_file_user, sep='\t').head())

print("First few rows of the item file:")
print(pd.read_csv(output_file_item, sep='\t').head())

print("Data restructuring and verification complete. Files are ready for RecBole.")

First few rows of the interaction file:
           user_id:token             item_id:token  rating:float
0  101074926318992653684  60567465d335d0abfb415b26             4
1  117065749986299237881  6050fa9f5b4ccec8d5cae994             5
2  106700937793048450809  604be10877e81aaed3cc9a1e             4
3  101643045857250355161  60411e017cd8bf130362365a             5
4  109802745326785766951  604139dd7cd8bf1303624208             4
First few rows of the user file:
           user_id:token
0  101074926318992653684
1  117065749986299237881
2  106700937793048450809
3  101643045857250355161
4  109802745326785766951
First few rows of the item file:
              item_id:token
0  60567465d335d0abfb415b26
1  6050fa9f5b4ccec8d5cae994
2  604be10877e81aaed3cc9a1e
3  60411e017cd8bf130362365a
4  604139dd7cd8bf1303624208
Data restructuring and verification complete. Files are ready for RecBole.


# Kaggle Beer rating dataset
source:
https://www.kaggle.com/datasets/ankurnapa/rate-beer-data?resource=download

## Original Dataset Information

- **Dataset Name**: beer_reviews.csv
- **File Size**: 173 MB
- **Features**:
  - `brewery_id`
  - `brewery_name`
  - `review_time`
  - `review_overall`
  - `review_aroma`
  - `review_appearance`
  - `review_profilename`
  - `beer_style`
  - `review_palate`
  - `review_taste`
  - `beer_name`
  - `beer_abv`
  - `beer_beerid`

## Edited Dataset Information

### Interaction File (`*.inter`)

- **Features**:
  - `user_id:token` (corresponds to `review_profilename`)
  - `item_id:token` (corresponds to `beer_beerid`)
  - `rating:float` (corresponds to `review_overall`)

### User File (`*.user`)

- **Features**:
  - `user_id:token` (list of unique user IDs from `review_profilename`)

### Item File (`*.item`)

- **Features**:
  - `item_id:token` (list of unique item IDs from `beer_beerid`)


## Edited Dataset Information

### Interaction File (`*.inter`)

- **Features**:
  - `user_id:token` (corresponds to `review_profilename`)
  - `item_id:token` (corresponds to `beer_beerid`)
  - `rating:float` (corresponds to `review_overall`)

### User File (`*.user`)

- **Features**:
  - `user_id:token` (list of unique user IDs from `review_profilename`)

### Item File (`*.item`)

- **Features**:
  - `item_id:token` (list of unique item IDs from `beer_beerid`)


In [5]:
import os

import pandas as pd

# Load the CSV file
file_path = 'beer_reviews.csv'
df = pd.read_csv(file_path)

# Display the first few rows to inspect the data
print("First few rows of the data:")
print(df.head())

# Display column names to understand the structure
print("\nColumn names:")
print(df.columns)

# Rename columns to match RecBole's requirements and keep only those fields
df_inter = df.rename(columns={
    'review_profilename': 'user_id:token',
    'beer_beerid': 'item_id:token',
    'review_overall': 'rating:float'
})[['user_id:token', 'item_id:token', 'rating:float']]

# A row without a user, item or rating cannot be used as an interaction and
# would otherwise end up as an empty token in the output files.
# NOTE(review): confirm that dropping such rows is acceptable for this dataset.
df_inter = df_inter.dropna()

# Display the first few rows of the interaction data
print("\nFirst few rows of the interaction file:")
print(df_inter.head())

# Create the directory structure (dataset/<name>/<name>.*), matching the
# layout used for the other datasets in this notebook
output_dir = 'dataset/beer_reviews'
os.makedirs(output_dir, exist_ok=True)

# Save the interaction data to *.inter file: tab-separated, with the
# '<field>:<type>' header row, like the other datasets' files
inter_file_path = f'{output_dir}/beer_reviews.inter'
df_inter.to_csv(inter_file_path, index=False, sep='\t')

# Create the unique users and items (first-seen order)
unique_users = df_inter['user_id:token'].drop_duplicates().reset_index(drop=True)
unique_items = df_inter['item_id:token'].drop_duplicates().reset_index(drop=True)

# Save the unique users to *.user file
user_file_path = f'{output_dir}/beer_reviews.user'
unique_users.to_csv(user_file_path, index=False, header=True, sep='\t')

# Save the unique items to *.item file
item_file_path = f'{output_dir}/beer_reviews.item'
unique_items.to_csv(item_file_path, index=False, header=True, sep='\t')

# Read the user and item files back from disk and display their first rows
print("\nFirst few rows of the user file:")
print(pd.read_csv(user_file_path, sep='\t').head())

print("\nFirst few rows of the item file:")
print(pd.read_csv(item_file_path, sep='\t').head())

print("\nFiles have been saved:")
print(f"Interaction file: {inter_file_path}")
print(f"User file: {user_file_path}")
print(f"Item file: {item_file_path}")

First few rows of the data:
   brewery_id             brewery_name  review_time  review_overall  \
0       10325          Vecchio Birraio   1234817823             1.5   
1       10325          Vecchio Birraio   1235915097             3.0   
2       10325          Vecchio Birraio   1235916604             3.0   
3       10325          Vecchio Birraio   1234725145             3.0   
4        1075  Caldera Brewing Company   1293735206             4.0   

   review_aroma  review_appearance review_profilename  \
0           2.0                2.5            stcules   
1           2.5                3.0            stcules   
2           2.5                3.0            stcules   
3           3.0                3.5            stcules   
4           4.5                4.0     johnmichaelsen   

                       beer_style  review_palate  review_taste  \
0                      Hefeweizen            1.5           1.5   
1              English Strong Ale            3.0           3.0   
2   