# This file preprocesses the original Book-Crossing data.

In [None]:
import pandas as pd

# Keep only the users that have at least 10 interactions in the raw data.
file_path = "book-crossing.csv"
output_path = "bookcross1.csv"

# The input is tab-separated. Its first line is skipped and the three
# columns are named explicitly; every value is kept as a string.
df = pd.read_csv(
    file_path,
    sep="\t",
    header=None,
    names=["user_id", "item_id", "rating"],
    dtype=str,
    skiprows=1,
    low_memory=False,
)

# Number of rows per user.
user_interactions = df["user_id"].value_counts()

# Users appearing in at least 10 rows, and the rows that belong to them.
valid_users = user_interactions.loc[user_interactions.ge(10)].index
filtered_df = df.loc[df["user_id"].isin(valid_users)]

# Written tab-separated, without header line or index column.
filtered_df.to_csv(output_path, sep="\t", header=False, index=False)

print(f"✅ Processing complete! The new dataset has been saved as {output_path}")

In [None]:
import pandas as pd

# Remove every interaction whose rating is "0".
file_path = "bookcross1.csv"
output_path = "bookcross2.csv"

df = pd.read_csv(
    file_path,
    sep="\t",
    header=None,
    names=["user_id", "item_id", "rating"],
    dtype=str,
)

# Ratings were loaded as strings (dtype=str), hence the comparison
# against the string "0" rather than the number 0.
df_filtered = df.loc[df["rating"].ne("0")]

df_filtered.to_csv(output_path, sep="\t", header=False, index=False)

print(f"✅ Processing complete! Rows with a score of 0 have been removed and the new dataset has been saved as {output_path}")

In [None]:
import pandas as pd


# Keep only users who gave at least three "10" ratings and at least two
# "9" ratings.
input_path = "bookcross2.csv"
output_path = "bookcross3.csv"

df = pd.read_csv(input_path, sep="\t", header=None, names=["user_id", "item_id", "rating"], dtype=str)

# Count table: one row per user, one column per distinct rating value
# (ratings are strings because of dtype=str).
user_rating_counts = df.groupby(["user_id", "rating"]).size().unstack(fill_value=0)

# Make sure the "9" and "10" columns exist (filled with 0 when absent).
# A ``.get(col, 0)`` fallback yields a plain ``False`` for a missing
# column; if both are missing the mask is the scalar ``False`` and
# indexing the frame with it raises KeyError instead of selecting nobody.
rating_counts = user_rating_counts.reindex(columns=["9", "10"], fill_value=0)
has_enough_top_ratings = (rating_counts["10"] >= 3) & (rating_counts["9"] >= 2)
valid_users = user_rating_counts[has_enough_top_ratings].index

filtered_df = df[df["user_id"].isin(valid_users)]

filtered_df.to_csv(output_path, sep="\t", index=False, header=False)

# Summary statistics of the filtered dataset.
num_users = filtered_df["user_id"].nunique()
num_items = filtered_df["item_id"].nunique()
num_interactions = len(filtered_df)

print(f"✅ Processing complete! The new dataset has been saved as {output_path}")
print(f"Number of users: {num_users}, Number of items: {num_items}, Number of interactions: {num_interactions}")

In [None]:
import pandas as pd

# Re-apply the "at least 10 interactions per user" filter.
input_path = "bookcross3.csv"
output_path = "bookcross4.csv"

df = pd.read_csv(
    input_path,
    sep="\t",
    header=None,
    names=["user_id", "item_id", "rating"],
    dtype=str,
)

# Number of rows per user.
user_interactions = df["user_id"].value_counts()

# Users appearing in at least 10 rows, and the rows that belong to them.
valid_users = user_interactions.loc[user_interactions.ge(10)].index
filtered_df = df.loc[df["user_id"].isin(valid_users)]

filtered_df.to_csv(output_path, sep="\t", header=False, index=False)

# Summary statistics of what was written.
num_interactions = filtered_df.shape[0]
num_users = filtered_df["user_id"].nunique()
num_items = filtered_df["item_id"].nunique()

print(f"✅ Processing complete! The new dataset has been saved as {output_path}")
print(f"Number of users: {num_users}, Number of items: {num_items}, Number of interactions: {num_interactions}")

In [None]:
import pandas as pd


# Apply three filters one after another: items seen more than once, users
# with enough "10"/"9" ratings, users with at least 10 interactions.
# NOTE(review): each filter runs once, so a later filter can leave items
# that again occur in only one row — confirm this is acceptable.
input_path = "bookcross4.csv"
output_path = "bookcross5.csv"

df = pd.read_csv(input_path, sep="\t", header=None, names=["user_id", "item_id", "rating"], dtype=str)

# 1) Drop items that occur in only a single row.
item_counts = df["item_id"].value_counts()
valid_items = item_counts[item_counts > 1].index
df = df[df["item_id"].isin(valid_items)]

# 2) Keep users with at least three "10" ratings and at least two "9"
#    ratings. The reindex guarantees both columns exist (0 when absent);
#    a ``.get(col, 0)`` fallback would collapse to the scalar ``False``
#    when both are missing, and indexing the frame with it raises KeyError.
user_rating_counts = df.groupby(["user_id", "rating"]).size().unstack(fill_value=0)
rating_counts = user_rating_counts.reindex(columns=["9", "10"], fill_value=0)
has_enough_top_ratings = (rating_counts["10"] >= 3) & (rating_counts["9"] >= 2)
valid_users = user_rating_counts[has_enough_top_ratings].index
df = df[df["user_id"].isin(valid_users)]

# 3) Keep users that still have at least 10 interactions.
user_interactions = df["user_id"].value_counts()
final_valid_users = user_interactions[user_interactions >= 10].index
df = df[df["user_id"].isin(final_valid_users)]

df.to_csv(output_path, sep="\t", index=False, header=False)

# Summary statistics of the filtered dataset.
num_users = df["user_id"].nunique()
num_items = df["item_id"].nunique()
num_interactions = len(df)

print(f"✅ Processing complete! The new dataset has been saved as {output_path}")
print(f"Number of users: {num_users}, Number of items: {num_items}, Number of interactions: {num_interactions}")

In [None]:
import pandas as pd

# Restrict the interactions to items listed in the item file, then re-apply
# the item / rating / interaction-count filters.
item_file = "book-crossing.item.csv"
bookcross_file = "bookcross5.csv"
output_file = "bookcross8.csv"

# The item file is read without a header row, so every line (including a
# header line, if the file has one) is treated as data. Only the first
# column is used, as the set of known item ids.
df_items = pd.read_csv(item_file, sep="\t", header=None, dtype=str)
item_column_name = df_items.columns[0]
df_items = df_items.rename(columns={item_column_name: "item_id"})
valid_items = set(df_items["item_id"])

df = pd.read_csv(bookcross_file, sep="\t", header=None, names=["user_id", "item_id", "rating"], dtype=str)

# Keep only interactions whose item is present in the item file.
df = df[df["item_id"].isin(valid_items)]

# Drop items that occur in only a single row.
item_counts = df["item_id"].value_counts()
valid_items = item_counts[item_counts > 1].index
df = df[df["item_id"].isin(valid_items)]

# Convert ratings to numbers (unparseable values become NaN). ``assign``
# builds a new frame, which avoids writing into a filtered slice and the
# SettingWithCopyWarning that goes with it.
df = df.assign(rating=pd.to_numeric(df["rating"], errors="coerce"))

# Keep users with at least three ratings of 10 and at least two of 9.
# The reindex guarantees both columns exist (0 when absent) so the mask is
# always a boolean Series, never the scalar ``False``.
user_rating_counts = df.groupby(["user_id", "rating"]).size().unstack(fill_value=0)
rating_counts = user_rating_counts.reindex(columns=[9, 10], fill_value=0)
has_enough_top_ratings = (rating_counts[10] >= 3) & (rating_counts[9] >= 2)
valid_users = user_rating_counts[has_enough_top_ratings].index
df = df[df["user_id"].isin(valid_users)]

# Keep users that still have at least 10 interactions.
user_interactions = df["user_id"].value_counts()
valid_users = user_interactions[user_interactions >= 10].index
df = df[df["user_id"].isin(valid_users)]

df.to_csv(output_file, sep="\t", index=False, header=False)

# Summary statistics of the filtered dataset.
num_users = df["user_id"].nunique()
num_items = df["item_id"].nunique()
num_interactions = len(df)

# Report the file actually written by this cell (``output_file``); the
# name ``output_path`` is not defined in this cell.
print(f"✅ Processing complete! The new dataset has been saved as {output_file}")
print(f"Number of users: {num_users}, Number of items: {num_items}, Number of interactions: {num_interactions}")

In [None]:
import pandas as pd

# Sort the interactions by user and replace the original user ids with
# consecutive numbers starting at 1.
input_file = "bookcross8.csv"
sorted_output_file = "bookcross8-1.csv"
mapped_output_file = "bookcross8-2.csv"
user_map_file = "bookcross-user-map.txt"

df = pd.read_csv(
    input_file,
    sep="\t",
    header=None,
    names=["user_id", "item_id", "rating"],
    dtype=str,
)

# user_id is a string column here, so the ordering is lexicographic.
df_sorted = df.sort_values("user_id", ignore_index=True)
df_sorted.to_csv(sorted_output_file, sep="\t", header=False, index=False)

# Distinct users in order of first appearance in the sorted frame are
# numbered 1..N; the new ids are stored as strings.
unique_users = df_sorted["user_id"].unique()
new_ids = [str(number) for number in range(1, len(unique_users) + 1)]
user_mapping = dict(zip(unique_users, new_ids))

# The mapping file has the layout: new id <TAB> old id.
df_user_map = pd.DataFrame(
    {
        "old_user_id": list(user_mapping.keys()),
        "new_user_id": list(user_mapping.values()),
    }
)
df_user_map.loc[:, ["new_user_id", "old_user_id"]].to_csv(
    user_map_file, sep="\t", header=False, index=False
)

# Swap the original ids for the new ones and save the remapped interactions.
df_sorted["user_id"] = df_sorted["user_id"].map(user_mapping)
df_sorted.to_csv(mapped_output_file, sep="\t", header=False, index=False)

# Summary statistics of the remapped dataset.
num_interactions = df_sorted.shape[0]
num_users = df_sorted["user_id"].nunique()
num_items = df_sorted["item_id"].nunique()

print("✅ Processing completed!")
print(f"📂 Sorted file saved: {sorted_output_file}")
print(f"📂 Remapped user ID file saved: {user_map_file}")
print(f"📂 File with `user_id` replaced with new ID saved: {mapped_output_file}")
print(f"📊 Number of users: {num_users}")
print(f"📊 Number of items: {num_items}")
print(f"📊 Number of interactions: {num_interactions}")

In [None]:
import pandas as pd

# File paths
bookcross_input = "bookcross8-2.csv"
item_input = "book-crossing.item.csv"
filtered_item_output = "book-crossing.item-11.csv"
mapped_bookcross_output = "bookcross8-3.csv"
item_map_file = "bookcross-item-map.txt"

# Interactions (no header row); every column is kept as a string.
df_bookcross = pd.read_csv(bookcross_input, sep="\t", header=None, names=["user_id", "item_id", "rating"], dtype=str)

# Item metadata; the first line of the file is used as the header row.
df_items = pd.read_csv(item_input, sep="\t", dtype=str)

# Keep only the item rows whose id (first column) occurs in the
# interactions. ``.copy()`` makes the result an independent frame, so the
# column assignment below does not write into a slice of ``df_items``
# (which would trigger pandas' SettingWithCopyWarning).
valid_items = set(df_bookcross["item_id"])
df_filtered_items = df_items[df_items.iloc[:, 0].isin(valid_items)].copy()

# Number the remaining items 1..N in the order they appear in the item
# file; the new ids are stored as strings.
unique_items = df_filtered_items.iloc[:, 0].unique()
item_mapping = {old_id: str(new_id) for new_id, old_id in enumerate(unique_items, start=1)}

# The mapping file has the layout: new id <TAB> old id.
df_item_map = pd.DataFrame(list(item_mapping.items()), columns=["old_item_id", "new_item_id"])
df_item_map[["new_item_id", "old_item_id"]].to_csv(item_map_file, sep="\t", index=False, header=False)

# Replace the ids in the item file and save it with its header row.
df_filtered_items.iloc[:, 0] = df_filtered_items.iloc[:, 0].map(item_mapping)
df_filtered_items.to_csv(filtered_item_output, sep="\t", index=False, header=True)

# Replace the ids in the interactions as well.
# NOTE(review): an item id that is missing from the item file has no entry
# in ``item_mapping`` and becomes NaN here — confirm the input was already
# restricted to items present in the item file.
df_bookcross["item_id"] = df_bookcross["item_id"].map(item_mapping)
df_bookcross.to_csv(mapped_bookcross_output, sep="\t", index=False, header=False)

# Summary statistics of the remapped interactions.
num_items = df_bookcross["item_id"].nunique()
num_users = df_bookcross["user_id"].nunique()
num_interactions = len(df_bookcross)

print(f"✅ Processing completed!")
print(f"📂 Filtered and renumbered item file saved: {filtered_item_output}")
print(f"📂 Renumbered bookcross8-3.csv file saved: {mapped_bookcross_output}")
print(f"📂 item_id mapping file saved: {item_map_file}")
print(f"📊 Number of users: {num_users}")
print(f"📊 Number of items: {num_items}")
print(f"📊 Number of interactions: {num_interactions}")

In [None]:
import pandas as pd
import numpy as np

# File paths
input_file, output_file = "bookcross8-3.csv", "bookcross8-4.csv"

# Load the interactions (tab-separated, no header row) with every column
# kept as a string.
df = pd.read_csv(
    input_file,
    sep="\t",
    header=None,
    names=["user_id", "item_id", "rating"],
    dtype=str,
)


def random_timestamp():
    """Return a randomly drawn calendar date formatted as ``YYYYMMDD``.

    A whole number of seconds is drawn with ``np.random.randint`` between
    2000-01-01 00:00:00 (inclusive) and 2023-12-31 00:00:00 (exclusive),
    so the result ranges from "20000101" to "20231230". The draw uses
    NumPy's global random state.
    """
    earliest = pd.Timestamp("2000-01-01").timestamp()
    latest = pd.Timestamp("2023-12-31").timestamp()
    seconds = np.random.randint(earliest, latest, dtype=np.int64)
    return pd.Timestamp(seconds, unit="s").strftime("%Y%m%d")


def _timestamps_for_group(items):
    """Return one random date string per row of a single user's group.

    As many candidate dates as the group has rows are generated, then the
    same number of values is drawn from those candidates with
    ``np.random.choice`` (default settings, i.e. with replacement).
    """
    group_size = len(items)
    candidates = [random_timestamp() for _ in range(group_size)]
    return np.random.choice(candidates, size=group_size)


# Attach a synthetic timestamp column. The values are random and no seed
# is set here, so they differ from run to run.
df["timestamp"] = df.groupby("user_id")["item_id"].transform(_timestamps_for_group)

df.to_csv(output_file, sep="\t", header=False, index=False)

# Summary statistics of the written dataset.
num_interactions = df.shape[0]
num_users = df["user_id"].nunique()
num_items = df["item_id"].nunique()

print(f"✅ Processing completed! New dataset saved as {output_file}")
print(f"📊 Number of users: {num_users}")
print(f"📊 Number of items: {num_items}")
print(f"📊 Number of interactions: {num_interactions}")

In [None]:
import pandas as pd


# Reduce the renumbered item file to its first two columns and store it
# pipe-separated without a header line.
input_file = "book-crossing.item-11.csv"
output_file = "book-crossing.item-12.csv"

# The first line of the input is its header row; all values stay strings.
df = pd.read_csv(input_file, sep="\t", header=0, dtype=str).iloc[:, :2]

df.to_csv(output_file, sep="|", header=False, index=False)

# One row per item.
num_items = len(df)

print(f"✅ Processing completed! New dataset saved as {output_file}")
print(f"📊 Number of items: {num_items}")

In [None]:
book-crossing.item-12.csv
bookcross8-4.csv