In [None]:
import pandas as pd


# --- Step 1: drop interactions that refer to items missing from the item table ---
# All files are tab-separated.
item_file = "item.csv"
inter_file = "inter.csv"
output_file = "inter1.csv"

# Everything is loaded as text so that ids are compared as strings. With the
# pandas defaults used here, the first line of each file becomes the header.
df_items = pd.read_csv(item_file, sep="\t", dtype=str)
df_inter = pd.read_csv(inter_file, sep="\t", dtype=str)

# The set of known ids is taken from the first column of the item table.
valid_items = set(df_items.iloc[:, 0])

# The second column of the interaction table is matched against that set
# (presumably the item id column — the next step names it "item_id").
is_known_item = df_inter.iloc[:, 1].isin(valid_items)
df_filtered = df_inter[is_known_item]

# The filtered table is written with its header row and without the index.
df_filtered.to_csv(output_file, sep="\t", index=False)

# Row counts before and after filtering.
total_rows = df_inter.shape[0]
filtered_rows = df_filtered.shape[0]
removed_rows = total_rows - filtered_rows

summary_lines = (
    f"✅ Processing complete! The new dataset has been saved as {output_file}",
    f"Total number of interactions (inter.csv): {total_rows}",
    f"Matched interactions: {filtered_rows}",
    f"Number of interactions deleted: {removed_rows}",
)
for summary_line in summary_lines:
    print(summary_line)

In [None]:
import pandas as pd

# --- Step 2: keep only users with at least MIN_USER_INTERACTIONS interactions ---
input_file = "inter1.csv"
output_file = "inter2.csv"

# Minimum number of interactions a user needs in order to be retained.
MIN_USER_INTERACTIONS = 10

# NOTE(review): the file is read with header=None, so if inter1.csv starts with a
# header line, that line is parsed as an ordinary row. Its user_id value would
# normally occur only once and therefore be removed by the count filter below —
# confirm that this is the intended way of getting rid of the header.
df = pd.read_csv(
    input_file,
    sep="\t",
    header=None,
    names=["user_id", "item_id", "rating", "timestamp"],
    dtype=str,
)

# Number of interactions per user.
user_interaction_counts = df["user_id"].value_counts()

valid_users = user_interaction_counts[user_interaction_counts >= MIN_USER_INTERACTIONS].index
df_filtered = df[df["user_id"].isin(valid_users)]

df_filtered.to_csv(output_file, sep="\t", index=False, header=False)

# The statistics are computed from this cell's own frames. The previous version
# printed total_rows / filtered_rows / removed_rows, which are never defined in
# this cell: on a fresh kernel that raises NameError, otherwise it reports stale
# numbers left over from another cell.
num_input_interactions = len(df)
num_users = df_filtered["user_id"].nunique()
num_items = df_filtered["item_id"].nunique()
num_interactions = len(df_filtered)

print(f"✅ Processing complete! The new dataset has been saved as {output_file}")
print(f"Total number of interactions ({input_file}): {num_input_interactions}")
print(f"Number of users: {num_users}")
print(f"Number of items: {num_items}")
print(f"Retained interactions: {num_interactions}")
print(f"Number of interactions deleted: {num_input_interactions - num_interactions}")

In [None]:
import pandas as pd

# --- Step 3: drop interactions whose item has no usable title ---
inter_file = "inter2.csv"
item_file = "item.csv"
output_file = "inter2-1.csv"

df_inter = pd.read_csv(inter_file, sep="\t", header=None, names=["user_id", "item_id", "rating", "timestamp"], dtype=str)
# NOTE(review): item.csv is read with header=None here, so a header line (if the
# file has one) is parsed as an ordinary item row — confirm against the file.
df_item = pd.read_csv(item_file, sep="\t", header=None, names=["item_id", "title", "price", "sales_type", "sales_rank", "categories", "brand"], dtype=str)

# An item counts as "titled" when its title is neither missing nor made up of
# whitespace only.
item_titles = df_item["title"]
has_title = item_titles.notna() & item_titles.str.strip().ne("")
titled_item_ids = set(df_item.loc[has_title, "item_id"])

# Filtering by membership (instead of merging the title column in) keeps exactly
# one output row per input interaction even when item.csv lists an item id more
# than once, and the boolean mask is built on the very frame it is applied to.
keeps_interaction = df_inter["item_id"].isin(titled_item_ids)
df_filtered = df_inter.loc[keeps_interaction, ["user_id", "item_id", "rating", "timestamp"]]

df_filtered.to_csv(output_file, sep="\t", index=False, header=False)

# Before/after statistics.
original_users = df_inter["user_id"].nunique()
original_items = df_inter["item_id"].nunique()
original_interactions = len(df_inter)

filtered_users = df_filtered["user_id"].nunique()
filtered_items = df_filtered["item_id"].nunique()
filtered_interactions = len(df_filtered)

print(f"✅ Processing completed! New dataset saved as {output_file}")
print(f"📊 Original data: users = {original_users}, items = {original_items}, interactions = {original_interactions}")
print(f"📉 Filtered data: users = {filtered_users}, items = {filtered_items}, interactions = {filtered_interactions}")
print(f"🚨 Deleted interactions: {original_interactions - filtered_interactions}")

In [None]:
import pandas as pd


# --- Step 4: filter by item popularity, per-user rating profile and user activity ---
input_file = "inter2-1.csv"
output_file = "inter2-2.csv"

# Filter thresholds (all inclusive lower bounds).
MIN_ITEM_INTERACTIONS = 2    # an item must occur more than once
MIN_FIVE_STAR_RATINGS = 3    # a user needs at least this many ratings equal to 5
MIN_FOUR_STAR_RATINGS = 2    # ... and at least this many ratings equal to 4
MIN_USER_INTERACTIONS = 10   # a user needs at least this many remaining rows

df = pd.read_csv(input_file, sep="\t", header=None, names=["user_id", "item_id", "rating", "timestamp"], dtype=str)

# Ratings are read as text. Going through float lets values written with a
# decimal point parse; astype(int) then truncates any fractional part. The
# conversion is done once, on the freshly loaded frame, so nothing is ever
# assigned into a filtered slice (which would raise SettingWithCopyWarning).
df["rating"] = df["rating"].astype(float).astype(int)

# 1) Keep items that occur at least MIN_ITEM_INTERACTIONS times.
item_counts = df["item_id"].value_counts()
valid_items = item_counts[item_counts >= MIN_ITEM_INTERACTIONS].index
df = df[df["item_id"].isin(valid_items)]

# 2) Keep users with enough 5-star and enough 4-star ratings.
# The table has one row per user and one column per rating value. reindex
# guarantees that columns 4 and 5 exist (filled with 0) even when the data
# contains no such rating, so the comparison below is always a per-user Series.
user_rating_counts = (
    df.groupby(["user_id", "rating"])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=[4, 5], fill_value=0)
)
has_enough_ratings = (
    (user_rating_counts[5] >= MIN_FIVE_STAR_RATINGS)
    & (user_rating_counts[4] >= MIN_FOUR_STAR_RATINGS)
)
valid_users = user_rating_counts[has_enough_ratings].index
df = df[df["user_id"].isin(valid_users)]

# 3) Keep users that still have at least MIN_USER_INTERACTIONS rows.
# NOTE(review): the three filters are applied once, in this order; the item
# filter is not re-checked after users have been removed.
user_interactions = df["user_id"].value_counts()
valid_users = user_interactions[user_interactions >= MIN_USER_INTERACTIONS].index
df = df[df["user_id"].isin(valid_users)]

df.to_csv(output_file, sep="\t", index=False, header=False)

num_users = df["user_id"].nunique()
num_items = df["item_id"].nunique()
num_interactions = len(df)

print(f"✅ Processing completed! New dataset saved as {output_file}")
print(f"Number of users: {num_users}")
print(f"Number of items: {num_items}")
print(f"Number of interactions: {num_interactions}")

In [None]:
import pandas as pd

# --- Step 5: restrict the item table to items in use and renumber item ids from 1 ---
inter_file = "inter2-2.csv"
item_file = "item.csv"
item_output = "itemnew.csv"
inter_output = "inter2-3.csv"
mapping_file = "item_id_mapping.txt"

df_inter = pd.read_csv(inter_file, sep="\t", header=None, names=["user_id", "item_id", "rating", "timestamp"], dtype=str)

# Only the first two columns of the item table are loaded; header=0 together
# with names= discards the file's first line and applies the names given here.
df_item = pd.read_csv(item_file, sep="\t", header=0, usecols=[0, 1], names=["item_id", "title"], dtype=str)

# Keep the items that appear in at least one interaction.
valid_items = set(df_inter["item_id"])
df_item_filtered = df_item[df_item["item_id"].isin(valid_items)].copy()

# New ids are "1", "2", ... in the order the items appear in the item table.
item_id_mapping = {old_id: str(new_id) for new_id, old_id in enumerate(df_item_filtered["item_id"].unique(), start=1)}

# Series.map yields NaN for ids that are not in the mapping. Such rows would be
# written with an empty item id, so stop here — before any file is written —
# if an interaction refers to an item that the item table does not contain.
new_inter_item_ids = df_inter["item_id"].map(item_id_mapping)
is_unmapped = new_inter_item_ids.isna()
if is_unmapped.any():
    unknown_ids = df_inter.loc[is_unmapped, "item_id"].unique()
    raise ValueError(
        f"{int(is_unmapped.sum())} interactions in {inter_file} refer to "
        f"{len(unknown_ids)} item ids missing from {item_file}, e.g. {list(unknown_ids[:5])}"
    )

df_item_filtered["item_id"] = df_item_filtered["item_id"].map(item_id_mapping)
df_item_filtered.to_csv(item_output, sep="\t", index=False, header=False)

df_inter["item_id"] = new_inter_item_ids
df_inter.to_csv(inter_output, sep="\t", index=False, header=False)

# Persist the old -> new id table so the renumbering can be traced back.
df_mapping = pd.DataFrame(list(item_id_mapping.items()), columns=["old_item_id", "new_item_id"])
df_mapping.to_csv(mapping_file, sep="\t", index=False, header=False)

num_original_items = df_item.shape[0]
num_filtered_items = df_item_filtered.shape[0]
num_interactions = len(df_inter)

print("✅ Processing completed!")
print(f"📂 New item file: {item_output} (total {num_filtered_items} items)")
print(f"📂 New inter2 file: {inter_output} (total {num_interactions} interactions)")
print(f"📂 item_id mapping table: {mapping_file}")
print(f"🔄 Original item.csv number of items: {num_original_items}")
print(f"🔍 Retained item number: {num_filtered_items}")
print(f"📊 Interaction number: {num_interactions}")

In [None]:
import pandas as pd

# --- Step 6: renumber user ids from 1 and save the old -> new table ---
inter_file = "inter2-3.csv"
output_file = "inter2-4.csv"
user_mapping_file = "user_id_mapping.txt"

df_inter = pd.read_csv(
    inter_file,
    sep="\t",
    header=None,
    names=["user_id", "item_id", "rating", "timestamp"],
    dtype=str,
)

# unique() returns values in order of first appearance, so the new ids
# "1", "2", ... follow the order in which users first show up in the file.
distinct_user_ids = df_inter["user_id"].unique()
new_user_ids = (str(position) for position in range(1, len(distinct_user_ids) + 1))
user_id_mapping = dict(zip(distinct_user_ids, new_user_ids))

# Replace every original user id by its new sequential id and save the table.
df_inter["user_id"] = df_inter["user_id"].map(user_id_mapping)
df_inter.to_csv(output_file, sep="\t", index=False, header=False)

# One row per user: original id in the first column, new id in the second.
mapping_rows = list(user_id_mapping.items())
df_mapping = pd.DataFrame(mapping_rows, columns=["old_user_id", "new_user_id"])
df_mapping.to_csv(user_mapping_file, sep="\t", index=False, header=False)

num_original_users = len(user_id_mapping)
num_interactions = df_inter.shape[0]

for report_line in (
    f"✅ Processing completed!",
    f"📂 New inter file: {output_file} (total {num_interactions} interactions)",
    f"📂 user_id mapping table: {user_mapping_file}",
    f"🔄 Original number of users: {num_original_users}",
    f"📊 Number of interactions: {num_interactions}",
):
    print(report_line)

In [None]:
import pandas as pd

# --- Step 7: order the interaction table by numeric user id ---
inter_file = "inter2-4.csv"
output_file = "inter2-5.csv"

# No dtype is forced here, so pandas infers the column types on load.
df_inter = pd.read_csv(inter_file, sep="\t", header=None, names=["user_id", "item_id", "rating", "timestamp"])

# Make sure user ids sort numerically (1, 2, 10) rather than as text.
df_inter["user_id"] = df_inter["user_id"].astype(int)

# The default sort algorithm (quicksort) is not stable, i.e. rows that share a
# user id may come out in any order. mergesort is stable, so every user's
# interactions keep the relative order they have in the input file and the
# output is reproducible from run to run.
df_sorted = df_inter.sort_values(by="user_id", ascending=True, kind="mergesort")

df_sorted.to_csv(output_file, sep="\t", index=False, header=False)

num_users = df_sorted["user_id"].nunique()
num_interactions = len(df_sorted)

print("✅ Processing completed!")
print(f"📂 New inter file: {output_file} (total {num_interactions} interactions)")
print(f"🔍 Number of users: {num_users}")
print(f"📊 Number of interactions: {num_interactions}")

In [None]:
# --- Step 8: convert the item table from tab-separated to pipe-separated ---
input_file = "itemnew.csv"
output_file = "itemnew-processed.csv"

# Translation table that turns every tab character into "|".
tab_to_pipe = str.maketrans("\t", "|")

# The whole file is read into memory first; the output file is only opened
# once the input has been read and converted.
with open(input_file, "r", encoding="utf-8") as f:
    content = f.read().translate(tab_to_pipe)

# NOTE(review): a "|" that already occurs inside a field is left as is, so such
# a field becomes indistinguishable from a separator — confirm titles never
# contain "|".
with open(output_file, "w", encoding="utf-8") as f:
    f.write(content)

print(f"✅ Processing completed! New file saved as {output_file}")