In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive. force_remount=True asks Colab to
# redo the mount even if the drive is already mounted in this session.
drive.mount('/content/drive', force_remount = True)
# Path (on the mounted drive) of the raw review text file that the parsing
# step later in this notebook reads record by record.
PATH_OF_MOVIES = '/content/drive/MyDrive/movies.txt'

Mounted at /content/drive


In [None]:
import pandas as pd
from collections import defaultdict
import html
from html import escape
!pip install jsonlines
import jsonlines
import json

# NOTE: stray path fragment left in the cell (not code): /EC523DL/Final_project/TALLRec/evaluate.py

file_path = PATH_OF_MOVIES
sample_file_path = '/content/drive/MyDrive/EC523DL/Final_project/TALLRec/data/amazon/movies_samples.csv'
# Initialize variables
parsed_data = []
record_count = 0
max_records = 800000  # adjust as needed


def _record_to_row(record):
    """Project one raw ``key: value`` record onto the columns kept downstream.

    Missing fields become ``None`` (``review/score`` defaults to 0.0).
    Raises ``ValueError`` if the score field is present but not numeric.
    """
    return {
        "review/userId": record.get('review/userId'),
        "review/score": float(record.get('review/score', 0)),
        "review/summary": record.get('review/summary'),
        "review/text": record.get('review/text'),
        # The source key is "product/productId"; it is stored under a
        # "review/" column name, which the later steps rely on.
        "review/productId": record.get('product/productId'),
        "review/time": record.get('review/time'),
    }


# Step 1: Parse the file and accumulate data.
# Records are runs of "key: value" lines separated by blank lines.
with open(file_path, 'r', encoding='ISO-8859-1') as file:
    record = {}
    for line in file:
        line = line.strip()
        if not line:
            # A blank line terminates the current record (if any).
            if record:
                parsed_data.append(_record_to_row(record))
                record = {}
                record_count += 1
                if record_count >= max_records:
                    break
        elif ":" in line:
            # Split on the first colon only: values may contain colons.
            key, value = line.split(":", 1)
            record[key.strip()] = value.strip()
    # Flush the final record when the file does not end with a blank line.
    # After hitting max_records `record` is empty, so the cap is respected.
    if record:
        parsed_data.append(_record_to_row(record))
        record_count += 1

# Step 2: Convert parsed data to DataFrame
df = pd.DataFrame(parsed_data)

# Step 3: Drop duplicates (first on summary, then on full text).
# NOTE(review): de-duplicating on the summary alone also removes distinct
# reviews that merely share a short summary -- confirm this is intended.
df = df.drop_duplicates(subset=['review/summary'])
df = df.drop_duplicates(subset=['review/text'])

# Step 4: Recalculate review counts after removing duplicates
user_review_counts = df['review/userId'].value_counts()


# Step 5: Filter for users with more than 10 and at most 50 reviews
users_with_10_to_50_reviews = user_review_counts[(user_review_counts > 10) & (user_review_counts <= 50)].index
filtered_data = df[df['review/userId'].isin(users_with_10_to_50_reviews)]

# Step 6: Save to CSV, ordered by user and then chronologically.
# 'review/time' is still a string here, so sort on a numeric copy; a plain
# string sort would place 10-digit timestamps before 9-digit ones.
_time_key = pd.to_numeric(filtered_data['review/time'], errors='coerce')
filtered_data = (
    filtered_data.assign(_time_key=_time_key)
    .sort_values(by=['review/userId', '_time_key'])
    .drop(columns='_time_key')
)
filtered_data.to_csv(sample_file_path, sep='\t', index=False, encoding='ISO-8859-1')

print(f"Number of users with more than 10 and at most 50 reviews: {len(users_with_10_to_50_reviews)}")
print(f"Filtered sample file created: {sample_file_path}")


# Now read the deduplicated file
data = pd.read_csv(sample_file_path, encoding='ISO-8859-1', sep='\t')
data = data.sort_values(by=['review/userId', 'review/time'])

# Define rating threshold: a score at or above it counts as "liked".
preference_threshold = 4.0

_INSTRUCTION = html.unescape("Given the user's preference and unpreference, identify whether the user will like the target movie by answering \"Yes.\" or \"No.\".")


def _build_example(liked_history, disliked_history, target_movie, label):
    """Return one prompt/answer example for a single held-out target movie.

    liked_history / disliked_history: lists of "productId: summary" strings.
    target_movie: product id of the held-out review (no summary, so the
        review text cannot give the answer away).
    label: "Yes." or "No.".
    """
    return {
        "instruction": _INSTRUCTION,
        "input": html.unescape(
            f"User Preference: {', '.join(liked_history)}\n"
            f"User Unpreference: {', '.join(disliked_history)}\n"
            f"Whether the user will like the target movie \"{target_movie}\"?"
        ),
        "output": label,
    }


# Per-user history lists of "productId: summary" strings. For users that end
# up in the split these hold only the history, without the held-out reviews.
user_preferences = defaultdict(list)
user_unpreferences = defaultdict(list)

train_data = []
val_data = []
test_data = []

# Row labels of every held-out review; dropped from `data` once, after the loop.
held_out_index = []

for user_id, group in data.groupby('review/userId'):
    # Sort user group by time
    user_group = group.sort_values(by='review/time')

    # (history string, liked?) for each review, oldest first.
    entries = [
        (f"{product_id}: {summary}", score >= preference_threshold)
        for product_id, summary, score in zip(
            user_group['review/productId'],
            user_group['review/summary'],
            user_group['review/score'],
        )
    ]
    for movie_info, liked in entries:
        bucket = user_preferences if liked else user_unpreferences
        bucket[user_id].append(movie_info)

    # Skip users who lack either a liked or a disliked movie overall.
    if not user_preferences[user_id] or not user_unpreferences[user_id]:
        continue

    # Ensure there are at least three entries
    if len(entries) < 3:
        continue

    # The three most recent reviews become the train / validation / test
    # targets (in that order); everything older is the prompt history.
    history = entries[:-3]
    user_preferences[user_id] = [info for info, liked in history if liked]
    user_unpreferences[user_id] = [info for info, liked in history if not liked]
    held_out_index.extend(user_group.index[-3:])

    # De-duplicate while keeping chronological order (set() would make the
    # prompt order depend on string hashing).
    unique_prefs = list(dict.fromkeys(user_preferences[user_id]))
    unique_unprefs = list(dict.fromkeys(user_unpreferences[user_id]))

    # The label comes from the held-out review's own score. Looking the bare
    # product id up in the "productId: summary" history never matches.
    targets = [
        (f"{product_id}", "Yes." if liked else "No.")
        for product_id, (_, liked) in zip(
            user_group['review/productId'].iloc[-3:], entries[-3:]
        )
    ]
    (train_movie, train_output_label), (val_movie, val_output_label), (test_movie, test_output_label) = targets

    train_data.append(_build_example(unique_prefs, unique_unprefs, train_movie, train_output_label))
    test_data.append(_build_example(unique_prefs, unique_unprefs, test_movie, test_output_label))
    val_data.append(_build_example(unique_prefs, unique_unprefs, val_movie, val_output_label))

# Drop all held-out reviews from the data in a single pass.
data = data.drop(index=held_out_index)

# Save train, validation, and test datasets
with open('train_movie.json', 'w') as file:
    json.dump(train_data, file, indent=2)

with open('test_movie.json', 'w') as file:
    json.dump(test_data, file, indent=2)

with open('valid_movie.json', 'w') as file:
    json.dump(val_data, file, indent=2)

print("Data has been split and saved into 'train_movie.json', 'test_movie.json', and 'valid_movie.json'.")


Streaming output truncated to the last 5000 lines.
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to preference
appended to pre