In [1]:
# Mount Google Drive inside the Colab runtime so the dataset can be opened by path.
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

# Location of the raw reviews file on the mounted drive.
PATH_OF_MOVIES = '/content/drive/MyDrive/movies.txt'

Mounted at /content/drive


In [3]:
import pandas as pd
from collections import defaultdict
import html
from html import escape


file_path = PATH_OF_MOVIES
sample_file_path = '/content/drive/MyDrive/movies_samples.csv'
# Initialize variables
parsed_data = []         # one normalized dict per parsed review
user_review_counts = {}  # review/userId -> number of reviews seen so far
record_count = 0
max_records = 800000  # adjust as needed


def _store_record(record):
    """Count one finished record for its user and append it to ``parsed_data``.

    ``record`` maps the raw ``key: value`` field names read from the file to
    their stripped string values. Records without a user id are still
    appended, but are not counted in ``user_review_counts``.
    """
    user_id = record.get('review/userId', None)
    if user_id:
        user_review_counts[user_id] = user_review_counts.get(user_id, 0) + 1
    parsed_data.append({
        "review/userId": record.get('review/userId'),
        # A missing score defaults to 0.0; a non-numeric one raises ValueError.
        "review/score": float(record.get('review/score', 0)),
        "review/summary": record.get('review/summary'),
        "review/text" : record.get('review/text'),
        # The product id is stored under a "review/" key in the output.
        "review/productId" : record.get('product/productId'),
        "review/time": record.get('review/time')
    })


# Step 1: Parse the file and accumulate review counts
# The file is read as blocks of "key: value" lines; a blank line ends a block.
with open(file_path, 'r', encoding='ISO-8859-1') as file:
    record = {}
    for line in file:
        line = line.strip()
        if not line:
            if record:
                _store_record(record)
                record = {}
                record_count += 1
                # Stop early once the configured number of records is reached.
                if record_count >= max_records:
                    break
        elif ":" in line:
            # Split on the first colon only, so values may contain colons.
            key, value = line.split(":", 1)
            record[key.strip()] = value.strip()
    # A final record that is followed by EOF instead of a blank line would
    # otherwise be dropped. After an early `break`, `record` is already empty,
    # so this never stores a record beyond max_records.
    if record:
        _store_record(record)
        record = {}
        record_count += 1

# Step 2: Identify users with exactly 10 reviews
users_with_10_reviews = {user for user, count in user_review_counts.items() if count == 10}

print(f"Number of users with 10 reviews: {len(users_with_10_reviews)}")

# Step 3: Filter the parsed data for these users
filtered_data = [r for r in parsed_data if r.get('review/userId') in users_with_10_reviews]

# Step 4: Convert to DataFrame
# Naming the columns (in the same order as the parsed dicts) means the sort and
# de-duplication below do not raise KeyError when filtered_data is empty.
df = pd.DataFrame(
    filtered_data,
    columns=[
        'review/userId',
        'review/score',
        'review/summary',
        'review/text',
        'review/productId',
        'review/time',
    ],
)

# review/time is still text at this point, so sorting it directly is
# lexicographic ("939772800" would come after "1182729600"). Sort on numeric
# values instead. The numeric column is adopted only when every non-null value
# converts cleanly, so a non-numeric time format is left untouched.
review_time_numeric = pd.to_numeric(df['review/time'], errors='coerce')
if review_time_numeric.notna().sum() == df['review/time'].notna().sum():
    df['review/time'] = review_time_numeric
df = df.sort_values(by=['review/userId', 'review/time'])

# Drop duplicates before saving
# Duplicates are matched across the whole frame (not per user); the first row
# in the sorted order above is the one kept.
df = df.drop_duplicates(subset=['review/summary'])
df = df.drop_duplicates(subset=['review/text'])
df.to_csv(sample_file_path, sep='\t', index=False, encoding='ISO-8859-1')
print(f"Filtered sample file created (deduplicated): {sample_file_path}")

# Reload the deduplicated sample from disk, ordered by user and then by review time.
data = pd.read_csv(sample_file_path, encoding='ISO-8859-1', sep='\t').sort_values(
    by=['review/userId', 'review/time']
)

# Reviews scored at or above this value count as "preferred".
preference_threshold = 4.0

# user id -> list of "<productId>: <summary>" strings, in the row order of `data`.
user_preferences = defaultdict(list)
user_unpreferences = defaultdict(list)

review_fields = zip(
    data['review/userId'],
    data['review/productId'],
    data['review/summary'],
    data['review/score'],
)
for user_id, product_id, summary, rating in review_fields:
    movie_info = f"{product_id}: {summary}"
    # Anything below the threshold goes to the "unpreferred" list.
    bucket = user_preferences if rating >= preference_threshold else user_unpreferences
    bucket[user_id].append(movie_info)

# One instruction/input/output example per eligible user.
tallrec_data = []

for user_id, group in data.groupby('review/userId'):
    # Only users with at least one preferred and one unpreferred review are used.
    if not user_preferences[user_id] or not user_unpreferences[user_id]:
        continue

    # The user's last review by time is the prediction target.
    target_row = group.sort_values(by='review/time').iloc[-1]
    example_movie = f"{target_row['review/productId']}: {target_row['review/summary']}"

    liked = user_preferences[user_id]
    disliked = user_unpreferences[user_id]
    output_label = "Yes." if example_movie in liked else "No."

    # Keep the target out of the history that is shown in the prompt.
    # NOTE(review): the emptiness check above runs before this removal, so one
    # of the two history lists can end up empty in the prompt -- confirm that
    # this is intended.
    if example_movie in liked:
        liked.remove(example_movie)
    if example_movie in disliked:
        disliked.remove(example_movie)

    # dict.fromkeys removes repeats while keeping first-seen (time) order.
    # list(set(...)) would order the strings differently from one interpreter
    # run to the next, making the generated file irreproducible.
    unique_prefs = list(dict.fromkeys(liked))
    unique_unprefs = list(dict.fromkeys(disliked))

    example = {
        "instruction": html.unescape("Given the user's preference and unpreference, identify whether the user will like the target movie by answering \"Yes.\" or \"No.\"."),
        # HTML entities in the review summaries are decoded for the prompt text.
        "input": html.unescape(
            f"User Preference: {', '.join(unique_prefs)}\n"
            f"User Unpreference: {', '.join(unique_unprefs)}\n"
            f"Whether the user will like the target movie \"{example_movie}\"?"
        ),
        "output": output_label
    }

    tallrec_data.append(example)

# Build the frame of examples and drop exact repeats before writing it out.
tallrec_df = (
    pd.DataFrame(tallrec_data)
    .reset_index(drop=True)
    # Deduplicate the final tallrec data
    .drop_duplicates(subset=['instruction', 'input', 'output'])
)
# One JSON object per line, written to the current working directory.
tallrec_df.to_json('tallrec_formatted_data.json', orient='records', lines=True)

print("TallRec-compatible data has been saved to 'tallrec_formatted_data.json'.")

# Split into train, valid, test
!pip install jsonlines
import json
import jsonlines

tallrec_file_path = "tallrec_formatted_data.json"

data = []
with jsonlines.open(tallrec_file_path, 'r') as reader:
    for obj in reader:
        data.append(obj)

total_interactions = len(data)
print(f"Total interactions: {total_interactions}")

train_end = int(0.8 * total_interactions)
val_end = train_end + int(0.1 * total_interactions)

train_data = data[:train_end]
validation_data = data[train_end:val_end]
test_data = data[val_end:]

with open('train_movie.json', 'w') as file:
    json.dump(train_data, file, indent=2)

with open('valid_movie.json', 'w') as file:
    json.dump(validation_data, file, indent=2)

with open('test_movie.json', 'w') as file:
    json.dump(test_data, file, indent=2)

print("Data has been split and saved into 'train_movie.json', 'valid_movie.json', and 'test_movie.json'.")


Number of users with 10 reviews: 1031
Filtered sample file created (deduplicated): /content/drive/MyDrive/movies_samples.csv
TallRec-compatible data has been saved to 'tallrec_formatted_data.json'.
Total interactions: 686
Data has been split and saved into 'train_movie.json', 'valid_movie.json', and 'test_movie.json'.
