In [1]:
import pandas as pd
from collections import defaultdict

In [12]:
# --- Notebook configuration -------------------------------------------------
# Number of rows per processing chunk.
chunk_size = 10_000

# Raw review dump that the parsing step reads.
file_path = "C:/Users/leekh/OneDrive/Documents/movies.txt"

# Destination of the filtered review sample written by the sampling step.
sample_file_path = "C:/Users/leekh/OneDrive/Documents/movies_samples.csv"

In [19]:
# Step 1: Parse the file and accumulate review counts.
#
# The file is read as blank-line-separated records made of "key: value"
# lines; non-empty lines without a colon are ignored.
parsed_data = []         # one dict per completed record, in file order
user_review_counts = {}  # review/userId -> number of records seen for that user
record_count = 0         # number of completed records processed so far
max_records = 100000  # Stop after processing 100,000 reviews (adjust as needed)


def _store_record(record):
    """Append one completed record to parsed_data and update its user's count.

    Records with no (or an empty) review/userId are still stored, but they do
    not contribute to user_review_counts. A missing review/score is stored as
    0.0; a non-numeric score raises ValueError.
    """
    user_id = record.get('review/userId')
    if user_id:
        user_review_counts[user_id] = user_review_counts.get(user_id, 0) + 1
    parsed_data.append({
        "review/userId": user_id,
        "review/score": float(record.get('review/score', 0)),
        "review/summary": record.get('review/summary'),
        # NOTE(review): the original comment assumed review/text holds the
        # movie title; it looks like the review body - confirm against the
        # dataset schema.
        "review/text": record.get('review/text'),
    })


with open(file_path, 'r', encoding='ISO-8859-1') as file:
    record = {}
    for line in file:
        line = line.strip()  # Remove extra whitespace
        if line:
            if ":" in line:
                # Split on the first colon only so values may contain colons.
                key, value = line.split(":", 1)
                record[key.strip()] = value.strip()
            continue

        # Blank line: end of the current record (if any fields were collected).
        if not record:
            continue
        _store_record(record)
        record = {}
        record_count += 1
        if record_count >= max_records:
            break  # Stop after processing max_records

    # A file that does not end with a blank line leaves its last record
    # pending; save it instead of silently dropping it. After an early break
    # `record` is empty, so the max_records cap is still respected.
    if record:
        _store_record(record)
        record_count += 1

In [20]:
# Step 2: Identify users with more than 50 reviews
users_with_more_than_50_reviews = {user for user, count in user_review_counts.items() if count > 50}
print(f"Number of users with more than 50 reviews: {len(users_with_more_than_50_reviews)}")

# Step 3: Filter the parsed data for these users
filtered_data = [record for record in parsed_data if record.get('review/userId') in users_with_more_than_50_reviews]

# Step 4: Convert to a DataFrame and save.
# The columns are listed explicitly so the frame keeps its schema even when no
# user passes the filter; without them an empty frame has no 'review/userId'
# column and the sort below raises KeyError.
df = pd.DataFrame(
    filtered_data,
    columns=['review/userId', 'review/score', 'review/summary', 'review/text'],
)

df = df.sort_values(by='review/userId')

# Save a smaller sample to a new file: a seeded random sample of 10,000 rows,
# or the full DataFrame if it is not larger than that.
sample_df = df.sample(n=10000, random_state=42) if len(df) > 10000 else df
# NOTE: despite the .csv extension the file is tab-separated and encoded as
# ISO-8859-1, so it must be read back with sep='\t' and the same encoding.
sample_df.to_csv(sample_file_path, sep='\t', index=False, encoding='ISO-8859-1')

print(f"Filtered sample file created: {sample_file_path}")

Number of users with more than 50 reviews: 38
Filtered sample file created: C:/Users/leekh/OneDrive/Documents/movies_samples.csv


In [None]:
# NOTE(review): this rebinds `file_path`, which the parsing cell uses for the
# raw review dump. Re-run the configuration cell before re-running the parser.
file_path = r'C:/Users/leekh/OneDrive/Documents/movies_samples.csv'

# The sample file is written tab-separated in ISO-8859-1, so it has to be read
# the same way; with the read_csv defaults the columns are not split and the
# 'review/userId' lookup below fails.
data = pd.read_csv(
    file_path,
    sep='\t',
    encoding='ISO-8859-1',
    dtype={'review/userId': str, 'review/summary': str},
)

# Define the rating threshold: scores at or above it count as a preference.
preference_threshold = 4.0

# Create dictionaries to hold preferences and unpreferences (user id -> titles)
user_preferences = defaultdict(list)
user_unpreferences = defaultdict(list)

# Rows with no user id or no summary cannot contribute a title; a missing
# summary would otherwise surface as NaN and break the ', '.join below.
rated = data.dropna(subset=['review/userId', 'review/summary'])

# Populate preferences and unpreferences based on rating.
# NOTE(review): 'review/summary' is used as the movie title here - confirm
# that this field really identifies the movie.
for user_id, movie_title, rating in zip(
    rated['review/userId'], rated['review/summary'], rated['review/score']
):
    if rating >= preference_threshold:
        user_preferences[user_id].append(movie_title)
    else:
        user_unpreferences[user_id].append(movie_title)

# Construct TallRec-compatible structure
tallrec_data = []

for user_id, liked in user_preferences.items():
    disliked = user_unpreferences.get(user_id, [])
    # Skip users without both preference and unpreference lists
    if not liked or not disliked:
        continue

    # TODO(review): the target movie and the "Yes." label are placeholders, so
    # every example currently carries the same target and answer.
    example = {
        "instruction": "Given the user's preference and unpreference, identify whether the user will like the target movie by answering \"Yes.\" or \"No.\".",
        "input": f"User Preference: {', '.join(liked)}\nUser Unpreference: {', '.join(disliked)}\nWhether the user will like the target movie \"Example Movie\"?",
        "output": "Yes."  # Or dynamically determine based on trends
    }

    tallrec_data.append(example)

# Convert to a DataFrame and export as JSON Lines (one record per line)
tallrec_df = pd.DataFrame(tallrec_data)
tallrec_df.to_json('tallrec_formatted_data.json', orient='records', lines=True)

print("TallRec-compatible data has been saved to 'tallrec_formatted_data.json'.")