In [None]:
import pandas as pd
import json
import re
import ast
import random
import datetime
import pickle
from collections import defaultdict
from tqdm import tqdm
from pathlib import Path

### IMDB reviews

In [None]:
# Load the raw IMDB user reviews. Later cells iterate `data` as a mapping of
# user id -> list of review records.
data_dir = '../data/common/imdb'
# JSON is UTF-8 by specification, so pin the encoding instead of relying on
# the platform's locale default (which is not UTF-8 everywhere).
with open(f'{data_dir}/user-reviews-1079.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Output directory for the IMDB artifacts; created on demand.
save_dir = Path('../data/t1_items/imdb')
save_dir.mkdir(exist_ok=True, parents=True)

In [None]:
# Total number of reviews summed over every user in `data`.
n_reviews = sum(len(user_reviews) for user_reviews in data.values())
print(n_reviews)

In [None]:
# Build one example per IMDB user: after shuffling the user's reviews, the
# first IMDB_N_ANCHORS become the anchor text and the movie titles of the
# remaining reviews become the prediction targets.
IMDB_N_ANCHORS = 10      # number of reviews shown as context per user
IMDB_SHUFFLE_SEED = 0    # fixed so the exported split is identical on every run

# A dedicated, seeded generator keeps the split reproducible regardless of
# what else has consumed the global `random` state.
imdb_rng = random.Random(IMDB_SHUFFLE_SEED)

processed = []
title2count = defaultdict(int)  # how often each title occurs as a target

for u_id, reviews in data.items():

    # Need every anchor slot filled plus at least one title left to predict.
    if len(reviews) <= IMDB_N_ANCHORS:
        continue

    # Shuffle a copy: shuffling `reviews` itself would reorder `data` in
    # place, so re-running this cell would start from a different ordering.
    shuffled = list(reviews)
    imdb_rng.shuffle(shuffled)

    anchor_reviews = shuffled[:IMDB_N_ANCHORS]
    target_reviews = shuffled[IMDB_N_ANCHORS:]

    # One "<movie title>: <review title>" line per anchor review; every line
    # (including the last) ends with a newline.
    anchor_str = "".join(
        f"{r['movie_title']}: {r['review_title']}\n" for r in anchor_reviews)

    # Everything after the anchors is what has to be predicted.
    test_answer = [r['movie_title'] for r in target_reviews]
    for item in test_answer:
        title2count[item] += 1

    processed.append({
        "anchor_str": anchor_str,
        "test_answer": test_answer})

processed_df = pd.DataFrame(processed)

In [None]:
# Peek at the first five processed IMDB examples.
processed_df.head(n=5)

In [None]:
# (number of examples, number of distinct target titles)
processed_df.shape[0], len(title2count)

In [None]:
# Persist the examples as CSV and the target-title counts as indented JSON.
processed_df.to_csv(save_dir / 'processed.csv')
(save_dir / 'title2count.json').write_text(json.dumps(title2count, indent=4))

### Redial 

In [None]:
# Load the ReDial training split: a JSON-lines file, one record per line.
data_dir = '../data/common/redial'
# JSON is UTF-8 by specification, so pin the encoding instead of relying on
# the platform's locale default.
with open(f'{data_dir}/train_data.jsonl', 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file]

# Output directory for the ReDial artifacts; created on demand.
save_dir = Path('../data/t1_items/redial')
save_dir.mkdir(exist_ok=True, parents=True)

In [None]:

# Build one example per ReDial record: the first two usable movie titles are
# the anchors and the remaining ones are the prediction targets.
processed = []
title2count = defaultdict(int)  # how often each title occurs as a target

for d in data:

    mid2mtitle = d['movieMentions']
    mid2context = d['respondentQuestions']

    # Keep movies whose context is not flagged as `suggested`
    # (presumably the ones the seeker brought up themselves - confirm against
    # the ReDial schema). Empty/None titles are dropped here, *before* the
    # length check, so an example can never end up with fewer than two
    # anchors or with an empty title among its targets.
    mtitles = [
        mid2mtitle[mid]
        for mid in mid2context
        if not mid2context[mid]['suggested'] and mid2mtitle[mid]
    ]

    # Need two anchors plus at least one title left to predict.
    if len(mtitles) < 3:
        continue

    anchor_titles = mtitles[:2]
    test_answer = mtitles[2:]
    for mtitle in test_answer:
        title2count[mtitle] += 1

    processed.append({
        "anchor_str": ' and '.join(anchor_titles),
        "test_answer": test_answer})

processed_df = pd.DataFrame(processed)

In [None]:
# (number of examples, number of distinct target titles)
processed_df.shape[0], len(title2count)

In [None]:
# Persist the examples as CSV and the target-title counts as indented JSON.
processed_df.to_csv(save_dir / 'processed.csv')
(save_dir / 'title2count.json').write_text(json.dumps(title2count, indent=4))

### Reddit

In [None]:
# Get processed data from t4_requests
df = pd.read_csv('../data/t4_requests/requests.csv')

save_dir = Path('../data/t1_items/reddit')
save_dir.mkdir(exist_ok=True, parents=True)

In [None]:
# Peek at the first two rows of the requests table.
df.head(n=2)

In [None]:
# `extracted_names` holds Python literals stored as text; parse each one back
# into an object, one entry per submission.
movies_per_submission = [
    ast.literal_eval(names) for names in df['extracted_names']
]

In [None]:
# Number of extracted movies in each submission.
lengths = [len(movies) for movies in movies_per_submission]

# Histogram: movie count -> number of submissions with that many movies.
# Stays a defaultdict so that unseen counts read as 0.
length2count = defaultdict(int)
for n_movies in lengths:
    length2count[n_movies] = length2count[n_movies] + 1

In [None]:
# Print the histogram for submissions with 0..19 extracted movies.
for n_movies in range(0, 20):
    print(n_movies, length2count[n_movies])

In [None]:
# Build one example per Reddit submission: the first two extracted movies are
# the anchors and the rest are the prediction targets.
processed = []
title2count = defaultdict(int)

for _, row in df.iterrows():

    movies = ast.literal_eval(row['extracted_names'])

    # Need two anchors plus at least one title left to predict.
    if len(movies) <= 2:
        continue

    # NOTE(review): every movie of the submission is counted here, anchors
    # included, whereas the IMDB and Redial cells count only the target
    # titles - confirm this difference is intended.
    for title in movies:
        title2count[title] += 1

    processed.append(dict(
        utc_time=row['utc_time'],  # only this dataset carries a timestamp
        anchor_str=' and '.join(movies[:2]),
        test_answer=movies[2:],
    ))

processed_df = pd.DataFrame(processed)

In [None]:
# (number of examples, number of distinct counted titles)
processed_df.shape[0], len(title2count)

In [None]:
# Show the processed Reddit examples as this cell's output.
processed_df

In [None]:
# Persist the examples as CSV and the title counts as indented JSON.
processed_df.to_csv(save_dir / 'processed.csv')
(save_dir / 'title2count.json').write_text(json.dumps(title2count, indent=4))