In [None]:
import pandas as pd
import json
import re
import ast
import random
from tqdm import tqdm
from pathlib import Path

In [None]:
# Load the lookup that maps `tt<digits>` ids to movie names.
# The file is decoded as UTF-8 explicitly so that non-ASCII titles are read the
# same way on every platform instead of depending on the locale's default.
with open('../data/common/reddit/id2name.json', 'r', encoding='utf-8') as file:
    id2name = json.load(file)

In [None]:
# Read every dataset split and stack them into a single frame with a fresh
# 0..n-1 index.
dsplits = ['test', 'valid', 'train']

split_dfs = [pd.read_csv(f'../data/common/reddit/{dsplit}.csv') for dsplit in dsplits]

df = pd.concat(split_dfs, ignore_index=True)

In [None]:
# extract movie names
def extract_movie_names(row, mapping=None):
    """Return the distinct movie names referenced by `tt<digits>` ids in `row`.

    Args:
        row: Text to scan. Non-string values (e.g. NaN from an empty CSV cell)
            yield an empty list instead of raising inside `re.findall`.
        mapping: Optional dict from id to movie name. When omitted, the
            notebook-level `id2name` lookup is used.

    Returns:
        List of movie names in order of first appearance, without duplicates.
        Ids that are missing from the mapping are skipped.
    """
    if not isinstance(row, str):
        return []
    lookup = id2name if mapping is None else mapping
    ids = re.findall(r"tt\d+", row)
    # dict.fromkeys de-duplicates while keeping first-seen order. list(set(...))
    # returns the names in an order that can change between interpreter runs,
    # which leaks into every CSV derived from this column.
    return list(dict.fromkeys(lookup[i] for i in ids if i in lookup))

# Resolve the ids mentioned in each processed utterance to a list of movie names.
processed_text = df['processed']
df['extracted_names'] = processed_text.apply(extract_movie_names)

In [None]:
# Preview the combined frame, including the new `extracted_names` column.
df.head()

In [None]:
# remove downvoted
# Rows with a negative `upvotes` value are dropped; a missing value is treated
# as 1, so those rows are kept.
# `.copy()` makes df2 an independent frame: later cells add columns to it, and
# doing that on a bare boolean-mask slice of `df` raises SettingWithCopyWarning.
df2 = df[df['upvotes'].fillna(1) >= 0].copy()

In [None]:
# Convert the epoch-seconds `utc_time` column to timezone-aware UTC datetimes.
# `assign` returns a new frame instead of writing into a slice of `df`
# (no SettingWithCopyWarning). `utc=True` replaces the separate `tz_localize`
# call, which raises on already tz-aware data and made the cell fail on re-run.
df2 = df2.assign(utc_time=pd.to_datetime(df2['utc_time'], unit='s', utc=True))

In [None]:
# Preview the frame after the `utc_time` conversion.
df2.head()

In [None]:
# knowledge cutoff 2022, 2023: drop every row posted in either of those years
posted_in_cutoff_years = df2['utc_time'].dt.year.isin([2022, 2023])
df2 = df2.loc[~posted_in_cutoff_years]

# remove comments without movie mentions (seeker rows are always kept)
mentions_movie = df2['extracted_names'].apply(lambda names: len(names) > 0)
df2 = df2.loc[df2['is_seeker'] | mentions_movie]

# request must be about movies
# (the text check runs on every remaining row, comments included)
about_movies = df2['processed'].astype(str).str.contains('movie|Movie|film|Film')
df2 = df2.loc[about_movies]

# written by seeker but actually it's a comment
seeker_follow_up = df2['is_seeker'] & (df2['turn_order'] != 0)
df2 = df2.loc[~seeker_follow_up]

# consider only head comments
is_head_turn = df2['turn_order'] < 2
df2 = df2.loc[is_head_turn]

In [None]:
# Row counts before (df) and after (df2) filtering.
len(df), len(df2)

In [None]:
# Preview the rows that survived the filters.
df2.head()

In [None]:
# Derive the submission id by dropping the last `_`-separated token of `conv_id`.
# df2 is a boolean-mask slice at this point, so the column is added with
# `assign` (returns a new frame) rather than by item assignment, which would
# raise SettingWithCopyWarning.
df2 = df2.assign(sub_id=df2['conv_id'].str.rsplit('_', n=1).str[0])

In [None]:
# Report the number of distinct `sub_id` and distinct `conv_id` values.
# NOTE(review): the second figure is labelled "utterances" but counts distinct
# conv_id values, not rows — confirm the label is what is meant.
print(f"{len(set(df2['sub_id']))} submissions, {len(set(df2['conv_id']))} utterances") 

In [None]:
# Preview the first ten rows, now including `sub_id`.
df2.head(10)

In [None]:
# Distinct `raw` values of the opening seeker turns (set order is arbitrary).
is_opening_request = df2['is_seeker'] & (df2['turn_order'] == 0)
requests = list(set(df2[is_opening_request]['raw']))

In [None]:
# Number of distinct raw request strings.
len(requests)

In [None]:
# Show ten of the raw request strings.
requests[:10]

### Prepare requests data

In [None]:
# Keep only the opening turn written by the seeker.
# `.copy()` so that adding the `request` column below does not write into a
# slice of `df2` (SettingWithCopyWarning).
req_df = df2[df2['is_seeker'] & (df2['turn_order'] == 0)].copy()

# Take the text after the last "'USER'," marker in `raw`, then keep what
# precedes the last occurrence of the second separator.
# NOTE(review): `Series.str.rsplit` splits on a literal string, not a regex, so
# the pattern below is the six literal characters backslash-backslash-n twice.
# Confirm that this is how a blank line is escaped inside `raw`.
req_df['request'] = (
    req_df['raw']
    .str.split(r"'USER',\s*").str[-1]
    .str.rsplit(r'\\n\\n', n=1).str[0]
)

# One row per submission (first occurrence wins); the list of movie names is
# flattened into a comma-separated string. Built as a single chain so that no
# column is assigned on the frame returned by `drop_duplicates`.
requests_df = (
    req_df[['sub_id', 'request', 'extracted_names']]
    .rename(columns={'extracted_names': 'movies_str'})
    .drop_duplicates(subset=['sub_id'], keep='first')
    .assign(
        movies_str=lambda frame: frame['movies_str'].apply(
            lambda x: ', '.join(x) if isinstance(x, list) else x
        )
    )
)

In [None]:
# Number of requests after de-duplicating on `sub_id`.
len(requests_df)

In [None]:
# Preview the requests table (sub_id, request, movies_str).
requests_df.head()

In [None]:
# Write the de-duplicated requests table to disk, creating the folder if needed.
save_dir = Path('../data/t4_requests/')
save_dir.mkdir(parents=True, exist_ok=True)

requests_path = save_dir / 'processed_requests.csv'
requests_df.to_csv(requests_path, index=False)

### Prepare feedback data

In [None]:
# Map the turn id of each submission's request to the turn ids of the other
# rows in that submission. Submissions without a seeker row, or without any
# other row, are left out.
request2comment_ids = {}

for _, submission_rows in df2.groupby('sub_id'):
    seeker_rows = submission_rows[submission_rows['is_seeker'] == True]
    if seeker_rows.empty:
        continue

    # The first seeker row of the group is taken as the request.
    request_id = seeker_rows.iloc[0]['turn_id']
    comment_ids = [tid for tid in submission_rows['turn_id'].tolist() if tid != request_id]

    if comment_ids:  # at least one comment
        request2comment_ids[request_id] = comment_ids

In [None]:
# Number of requests that have at least one comment.
len(request2comment_ids)

In [None]:
id2movies = {}  # turn_id -> extracted_names (non-seeker rows only)
id2context = {}  # turn_id -> text parsed out of raw

# itertuples over just the needed columns yields plain tuples; iterrows builds
# a full Series per row and its index value was never used.
lookup_columns = ['turn_id', 'raw', 'is_seeker', 'extracted_names']
for turn_id, raw, is_seeker, names in df2[lookup_columns].itertuples(index=False, name=None):

    # `raw` is parsed as a Python literal and its second element is kept,
    # presumably a (speaker, text) pair — TODO confirm against the data.
    id2context[turn_id] = ast.literal_eval(raw)[1].strip()

    if not is_seeker:
        id2movies[turn_id] = names

### True vs. random: pair each request with its first comment and a randomly sampled one

In [None]:
# Pool of every comment id that belongs to some request.
all_comment_ids = {
    comment_id
    for ids in request2comment_ids.values()
    for comment_id in ids
}

# Pool of every movie name mentioned in a non-seeker row.
all_movies = {
    movie
    for names in id2movies.values()
    for movie in names
}

In [None]:
items_data = []
context_data = []

# A private, seeded generator makes the negatives identical on every run and
# leaves the global `random` state untouched.
RANDOM_SEED = 42
rng = random.Random(RANDOM_SEED)

# Sets iterate in an order that can differ between interpreter runs, so a seed
# alone is not enough: the candidate pools are sorted once here and filtered
# per request below.
# Assumes turn ids and movie names are mutually comparable (e.g. all strings)
# — TODO confirm.
comment_pool = sorted(all_comment_ids)
movie_pool = sorted(all_movies)

for request_id, comment_ids in tqdm(request2comment_ids.items()):

    # request
    request = id2context[request_id]

    # positive comment (first comment)
    first_comment_id = comment_ids[0]
    first_comment = id2context[first_comment_id]

    # random comment: any pooled comment that does not belong to this request
    own_comment_ids = set(comment_ids)
    rand_comment_id = rng.choice([cid for cid in comment_pool if cid not in own_comment_ids])
    random_comment = id2context[rand_comment_id]

    # positive movies
    first_movies = id2movies[first_comment_id]
    assert len(first_movies) >= 1

    # random movies (same amount), excluding every movie mentioned in any
    # comment of this request
    own_movies = {movie for cid in comment_ids for movie in id2movies[cid]}
    random_movies = rng.sample(
        [movie for movie in movie_pool if movie not in own_movies],
        k=len(first_movies),
    )

    items_data.append(
        {
            "request_id": request_id,
            "request": request,
            "first": ', '.join(first_movies),
            "random": ', '.join(random_movies)
        }
    )
    context_data.append(
        {
            "request_id": request_id,
            "request": request,
            "first": first_comment,
            "random": random_comment
        }
    )

In [None]:
# Turn both record lists into frames (one row per request).
items_large_df, context_large_df = (
    pd.DataFrame(records) for records in (items_data, context_data)
)

In [None]:
# Preview the movie-name pairs (first vs. random).
items_large_df.head()

In [None]:
# Preview the comment-text pairs (first vs. random).
context_large_df.head()

In [None]:
def _rows_mentioning_request(frame):
    """Keep rows whose `request` text contains 'request' (case-insensitive; missing text never matches)."""
    keep = frame['request'].str.contains('request', case=False, na=False)
    return frame[keep]


items_df = _rows_mentioning_request(items_large_df)
context_df = _rows_mentioning_request(context_large_df)

In [None]:
# Sizes of the two filtered subsets.
len(items_df), len(context_df)

In [None]:
# Preview the filtered movie-name pairs.
items_df.head()

In [None]:
# Preview the filtered comment-text pairs.
context_df.head()

In [None]:
# Write the full and the filtered variants of both tables to the feedback folder.
save_dir = Path('../data/t5_feedback/')
save_dir.mkdir(parents=True, exist_ok=True)

feedback_outputs = [
    ('items-large.csv', items_large_df),
    ('items.csv', items_df),
    ('context-large.csv', context_large_df),
    ('context.csv', context_df),
]
for file_name, frame in feedback_outputs:
    frame.to_csv(save_dir / file_name, index=False)