In [37]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
import seaborn as sns
sns.set_theme(font_scale=1.2,
        style="ticks",
        rc={
        "text.usetex": True,
        'text.latex.preamble': r'\usepackage{amsfonts}',
        "font.family": "serif",
    })

In [4]:
def load_csv_files_to_dataframe(directory, columns=None):
    """
    Reads all CSV files in the specified directory and combines them into a single pandas DataFrame.

    Parameters:
        directory (str): The path to the directory containing the CSV files.

    Returns:
        pd.DataFrame: A DataFrame containing the combined data from all CSV files.
    """
    # List to store dataframes
    dataframes = []

    # Iterate through all files in the directory
    for filename in os.listdir(directory):
        if filename.endswith('.csv'):   
            if "random" in filename:
                continue
            filepath = os.path.join(directory, filename)
            print(f"Loading file: {filepath}")
            # Read the CSV file and append to the list
            dataframes.append(pd.read_csv(filepath, usecols=columns))

    # Concatenate all dataframes into one
    combined_dataframe = pd.concat(dataframes, ignore_index=True)
    return combined_dataframe

In [64]:
df = load_csv_files_to_dataframe("KuaiRand-Harm/data", ["play_time_ms", "is_hate", "duration_ms", "video_id", "is_click", "user_id", "date", "hourmin", "time_ms", "is_rand"])

Loading file: KuaiRand-Harm/data/log_standard_4_08_to_4_21_27k_part2.csv
Loading file: KuaiRand-Harm/data/log_standard_4_08_to_4_21_27k_part1.csv
Loading file: KuaiRand-Harm/data/log_standard_4_22_to_5_08_27k_part1.csv
Loading file: KuaiRand-Harm/data/log_standard_4_22_to_5_08_27k_part2.csv


In [65]:
dt = pd.to_datetime(df["time_ms"], unit='ms').dt.tz_localize('Asia/Singapore').dt.tz_convert("ETC/GMT-8")
df["timestamp"] = dt
df.drop(columns=["date", "hourmin", "time_ms"], inplace=True)
df = df[df.is_click == 1]

In [67]:
# Filter a bit to ensure we have reasonable data
original_size = len(df)
print("Total interaction size: ", len(df))

# Keep only elements with a duration greater than zero.
df = df[df.duration_ms > 0]
df["fraction_play_time"] = df.play_time_ms/df.duration_ms

# Remove users which now do not have any negative video flagged
harmful_users = df[df['is_hate'] == 1]['user_id'].unique()
df = df[df['user_id'].isin(harmful_users)]

reduction_size = len(df)
print("Final interaction size: ", len(df))
print("Reduction", 1-reduction_size/original_size) # We basically loose around 12 % of interactions
print("Unique videos: ", len(df.video_id.unique()))

Total interaction size:  24462781


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["fraction_play_time"] = df.play_time_ms/df.duration_ms


Final interaction size:  18950714
Reduction 0.22532462682799637
Unique videos:  3875888


In [76]:
# Identify video_ids that have been seen at least twice by one user
videos_seen_twice = df.groupby(["user_id", "video_id"]).size().reset_index(name='count')
videos_to_keep = videos_seen_twice[videos_seen_twice['count'] >= 2]["video_id"].unique()

# Filter the original dataframe to include all rows with those video_ids
df = df[df["video_id"].isin(videos_to_keep)]

In [77]:
print("Final interaction length: ", len(df))
print("[*] Reduction", 1-len(df)/original_size)

Final interaction length:  3645375
[*] Reduction 0.8509828052664985


In [105]:
# Re-index all users and videos, since we might have removed some
df['user_id'], mapping_user_id = pd.factorize(df['user_id'])
df['video_id'], mapping_video_id = pd.factorize(df['video_id'])

In [106]:
# Now, we separate repeated and non-repeated videos
df = df.sort_values(by=["user_id", "video_id", "timestamp"])

# Create DataFrame with only the first repeated occurrences
repeated_videos = df[df.duplicated(subset=["user_id", "video_id"], keep=False)].groupby(["user_id", "video_id"]).nth(1).reset_index()

# Create DataFrame with videos seen only once
seen_once_videos = df.drop_duplicates(subset=["user_id", "video_id"], keep="first")

In [108]:
repeated_videos.drop(columns=["index"], inplace=True)

In [125]:
merged_df = pd.merge(seen_once_videos, repeated_videos[["user_id", "video_id", "is_hate", "fraction_play_time", 'play_time_ms']],
                    on=['user_id', 'video_id'],
                   how="left")

In [None]:
merged_df.rename(columns={
    "is_hate_x": "is_hate",
    "fraction_play_time_x": "fraction_play_time"
}, inplace=True)

merged_df.to_csv("KuaiRand-Harm/training/single_and_repeated_interactions_is_click.csv.gzip", index=None, compression='gzip')