# In [1]:
import pandas as pd
import random

# Read result.csv and keep a single row per postId (the first occurrence).
# Timestamps are parsed to datetimes; unparseable values become NaT.
df = pd.read_csv('result.csv').drop_duplicates(subset='postId')
df = df.assign(timestamp=pd.to_datetime(df['timestamp'], errors='coerce'))

# Split into before/after election.
# A single cutoff drives both masks so the halves are disjoint: previously
# `< '2024-11-06'` and `> '2024-11-05'` both matched any row timestamped
# during 2024-11-05 (after midnight), putting those rows in both frames.
# Election day itself stays in the "before" half, matching the original
# `<` test. Rows whose timestamp is NaT compare False on both sides and so
# appear in neither frame.
# The cutoff is kept as a string so pandas parses it against the column
# (NOTE(review): a naive pd.Timestamp would fail on a tz-aware column).
election_cutoff = '2024-11-06'
df_before = df[df['timestamp'] < election_cutoff]
df_after = df[df['timestamp'] >= election_cutoff]

# Cap each half at 5000 rows; a half with fewer rows is drawn in full.
# The fixed random_state makes the draw reproducible.
num_before = min(5000, df_before.shape[0])
num_after = min(5000, df_after.shape[0])
df_before_sampled = df_before.sample(num_before, random_state=42)
df_after_sampled = df_after.sample(num_after, random_state=42)

# Read posts.csv, keep a single row per postUrl, then retain only posts
# with at least 100 likes and at least 10 comments.
df_posts = (
    pd.read_csv('posts.csv')
    .drop_duplicates(subset='postUrl')
    .loc[lambda posts: posts['likeCount'].ge(100) & posts['commentCount'].ge(10)]
)

# Inner-join each sampled half with the filtered posts on 'postUrl'.
# Columns present on both sides are disambiguated with '_left' (sampled
# half) and '_right' (posts) suffixes.
merge_options = {
    'on': 'postUrl',
    'how': 'inner',
    'suffixes': ('_left', '_right'),
}
df_merged_before = df_before_sampled.merge(df_posts, **merge_options)
df_merged_after = df_after_sampled.merge(df_posts, **merge_options)

# Columns carried into the output. The '_left' entries are the left-hand
# versions of columns that were suffixed by the merge; 'matches' is
# selected without a suffix (presumably it exists on only one side of the
# merge - verify against the input files).
cols_to_keep = [
    'postUrl',
    'timestamp_left', 'username_left', 'fullName_left',
    'description_left',
    'likeCount_left', 'commentCount_left',
    'location_left',
    'matches',
]

# Project onto those columns, then keep the first row for each postUrl.
df_merged_before = df_merged_before.loc[:, cols_to_keep].drop_duplicates(subset='postUrl')
df_merged_after = df_merged_after.loc[:, cols_to_keep].drop_duplicates(subset='postUrl')

# Strip the merge suffix from the kept columns; 'description_left' is
# additionally renamed to 'caption'.
rename_map = {
    'timestamp_left': 'timestamp',
    'username_left': 'username',
    'fullName_left': 'fullName',
    'description_left': 'caption',
    'likeCount_left': 'likeCount',
    'commentCount_left': 'commentCount',
    'location_left': 'location',
}
df_merged_before = df_merged_before.rename(columns=rename_map)
df_merged_after = df_merged_after.rename(columns=rename_map)

# Write both result frames to CSV without the index column.
for _frame, _csv_path in (
    (df_merged_before, 'final_before.csv'),
    (df_merged_after, 'final_after.csv'),
):
    _frame.to_csv(_csv_path, index=False)