# Extract tweet entities from annotations db

In [7]:
import os
import time
import shutil
import random
from PIL import Image
import pandas as pd

import sys
sys.path.insert(0, "/Users/connorparish/code/hindsight")
sys.path.insert(0, "../../../hindsight/hindsight_server/")
from annotation_helpers import add_hindsight_frame_path, get_entity_image, visualize_annotations, get_entity_image
from annotations_db import HindsightAnnotationsDB

from db import HindsightDB
from utils import make_dir, ocr_results_to_str, add_datetimes

In [8]:
model_version = "v0.1"

In [9]:
annotations_db = HindsightAnnotationsDB()
db = HindsightDB()

In [10]:
all_annotations = annotations_db.get_annotations()
all_annotations['x2'] = all_annotations['x'] + all_annotations['w']
all_annotations['y2'] = all_annotations['y'] + all_annotations['h']

In [11]:
# all_tweet_annotations = all_annotations.loc[~all_annotations['parent_annotation_id'].isnull()]
all_tweet_annotations = all_annotations.loc[all_annotations['model_version'] == model_version]
all_tweet_annotations = all_tweet_annotations.loc[~all_tweet_annotations['label'].isnull()]
frame_annotations = all_annotations .loc[~all_annotations['frame_id'].isnull()]
annotated_frame_ids = {int(i) for i in frame_annotations['frame_id']}
all_ocr_res = db.get_frames_with_ocr(frame_ids=annotated_frame_ids)
all_ocr_res['x2'] = all_ocr_res['x'] + all_ocr_res['w']
all_ocr_res['y2'] = all_ocr_res['y'] + all_ocr_res['h']

In [12]:
def compute_overlap_and_positions(row, annotation_row):
    # Coordinates of the OCR rectangle
    ocr_x1 = row['x']
    ocr_y1 = row['y']
    ocr_x2 = row['x2']
    ocr_y2 = row['y2']
    
    # Coordinates of the overlapping area
    overlap_x1 = max(ocr_x1, annotation_row['x'])
    overlap_y1 = max(ocr_y1, annotation_row['y'])
    overlap_x2 = min(ocr_x2, annotation_row['x2'])
    overlap_y2 = min(ocr_y2, annotation_row['y2'])
    
    # Width and height of the OCR rectangle
    ocr_width = ocr_x2 - ocr_x1
    ocr_height = ocr_y2 - ocr_y1
    
    # Width and height of the overlapping area
    overlap_width = max(0, overlap_x2 - overlap_x1)
    overlap_height = max(0, overlap_y2 - overlap_y1)
    
    # Check for no overlap or invalid dimensions
    if overlap_width <= 0 or overlap_height <= 0 or ocr_width <= 0 or ocr_height <= 0:
        return None  # No overlap or invalid rectangle dimensions
    
    # Assuming left-to-right text direction
    # Relative start and end positions along the x-axis
    relative_start = (overlap_x1 - ocr_x1) / ocr_width
    relative_end = (overlap_x2 - ocr_x1) / ocr_width
    
    # Ensure relative positions are within [0,1]
    relative_start = max(0, min(1, relative_start))
    relative_end = max(0, min(1, relative_end))
    
    return relative_start, relative_end

def extract_text_within_overlap(row):
    text = row['text']
    relative_start, relative_end = row['relative_positions']
    
    # Total length of the text
    text_length = len(text)
    
    # Calculate character indices corresponding to the overlap
    start_char = int(text_length * relative_start)
    end_char = int(text_length * relative_end)
    
    # Ensure indices are within the bounds of the text
    start_char = max(0, min(text_length, start_char))
    end_char = max(0, min(text_length, end_char))
    
    # Extract the text segment
    extracted_text = text[start_char:end_char]
    
    return extracted_text

def get_sub_df(df, annotation_row):
    df = df.copy()
    
    # Compute overlap positions
    overlap_results = df.apply(
        lambda row: compute_overlap_and_positions(row, annotation_row), axis=1)
    
    # Remove rows with no overlap
    df = df[overlap_results.notnull()]
    df['relative_positions'] = overlap_results.dropna()
    
    if len(df) == 0:
        return pd.DataFrame()
    # Extract the portion of text within the overlapping area
    df['extracted_text'] = df.apply(lambda row: extract_text_within_overlap(row), axis=1)
    
    # Adjust coordinates relative to the annotation rectangle (optional)
    df['x'] = df['x'] - annotation_row['x']
    df['y'] = df['y'] - annotation_row['y']
    df['x2'] = df['x2'] - annotation_row['x']
    df['y2'] = df['y2'] - annotation_row['y']
    
    return df

In [13]:
tweet_labels = {'ad_icon',
 'community_notes',
 'image_content_source',
 'impresssions',
 'likes',
 'more_posted',
 'quoted_tweet',
 'replies',
 'retweets',
 'time_since_post',
 'tweet_image_content',
 'tweet_text',
 'user_association_image',
 'user_handle',
 'user_image',
 'username',
 'verified_check'}

In [14]:
tweet_text_labels = {
 'community_notes',
 'image_content_source',
 'impresssions',
 'likes',
 'replies',
 'retweets',
 'time_since_post',
 'tweet_text',
 'user_handle',
 'username'}

tweet_binary_labels = {
    "ad_icon",
    "verified_check",
    "user_association_image",
    "user_image",
    "tweet_image_content"
}

In [15]:
def dedupe_tweet_labels(labels_df):
    labels_df = labels_df.copy()
    labels_df = labels_df.sort_values(by=["y"], ascending=True)
    return labels_df.drop_duplicates(subset=['label'], keep="first")
    

In [16]:
parsed_tweets = list()
for parent_annotations_id in set(all_tweet_annotations['parent_annotation_id']):
    # if len(parsed_tweets) > 50:
    #     continue
    parent_annotation_row = all_annotations.loc[all_annotations['id'] == parent_annotations_id].iloc[0]
    frame_ocr_res = all_ocr_res.loc[all_ocr_res['frame_id'] == int(parent_annotation_row['frame_id'])]
    tweet_ocr_res = get_sub_df(frame_ocr_res, parent_annotation_row)

    tweet_d = {"parent_annotations_id" : int(parent_annotations_id)}
    tweet_annotations = all_tweet_annotations.loc[all_tweet_annotations['parent_annotation_id'] == parent_annotations_id]
    
    all_tweet_annotation_labels = set(tweet_annotations['label'])
    if "quoted_tweet" in all_tweet_annotations:
        quoted_tweet_row = all_tweet_annotations.loc[all_tweet_annotations['label'] == "quoted_tweet"].iloc[0]
        quoted_tweet_annotations = get_sub_df(tweet_annotations, quoted_tweet_row)
        quoted_tweet_annotations_ids = set(quoted_tweet_annotations_ids['id'])
        tweet_annotations = tweet_annotations.loc[~tweet_annotations['id'].isin(quoted_tweet_annotations_ids)] # Remove quoted tweet annotations from tweet_annotations
        quoted_tweet_annotations = quoted_tweet_annotations.loc[quoted_tweet_annotations['label'] != "quoted_tweet"]
        quoted_tweet_annotations_dedupe = dedupe_tweet_labels(quoted_tweet_annotations_dedupe)

        quoted_tweet_ocr_res = get_sub_df(tweet_ocr_res, quoted_tweet_row)
        
        quoted_tweet_d = {}
        for i, row in quoted_tweet_annotations_dedupe.iterrows():
            if row['label'] in tweet_binary_labels:
                quoted_tweet_d[row['label']] = True
            else:
                label_ocr_res = get_sub_df(quoted_tweet_ocr_res, row)
                if len(label_ocr_res) > 0:
                    quoted_tweet_d[row['label']] = " ".join(label_ocr_res['extracted_text'])
                else:
                    quoted_tweet_d[row['label']] = -1
        tweet_d["quoted_tweet"] = quoted_tweet_d
        tweet_annotations = tweet_annotations.loc[tweet_annotations['label'] != "quoted_tweet"] # In case multiple quoted_tweet labels

    tweet_annotations = dedupe_tweet_labels(tweet_annotations)
    for i, row in tweet_annotations.iterrows():
        if row['label'] in tweet_binary_labels:
            tweet_d[row['label']] = True
        else:
            label_ocr_res = get_sub_df(tweet_ocr_res, row)
            if len(label_ocr_res) > 0:
                tweet_d[row['label']] = " ".join(label_ocr_res['extracted_text'])
            else:
                tweet_d[row['label']] = -1 # For when there is an obj detected but no ocr results
    
    parsed_tweets.append(tweet_d)
    

In [22]:
frame_id_to_timestamp = {f : t for f, t in zip(all_ocr_res['frame_id'], all_ocr_res['timestamp'])}
parent_annotation_id_to_frame_id = {i : f for i, f in zip(all_annotations['id'], all_annotations['frame_id'])}

In [23]:
parsed_tweets_df = pd.DataFrame(parsed_tweets)

In [24]:
parsed_tweets_df['frame_id'] = parsed_tweets_df['parent_annotations_id'].map(parent_annotation_id_to_frame_id)
parsed_tweets_df['timestamp'] = parsed_tweets_df['frame_id'].map(frame_id_to_timestamp)

In [25]:
parsed_tweets_df.to_csv(f"all_tweets-{int(time.time())}.csv", index=False)

In [405]:
needed_columns = {
 'impresssions',
 'likes',
 'replies',
 'retweets',
 'time_since_post',
 'user_handle',
 'user_image',
 'username'}
cleaned_parsed_tweets_df = parsed_tweets_df.dropna(subset=needed_columns)

In [265]:
dropped_tweets = set(parsed_tweets_df['parent_annotations_id']) - set(cleaned_parsed_tweets_df['parent_annotations_id'])

In [338]:
cleaned_parsed_tweets_df['5min_interval'] = cleaned_parsed_tweets_df.groupby(pd.Grouper(key='datetime_local', freq='5T'))['datetime_local'].transform('min')

  cleaned_parsed_tweets_df['5min_interval'] = cleaned_parsed_tweets_df.groupby(pd.Grouper(key='datetime_local', freq='5T'))['datetime_local'].transform('min')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_parsed_tweets_df['5min_interval'] = cleaned_parsed_tweets_df.groupby(pd.Grouper(key='datetime_local', freq='5T'))['datetime_local'].transform('min')


In [339]:
dupe_columns = {
 'tweet_text',
 'username',
 'tweet_image_content',
 '5min_interval'}
# Add time and drop if within 5 mins
cleaned_parsed_tweets_df = cleaned_parsed_tweets_df.drop_duplicates(subset=dupe_columns, keep="first")

In [341]:
cleaned_parsed_tweets_df.to_csv("extracted_tweets_1.csv", index=False)

# Debugging

In [413]:
# parent_annotations_id = random.choice(list(dropped_tweets))
parent_annotations_id = 120943

In [414]:
parent_annotation_row = all_annotations.loc[all_annotations['id'] == parent_annotations_id].iloc[0]
frame_ocr_res = all_ocr_res.loc[all_ocr_res['frame_id'] == int(parent_annotation_row['frame_id'])]
tweet_ocr_res = get_sub_df(frame_ocr_res, parent_annotation_row)

ValueError: cannot convert float NaN to integer

In [393]:
def get_entity_image(entity_row):
    if "frame_path" in entity_row:
        im = Image.open(entity_row['frame_path'])
    else:
        im_path = db.get_frames(frame_ids=[entity_row['frame_id']]).iloc[0]['path']
        im = Image.open(im_path)
    e = im.crop([entity_row['x'], entity_row['y'], entity_row['x2'], entity_row['y2']])
    e.filename = str(entity_row['id'])
    return e

In [394]:
tweet_annotations = all_tweet_annotations.loc[all_tweet_annotations['parent_annotation_id'] == parent_annotations_id]

In [395]:
e = get_entity_image(parent_annotation_row)

In [396]:
viz_e = visualize_annotations(e, tweet_annotations)

In [397]:
viz_e.show()

In [None]:
# Annotate
4686, 195399, 135018, 163161, 196444, 138