# Notebook to extract entities using annotations db

In [1]:
import os
import shutil
from PIL import Image

import sys
sys.path.insert(0, "../")
sys.path.insert(0, "../../hindsight/hindsight_server/")

from annotations_db import HindsightAnnotationsDB
from db import HindsightDB
from utils import make_dir

In [2]:
annotations_db = HindsightAnnotationsDB()
db = HindsightDB()

In [4]:
annotations = annotations_db.get_annotations()

In [5]:
annotations

Unnamed: 0,id,frame_id,x,y,w,h,rotation,label,conf,model_name,model_version,model_file_hash,parent_annotation_id
0,1,376317.0,0.000000,2216.183594,1087.765259,138.503662,0.0,twitter_bottom_menu,0.952205,YOLOv5_train162,v0.0,3c941935636ffbd14d2565d13f6c3abe,
1,2,376317.0,903.131042,2008.995972,124.684021,125.122437,0.0,plus_button,0.887595,YOLOv5_train162,v0.0,3c941935636ffbd14d2565d13f6c3abe,
2,3,376317.0,338.994812,446.116119,408.597290,82.501312,0.0,more_posted,0.821179,YOLOv5_train162,v0.0,3c941935636ffbd14d2565d13f6c3abe,
3,4,376317.0,0.000000,2046.011230,1088.000000,174.036377,0.0,tweet,0.811268,YOLOv5_train162,v0.0,3c941935636ffbd14d2565d13f6c3abe,
4,5,376317.0,0.000000,154.030319,1088.000000,243.673813,0.0,twitter_top_menu,0.810504,YOLOv5_train162,v0.0,3c941935636ffbd14d2565d13f6c3abe,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26311,26312,,793.245544,15.970653,63.247559,50.891423,0.0,time_since_post,0.936331,YOLOv8_tweet_parse,v0.0,1fd2955ab036647e4f9efd652f0e030f,132.0
26312,26313,,152.836807,15.331071,258.964890,57.652045,0.0,username,0.934122,YOLOv8_tweet_parse,v0.0,1fd2955ab036647e4f9efd652f0e030f,132.0
26313,26314,,521.488953,1175.373657,164.585938,59.817383,0.0,likes,0.930023,YOLOv8_tweet_parse,v0.0,1fd2955ab036647e4f9efd652f0e030f,132.0
26314,26315,,152.413269,1179.432007,142.802002,57.154663,0.0,replies,0.914354,YOLOv8_tweet_parse,v0.0,1fd2955ab036647e4f9efd652f0e030f,132.0


In [141]:
annotations['x2'] = annotations['x'] + annotations['w']
annotations['y2'] = annotations['y'] + annotations['h']

In [142]:
frames = db.get_frames(frame_ids=set(annotations['frame_id']))
frames['filename'] = frames['path'].apply(lambda x: os.path.basename(x).split('.')[0])
frame_id_to_path = {f : p for f, p in zip(frames['id'], frames['path'])}

In [143]:
tweets = annotations.loc[annotations['label'] == "tweet"]
complete_y_min = 160
complete_y_max = 2200

def tweet_complete(row):
    im_annotations = annotations.loc[annotations['frame_id'] == row['frame_id']]
    all_im_annotations = set(im_annotations['label'])
    y_min = complete_y_min
    y_max = complete_y_max
    if "twitter_top_menu" in all_im_annotations:
        y_min += im_annotations.loc[im_annotations['label'] == "twitter_top_menu"].iloc[0]['y2']
    
    if "twitter_bottom_menu" in all_im_annotations:
        y_max -= im_annotations.loc[im_annotations['label'] == "twitter_bottom_menu"].iloc[0]['y']

    return row['y'] >= y_min and row['y2'] <= y_max

tweets['complete_tweet'] = tweets.apply(lambda row: tweet_complete(row), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['complete_tweet'] = tweets.apply(lambda row: tweet_complete(row), axis=1)


In [144]:
complete_tweets = tweets.loc[tweets['complete_tweet']]
partial_tweets = tweets.loc[~tweets['complete_tweet']]

In [145]:
tweets_dir = "../data/extracted_entities/tweets/"
complete_tweets_dir = os.path.join(tweets_dir, "complete")
make_dir(complete_tweets_dir)
partial_tweets_dir = os.path.join(tweets_dir, "partial")
make_dir(partial_tweets_dir)

for i, row in frames.iterrows():
    frame_annotations = complete_tweets.loc[complete_tweets['frame_id'] == row['id']]
    im = Image.open(row['path'])
    for i, an_row in frame_annotations.iterrows():
        e = im.crop([an_row['x'], an_row['y'], an_row['x2'], an_row['y2']])
        e.save(str(os.path.join(complete_tweets_dir, f"{row['filename']}_{an_row['id']}.jpg")))

for i, row in frames.iterrows():
    frame_annotations = partial_tweets.loc[partial_tweets['frame_id'] == row['id']]
    im = Image.open(row['path'])
    for i, an_row in frame_annotations.iterrows():
        e = im.crop([an_row['x'], an_row['y'], an_row['x2'], an_row['y2']])
        e.save(str(os.path.join(partial_tweets_dir, f"{row['filename']}_{an_row['id']}.jpg")))


# Deduplicate Complete Tweets

In [146]:
import imagehash

In [148]:
def load_images_from_folder(folder):
    images = []
    for filename in os.listdir(folder):
        if filename.endswith((".png", ".jpg", ".jpeg")):
            img = Image.open(os.path.join(folder, filename))
            images.append((img, filename))
    return images

In [149]:
def deduplicate_images(images, hash_size=8, max_distance=5):
    # hash_size controls the fineness of the hash, larger values mean finer hashes
    # max_distance defines how many bits may differ for images to be considered duplicates
    hashes = {}
    duplicates = []
    for img, filename in images:
        img_hash = imagehash.phash(img, hash_size=hash_size)
        # Compare this hash with hashes of all stored images
        found_duplicate = False
        for stored_hash in list(hashes.keys()):
            if img_hash - stored_hash <= max_distance:
                duplicates.append(filename)
                found_duplicate = True
                break
        if not found_duplicate:
            hashes[img_hash] = filename
    return hashes, duplicates

In [150]:
complete_tweets = load_images_from_folder(complete_tweets_dir)

In [151]:
hashes, duplicates = deduplicate_images(complete_tweets)

In [162]:
unique_tweets_dir = os.path.join(tweets_dir, "complete_unique")
make_dir(unique_tweets_dir)
for unique_image in set(hashes.values()):
    im_path = os.path.join(complete_tweets_dir, unique_image)
    dest_path = os.path.join(unique_tweets_dir, unique_image)
    shutil.copy(im_path, dest_path)

In [135]:
f = "com-twitter-android_1726321931966"

In [136]:
frames.loc[frames['filename'] == f]['path'].iloc[0]

'/Users/connorparish/.hindsight_server/data/raw_screenshots/2024/09/14/com-twitter-android/com-twitter-android_1726321931966.jpg'

In [137]:
frame_id = frames.loc[frames['filename'] == f]['id'].iloc[0]

In [138]:
tweets.loc[tweets['frame_id'] == frame_id]

Unnamed: 0,id,frame_id,x,y,w,h,rotation,label,conf,model_name,model_version,model_file_hash,x2,y2,complete_tweet
2678,2679,395176,0.0,183.125031,1088.0,628.504669,0.0,tweet,0.675312,YOLOv5_train162,v0.0,3c941935636ffbd14d2565d13f6c3abe,1088.0,811.6297,True
2679,2680,395176,0.0,811.97345,1088.0,1462.484314,0.0,tweet,0.456058,YOLOv5_train162,v0.0,3c941935636ffbd14d2565d13f6c3abe,1088.0,2274.457764,False


In [139]:
annotations.loc[annotations['frame_id'] == frame_id]

Unnamed: 0,id,frame_id,x,y,w,h,rotation,label,conf,model_name,model_version,model_file_hash,x2,y2
2678,2679,395176,0.0,183.125031,1088.0,628.504669,0.0,tweet,0.675312,YOLOv5_train162,v0.0,3c941935636ffbd14d2565d13f6c3abe,1088.0,811.6297
2679,2680,395176,0.0,811.97345,1088.0,1462.484314,0.0,tweet,0.456058,YOLOv5_train162,v0.0,3c941935636ffbd14d2565d13f6c3abe,1088.0,2274.457764


In [None]:
not_complete = {"com-twitter-android_1726154294816", "com-twitter-android_1726321931966", "com-twitter-android_1726169561768",
                "com-twitter-android_1726344211810_3399.jpg", "com-twitter-android_1726021710347_362.jpg"}

In [None]:
is_complete = {"com-twitter-android_1726020422426_328", "com-twitter-android_1726020342516_499", "com-twitter-android_1726020476294_309", 
               "com-twitter-android_1726021652034_312", "com-twitter-android_1726021737264_562", "com-twitter-android_1726021743542_434", 
               "com-twitter-android_1726066409300_665", "com-twitter-android_1726066442024_643", "com-twitter-android_1726101283459_837",
               "com-twitter-android_1726095070668_704", "com-twitter-android_1726101285568_835", "com-twitter-android_1726147559072_1249"}  