In [66]:
import os
import shutil
import cv2
import glob

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

In [67]:
RAW_SCREENSHOTS_DIR = '/Users/connorparish/projects/hindsight/pixel8_screenshots_o'
PROCESSED_SCREENSHOTS_DIR = '/Users/connorparish/projects/hindsight/pixel8_screenshots'

In [77]:
def generate_images_df(image_dir):
    images_l = list()
    for f in glob.glob(f"{image_dir}/*.jpg"):
        filename = f.split('/')[-1]
        filename_s = filename.replace(".jpg", "").split("_")
        application = filename_s[0]
        timestamp = int(filename_s[1])
        images_l.append({"path" : f, "timestamp" : timestamp, "app" : application})
    return pd.DataFrame(images_l)

In [78]:
images_df = generate_images_df(image_dir=RAW_SCREENSHOTS_DIR).sort_values(by="timestamp", ascending=True).reset_index(drop=True)

In [79]:
def get_image_diff(frame_0_gray, frame_1_gray):
    frame_diff = cv2.absdiff(frame_0_gray, frame_1_gray)
    frame_diff = frame_diff.astype(np.uint8)
    diff_percentage = np.count_nonzero(frame_diff) / frame_diff.size
    return diff_percentage

In [80]:
DIFF_THRESH = 0.05

unique_images = list(images_df.iloc[0]['path'])
prev_image = None
for i, row in images_df.iterrows():
    image = cv2.imread(row['path'], 0)
    if prev_image is not None:
        diff = get_image_diff(prev_image, image)
        if diff > DIFF_THRESH:
            unique_images.append(row['path'])
            prev_image = image
    else:
        prev_image = image

In [81]:
len(unique_images)

6356

In [82]:
unique_images_df = images_df.loc[images_df['path'].isin(unique_images)]

In [83]:
unique_images_df['datetime'] = pd.to_datetime(unique_images_df['timestamp'] / 1000, unit='s', utc=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_images_df['datetime'] = pd.to_datetime(unique_images_df['timestamp'] / 1000, unit='s', utc=True)


In [85]:
unique_images_df['day_dir'] = unique_images_df.apply(lambda row: os.path.join(PROCESSED_SCREENSHOTS_DIR, f"{row['datetime'].strftime('%Y/%m/%d')}/{row['app']}/"), axis=1)
unique_images_df['filename'] = unique_images_df['path'].apply(lambda x: x.split('/')[-1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_images_df['day_dir'] = unique_images_df.apply(lambda row: os.path.join(PROCESSED_SCREENSHOTS_DIR, f"{row['datetime'].strftime('%Y/%m/%d')}/{row['app']}/"), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_images_df['filename'] = unique_images_df['path'].apply(lambda x: x.split('/')[-1])


In [86]:
def make_dir(d):
    if not os.path.exists(d):
        os.makedirs(d)

for d in set(unique_images_df['day_dir']):
    make_dir(d)

In [87]:
for i, row in unique_images_df.iterrows():
    dest_f = os.path.join(row['day_dir'], row['filename'])
    if not os.path.exists(dest_f):
        shutil.copyfile(row['path'], dest_f)