In [1]:
import os
import cv2
import json
import yaml 
import pickle
import requests
import pytesseract
# bc I played with it before: https://colab.research.google.com/drive/1uk_pc2HdYfu8i3IpmONKft2j97eo6vzU#scrollTo=pazxU4FPg8sO
# but also, rekognition? https://www.kite.com/blog/python/converting-screenshots-into-data-using-ocr-aws-rekognition/

import pandas as pd

from PIL import Image
from google_auth_oauthlib.flow import Flow, InstalledAppFlow
from googleapiclient.discovery import build
from google.auth.transport.requests import Request

In [2]:
# source: https://github.com/polzerdo55862/google-photos-api/blob/27a0ec0d99e0acfc101c2b31dbc6cd967c27e5d6/Google_API.ipynb
class GooglePhotosApi:
    def __init__(self,
                 api_name = 'photoslibrary',
                 client_secret_file= r'./credentials/client_secret.json',
                 api_version = 'v1',
                 scopes = ['https://www.googleapis.com/auth/photoslibrary']):
        '''
        Args:
            client_secret_file: string, location where the requested credentials are saved
            api_version: string, the version of the service
            api_name: string, name of the api e.g."docs","photoslibrary",...
            api_version: version of the api

        Return:
            service:
        '''

        self.api_name = api_name
        self.client_secret_file = client_secret_file
        self.api_version = api_version
        self.scopes = scopes
        self.cred_pickle_file = f'./credentials/token_{self.api_name}_{self.api_version}.pickle'

        self.cred = None
        self.base_out_dir = "./raw_screenshots"

    def run_local_server(self):
        # is checking if there is already a pickle file with relevant credentials
        if os.path.exists(self.cred_pickle_file):
            with open(self.cred_pickle_file, 'rb') as token:
                self.cred = pickle.load(token)

        # if there is no pickle file with stored credentials, create one using google_auth_oauthlib.flow
        if not self.cred or not self.cred.valid:
            if self.cred and self.cred.expired and self.cred.refresh_token:
                self.cred.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(self.client_secret_file, self.scopes)
                self.cred = flow.run_local_server()

            with open(self.cred_pickle_file, 'wb') as token:
                pickle.dump(self.cred, token)
        
        return self.cred
    
    def get_screenshots(self, year, month):  # , month, day
        url = 'https://photoslibrary.googleapis.com/v1/mediaItems:search'
        payload = {
                      "pageSize": "100",
                      "filters": {
                      "contentFilter": {
                          "includedContentCategories": [
                            "SCREENSHOTS"
                          ]
                        },
                        "dateFilter": {
                          "dates": [
                            {
    #                           "day": day,
                              "month": month,
                              "year": year
                            }
                          ]
                        }
                      }
                    }
        headers = {
            'content-type': 'application/json',
            'Authorization': 'Bearer {}'.format(self.cred.token)
        }

        try:
            res = requests.request("POST", url, data=json.dumps(payload), headers=headers)
        except:
            print('Request error') 

        return(res)
    
    def get_and_save_screenshots(self, year, month):
        if str(year) not in os.listdir(self.base_out_dir):
            os.mkdir(f"{self.base_out_dir}/{year}")
        os.mkdir(f"{self.base_out_dir}/{year}/{month}")
        response = self.get_screenshots(year, month)
        for item in response.json()['mediaItems']:
            item_response = requests.get(item['baseUrl'])
            with open(os.path.join(f"{self.base_out_dir}/{year}/{month}/", item['filename']), 'wb') as f:
                f.write(item_response.content)
                f.close()
        return len(response.json()['mediaItems'])
    

In [74]:
google_photos_api = GooglePhotosApi()
google_photos_api.run_local_server()

for year in [2020, 2021, 2022]:
    for month_less_one in range(12):
        month = month_less_one+1
        print(f"getting screenshots from {year}/{month}")
        n_saved = google_photos_api.get_and_save_screenshots(year, month)
        print(f"saved {n_saved} screenshots")

getting screenshots from 2020/1
saved 39 screenshots
getting screenshots from 2020/2
saved 48 screenshots
getting screenshots from 2020/3
saved 65 screenshots
getting screenshots from 2020/4
saved 100 screenshots
getting screenshots from 2020/5
saved 100 screenshots
getting screenshots from 2020/6
saved 78 screenshots
getting screenshots from 2020/7
saved 21 screenshots
getting screenshots from 2020/8
saved 59 screenshots
getting screenshots from 2020/9
saved 100 screenshots
getting screenshots from 2020/10
saved 58 screenshots
getting screenshots from 2020/11
saved 82 screenshots
getting screenshots from 2020/12
saved 100 screenshots
getting screenshots from 2021/1
saved 100 screenshots
getting screenshots from 2021/2
saved 44 screenshots
getting screenshots from 2021/3
saved 100 screenshots
getting screenshots from 2021/4
saved 100 screenshots
getting screenshots from 2021/5
saved 100 screenshots
getting screenshots from 2021/6
saved 100 screenshots
getting screenshots from 2021/7
sa

# information extraction

## what do we care about?
figuring this out will be iterative as we see what kinds of images we're working with
adding lists of simple strings to match too, for now

In [203]:
SOURCES = yaml.safe_load(open('configs/sources.yml'))
CONTENT_TYPES = yaml.safe_load(open('configs/content_types.yml'))

## OCR

In [34]:
# # # Helpful for testing things
# # yields the right colors
# Image.open(f"raw_screenshots/2020/11/{img_list[1]}")
# # yields the same image with weird colors
# Image.fromarray(df.head(2)['raw_vec'][1])
# # yields OCR result
# pytesseract.image_to_string(Image.open(f"raw_screenshots/2020/11/{img_list[1]}"))
# # yields same OCR result
# pytesseract.image_to_string(Image.fromarray(df.head(2)['raw_vec'][1]))

In [5]:
# image_list = os.listdir("raw_screenshots/2022/7/")  # reddit, duolingo, 
# image_list = os.listdir("raw_screenshots/2020/11/")  # random shopping, home decor
img_list = os.listdir("raw_screenshots/2020/11/")  # music, shopping, food, decor, book recs
print(img_list[0])
print(len(img_list))

IMG_2818.PNG
81


In [6]:
# https://nanonets.com/blog/ocr-with-tesseract/

# get grayscale image
def get_grayscale(image):
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# noise removal
def remove_noise(image):
    return cv2.medianBlur(image,5)
 
#thresholding
def thresholding(image):
    return cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

#dilation
def dilate(image):
    kernel = np.ones((5,5),np.uint8)
    return cv2.dilate(image, kernel, iterations = 1)
    
#erosion
def erode(image):
    kernel = np.ones((5,5),np.uint8)
    return cv2.erode(image, kernel, iterations = 1)

#opening - erosion followed by dilation
def opening(image):
    kernel = np.ones((5,5),np.uint8)
    return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)

#canny edge detection
def canny(image):
    return cv2.Canny(image, 100, 200)

#skew correction
def deskew(image):
    coords = np.column_stack(np.where(image > 0))
    angle = cv2.minAreaRect(coords)[-1]
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated

#template matching
def match_template(image, template):
    return cv2.matchTemplate(image, template, cv2.TM_CCOEFF_NORMED)

In [7]:
df = pd.DataFrame(img_list, columns=['filename'])
df.head(1)

Unnamed: 0,filename
0,IMG_2818.PNG


In [8]:
def get_img_vector(filename):
    return cv2.imread(f"raw_screenshots/2020/11/{filename}")
df['raw_vec'] = df.filename.apply(get_img_vector)
df.head(1)

Unnamed: 0,filename,raw_vec
0,IMG_2818.PNG,"[[[250, 250, 250], [250, 250, 250], [250, 250,..."


In [79]:
df['gray_vec'] = df.raw_vec.apply(get_grayscale)
df['gray_nr_vec'] = df.gray_vec.apply(remove_noise)
df['gray_nr_thresh_vec'] = df.gray_nr_vec.apply(thresholding)
# order of application probably matters, should test (and read about it more)

df.head(1)

Unnamed: 0,filename,raw_vec,gray_vec,gray_nr_vec,gray_nr_thresh_vec,raw_text,gray_text,gray_nr_text,gray_nr_thresh_text,nr_vec
0,IMG_2818.PNG,"[[[250, 250, 250], [250, 250, 250], [250, 250,...","[[250, 250, 250, 250, 250, 250, 250, 250, 250,...","[[250, 250, 250, 250, 250, 250, 250, 250, 250,...","[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",,,,,"[[[250, 250, 250], [250, 250, 250], [250, 250,..."


In [188]:
def get_text(img_data):
    return pytesseract.image_to_string(Image.fromarray(img_data)).replace("\n", " ")

In [189]:
df['raw_text'] = df.raw_vec.apply(get_text)
df['gray_text'] = df.gray_vec.apply(get_text)
df['gray_nr_text'] = df.gray_nr_vec.apply(get_text)
df['gray_nr_thresh_text'] = df.gray_nr_thresh_vec.apply(get_text)
# df.iloc[:5, 5:]
df.iloc[:5, [0,5,6,7,8]]

Unnamed: 0,filename,raw_text,gray_text,gray_nr_text,gray_nr_thresh_text
0,IMG_2818.PNG,,,,
1,IMG_2763.PNG,Saadia Naeem @SaadiaNaeem23-1h + Replying to @...,Saadia Naeem @SaadiaNaeem23-1h -- Replying to ...,,
2,IMG_3482.PNG,4d Te sor Oca Qu,4d Ts —Qso19 @ Gone Qu,,
3,IMG_3483.PNG,em mmattiokahn® ‘@mattiekann New York will n...,em mattiokahn® ‘@mattiekann New York will ne...,etn con® ee ee eed te rise recone! eta He Qa e...,sent i ee ey tee Jariien ered ea Be at A eae e...
4,IMG_2762.PNG,ee Follow) ‘Typos of the New York Times anytt...,‘Typos of the New York Times |am an appellate...,,


In [None]:
# Hypothesis: results are garbage because Google Photos are compressed 
    # and the poor quality makes it harder to recognize text

In [None]:
def reconcile_text():
    """
    reconcile the text in all the different fields
    probs not necessary, we may just want to pick the best image processing approach instead
    idk
    """
    pass

In [213]:
def get_all_tokens(row):
    all_tokens = set(
        [x.lower() for x in (row['raw_text'] + row['gray_text'] + row['gray_nr_text'] + row['gray_nr_thresh_text']).split(" ")]
    )
    return all_tokens

def match_tokens(tokens, content=False, sources=False):
    assert content + sources <= 1
    match_dict = CONTENT_TYPES if content else SOURCES
    matches = []
    for elem in match_dict:
        matches.extend(list(set(match_dict[elem]).intersection(tokens)))
    if len(matches) > 0:
        return matches
    else:
        return None

def match_content_tokens(tokens):
    return match_tokens(tokens, content=True)

def match_source_tokens(tokens):
    return match_tokens(tokens, sources=True)

In [214]:
df['all_tokens'] = df.apply(get_all_tokens, axis=1)
df.iloc[:5, [0,5,6,7,8,9]]

Unnamed: 0,filename,raw_text,gray_text,gray_nr_text,gray_nr_thresh_text,all_tokens
0,IMG_2818.PNG,,,,,{}
1,IMG_2763.PNG,Saadia Naeem @SaadiaNaeem23-1h + Replying to @...,Saadia Naeem @SaadiaNaeem23-1h -- Replying to ...,,,"{, 2, @nyttypos, to, ray, --, 9), ., @2, +, a,..."
2,IMG_3482.PNG,4d Te sor Oca Qu,4d Ts —Qso19 @ Gone Qu,,,"{, sor, te, —qso19, @, ts, qu, oca, 4d, gone}"
3,IMG_3483.PNG,em mmattiokahn® ‘@mattiekann New York will n...,em mattiokahn® ‘@mattiekann New York will ne...,etn con® ee ee eed te rise recone! eta He Qa e...,sent i ee ey tee Jariien ered ea Be at A eae e...,"{, emia, q, just, up, “021, les, web, with, my..."
4,IMG_2762.PNG,ee Follow) ‘Typos of the New York Times anytt...,‘Typos of the New York Times |am an appellate...,,,"{, lassume, have, q, cringer-, ofthe, out,, my..."


In [215]:
df['content'] = df.all_tokens.apply(match_content_tokens)
df['sources'] = df.all_tokens.apply(match_source_tokens)
# df.iloc[:5, [0,5,6,7,8,9, 10, 11]]
df.iloc[:5, [10, 11]]

Unnamed: 0,content,sources
0,,
1,,
2,,
3,,[twitter]
4,[reading],[tweet]


In [216]:
df.content.explode().unique()

array([None, 'reading', 'series', 'show', 'story', 'book', 'read',
       'confirmation'], dtype=object)

In [217]:
df[['filename', 'content']].dropna()

Unnamed: 0,filename,content
4,IMG_2762.PNG,[reading]
12,IMG_3282.PNG,[series]
23,IMG_3486.PNG,[show]
25,IMG_3608.PNG,"[story, book, read]"
34,IMG_3593.PNG,"[show, confirmation]"
63,IMG_3598.PNG,[show]


In [218]:
df.sources.explode().unique()

array([None, 'twitter', 'tweet', 'instagram'], dtype=object)

In [219]:
df[['filename', 'sources']].dropna()

Unnamed: 0,filename,sources
3,IMG_3483.PNG,[twitter]
4,IMG_2762.PNG,[tweet]
15,IMG_3485.PNG,[twitter]
25,IMG_3608.PNG,[twitter]
27,IMG_3609.PNG,"[tweet, twitter]"
31,IMG_3597.PNG,[tweet]
51,IMG_3602.PNG,"[tweet, twitter]"
53,IMG_2910.PNG,[tweet]
56,IMG_3607.PNG,"[tweet, twitter]"
59,IMG_3604.PNG,[twitter]


So, not super effective yet. But solid starting infrastructure.

### add Spanish capability

In [220]:
custom_config = r'-l spa+eng --psm 6'
pytesseract.image_to_string(img, config=custom_config)

*Note*: Unfortunately tesseract does not have a feature to detect language of the text in an image automatically. An alternative solution is provided by another python module called langdetect which can be installed via pip.
```
$ pip install langdetect
```
ex:

```
custom_config = r'-l eng+por --psm 6'
txt = pytesseract.image_to_string(img, config=custom_config)

from langdetect import detect_langs
detect_langs(txt)
```
This should output a list of languages in the text and their probabilities. The language codes used by langdetect follow ISO 639-1 codes.


## some image recognition should be useful here
for things like RTR screenshots, maybe lulureddit/reddit source generally, audio progress bar, 

# screenshotting activity by month
- counts
- but also eventually themes?