## PPTSHOTS


Finding sensitive information in the trimmed parts of cropped images 


Cropping pictures inserted in a powerpoint presentation enables users to hide parts of a picture that they do not want to display. The problem is that Office’s cropping tool only modifies how the cropped image is displayed in the body of the document. The original picture remains intact. Cropped portions of the image are not completely removed from the document and can be seen by others if the file is uploaded to the internet. Data leakage can occur if there is sensitive data in the trimmed areas.

PPTSHOTS searches google for presentations by query, downloads them and checks for images where cropping has occured.

This solution uses goog.io, They have free and commercial packages available. Enter your key in the .env file

It is advised that you run the notebook in a sandbox or vm as it does involve downloading untrusted documents from the internet.


Clone the repository

`git clone https://github.com/dfaram7/pptshots.git`


Install the requirements

`pip install -r requirements.txt`

Run the notebook!

`jupyter notebook`

If you dont want to read all the code you can just SHIFT+ENTER down to th eexclamation marks and enter your search term

In [20]:
from pptx import Presentation
import requests
import urllib.request
import xml.dom.minidom                                        
from time import sleep
import random
import tldextract
import urllib
import wget
from dotenv import load_dotenv
import os
import re
from lxml import etree
import zipfile
import csv

In [5]:
load_dotenv() #loads variables from .env
key = os.getenv('GOOG') #loads api key - get a free account at goog.io

In [6]:
# set the api key in headers
headers = {
    "apikey": key
}


In [21]:
#funtion that processes a pptx file and saves the images in img_folder
def process(pptx, img_dir=None):
    
    # unzip the docx in memory
    zipf = zipfile.ZipFile(re.sub(r'[\\/*?:"<>|]',"", pptx))
    filelist = zipf.namelist()

    if img_dir is not None:
        # extract images
        for fname in filelist:
            _, extension = os.path.splitext(fname)
            if extension in [".jpg", ".jpeg", ".png", ".bmp"]:
                dst_fname = os.path.join(img_dir, os.path.basename(fname))
                with open(dst_fname, "wb") as dst_f:
                    dst_f.write(zipf.read(fname))

    zipf.close()


In [22]:
#function that evaluates a pptx file for any significant area that are hidden in the "trimmmed" section of a cropped image
def evaluator(ppt):
    prs = Presentation(ppt)
    percentages = []
    for x in prs.slides:

        for y in x.element.xpath('//a:srcRect'):
            l = y.l #distance from left edge as %
            b = y.b #distance from bottom edge as %
            r = y.r #distance from right edge as %
            t= y.t #distance from top edge as %
            if ((l+r)*(t+b)) != 0.0:
                percentages.append((l+r)*(t+b))
    return percentages


In [23]:
#uses goog.io api to search google and returns list of urls
def googer(term, typ):
    
    query = {
    "q": term,
    "hl": "en",
    "num": 70
    }
    url = f"https://api.goog.io/v1/search/" + urllib.parse.urlencode(query)
    resp = requests.get(url, headers=headers)
    results = resp.json()
    temp_urls = []
    for x in results['results']:
        temp_urls.append([x['link'], typ])
    #returns list of lists containing [url, file extension]
    return temp_urls




In [24]:
#handles appending search term with 'filetype:pptx'
def get_urls(searchterm):
    final_list = []
    url_list = []
    url_list.append(googer(searchterm + " filetype:pptx", 'pptx'))
    for url in [item for sublist in url_list for item in sublist]:
        try:   
            r = requests.get(url[0])
            sc = r.status_code
            if sc == 200:
                final_list.append(url)
        except:
            pass
    return final_list

In [25]:
#where documents are autodownloded, a file extension is added
def filenamer(url_link, doc_name):
    name = ''
    if url_link.endswith('pptx'):
        name = doc_name
    else:
        name = re.sub(r'[\\/*?:"<>|]',"", doc_name) + '.pptx'
    return name
    

In [26]:
#downloads pptx file to working directory
def save_link(url_link, doc_name):
    if url_link.endswith('pptx'):
        try:
            urllib.request.urlretrieve(url_link, doc_name)
        except:
            test = requests.get(url_link)
            with open(re.sub(r'[\\/*?:"<>|]',"", doc_name), 'wb') as f:
                f.write(test.content)
    else:
        test = requests.get(url_link)
        with open(re.sub(r'[\\/*?:"<>|]',"", doc_name) +'.pptx', 'wb') as f:
            f.write(test.content)

In [27]:
#where documents are autodownloded, a file extension is checked
def extension_check(doc, typ):
    name = ''
    if str(doc).endswith('pptx'):
        name = doc
    else:
        name = re.sub(r'[\\/*?:"<>|]',"", doc) + '.' + typ
    
    return name

In [28]:
def average(lst):
    try:
        return sum(lst) / len(lst)
    except:
        return 0.0 

In [29]:
#for each url, the document will have the images extracted and the size of the trimmed area measured
def prioritiser(urls):
  
    priorities = []
    
    for u in urls:
        #print(u[0])
        url = u[0] #url
        typ = u[1] #file extension
        
        if os.path.isdir('img_folder/' + tldextract.extract(u[0]).domain) is False:
            os.mkdir('img_folder/' + tldextract.extract(u[0]).domain)  
        try:
            save_link(url, url.split('/')[-1]) #saves pptx to folder

            #saves images into domain labeled folder
            process(filenamer(url, url.split('/')[-1]), 'img_folder/' + tldextract.extract(u[0]).domain + '/')
            pic_list = evaluator(filenamer(url, url.split('/')[-1]))
            
            if average(pic_list) != 0.0:
                priorities.append([tldextract.extract(u[0]).domain, url, pic_list, average(pic_list)])
        except Exception as e:
            print(e)
            pass
    
    return priorities

In [30]:
#function to allow for a single search term to be enetered
def pptshots(term):
    return prioritiser(get_urls(term))

# !!!!!!! Enter search term below - maybe check out your own domain with the site:domain.com dork

In [None]:
#this can take a while for all the urls to be processed so be patient
#prints a list of lists with the [[domain, url of the pptx, [list of images in the pptx], average % of image cropped]]
#any powerpoint that has a gretaer tham >20% area cropped is maybe worth a look
#browse to img_folder/domain and have a look for anything interetsing
pptshots('your search term or domain')

Package not found at 'filedownload.ashx?moduleinstanceid=25563&dataid=41911&FileName=Investigate_%20How%20Do%20I%20Search%20for%20Relevant%20Resources_.pptx'
file 'a-primer-on-searching-the-internet-v2021.ppsx.pptx' is not a PowerPoint file, content type is 'application/vnd.openxmlformats-officedocument.presentationml.slideshow.main+xml'


In [None]:
#search for documents hosed on domains that have a hackerone bug bounty progamme 

'''
def bug_bounty_domains():
    jsn = requests.get('https://raw.githubusercontent.com/disclose/diodb/master/program-list.json').json()
    domainlist = []
    for j in jsn:
        parsed = tldextract.extract(j['policy_url'])
        domainlist.append('.'.join([parsed.domain, parsed.suffix]))
    return domainlist

bug_list = bug_bounty_domains()
random.shuffle(bug_list)

for bug in bug_list:
    try:
        for l in pptshots('site:' + bug):
            print(l)
    except Exception as e:
        print(e)
        logging.error('HTTP ERROR ' + str(e))

'''


In [None]:
  
'''
def tld_getter():
    tld_list = []
    with requests.Session() as s:
        download = s.get('https://gist.githubusercontent.com/derlin/421d2bb55018a1538271227ff6b1299d/raw/3a131d47ca322a1d001f1f79333d924672194f36/country-codes-tlds.csv')

        decoded_content = download.content.decode('utf-8')

        cr = csv.reader(decoded_content.splitlines(), delimiter=',')
        
        for country in list(cr)[1:]:
            tld_list.append(country[1].lstrip())
    return tld_list

ctry_list = tld_getter()
random.shuffle(ctry_list)

for ctry in ctry_list:
    try:
        print(ctry + ' ' + str(pptshots('site:' + 'gov' + ctry)))
    except Exception as e:
        print(e)
        logging.error('HTTP ERROR ' + str(e))
'''


In [8]:

#search across fortune 500 domains
'''

def fortune_getter():
    f_list = []
    with requests.Session() as s:
        download = s.get('https://gist.githubusercontent.com/mbejda/45db05ea50e79bc42016/raw/52d5ca99398b495e096f6eace20f5872129633e3/Fortune-1000-Company-Twitter-Accounts.csv')

        decoded_content = download.content.decode('utf-8')

        cr = csv.reader(decoded_content.splitlines(), delimiter=',')
        
        for country in list(cr)[1:]:
            f_list.append(country[0].lstrip())
    return f_list

ftn_list = fortune_getter()
random.shuffle(ftn_list)

for ftn in ftn_list:
    try:
        print(ftn + ' ' + str(pptshots('site:' + ftn)))
    except Exception as e:
        print(e)
        logging.error('HTTP ERROR ' + str(e))
'''

## Notes

It is actually pretty rare to find anything interesting, after several days only one presentation contained 'sensitive' information. In this instance an "unnamed US federal government executive branch organization" had unintentionally left some PII in a Facebook screenshot. I reported this to them and the presentation is no longer publicly facing.

Other less sensitive information included browser tabs and OS information from the screen peripheries which could be of minor value to an attacker but nothing too exciting. Interestingly, on a few occasions where screenshots had been taken with dual monitors there was an entire extra screen to examine - I didn't identify anything more valuable than a half filled in timesheet but there is potential for sizeable data to have been exposed if a spreadsheet or similar had been open.

