# docshots

Cropping pictures inserted in a microsoft word document enables users to hide parts of a picture that they do not want to display. The problem is that Office’s cropping tool only modifies how the cropped image is displayed in the body of the document. The original picture remains intact. Cropped portions of the image are not completely removed from the document and can be seen by others if the file is uploaded to the internet. Data leakage can occur if there is sensitive data in the trimmed areas.

Docshots searches google for documents (docx) by query, downloads them and checks for images where cropping has occured.

This solution uses goog.io, They have free and commercial packages available.

It is advised that you run the notebook in a sandbox or vm as it does involve downloading documents unchecked from the internet.

In [18]:
import docx
import docx2txt
import requests
import googlesearch
import logging
import urllib.request
import xml.dom.minidom                                        
from collections import OrderedDict
from time import sleep
import random
import tldextract
import urllib
import wget
from dotenv import load_dotenv
import datetime
import os


In [19]:
load_dotenv() #loads variables from .env

True

In [20]:
key = os.getenv('GOOG') #goog.io api key

In [21]:


# set the api key in headers
headers = {
    "apikey": key
}



In [22]:
def googer(term, typ):
    #uses goog.io api to search google
    query = {
    "q": term,
    "hl": "en",
    "num": 70
    }
    url = f"https://api.goog.io/v1/search/" + urllib.parse.urlencode(query)
    resp = requests.get(url, headers=headers)
    results = resp.json()
    temp_urls = []
    for x in results['results']:
        temp_urls.append([x['link'], typ])
    #returns list of lists containing [url, file extension]
    return temp_urls

In [23]:
def get_urls(searchterm):
    final_list = []
    url_list = []
    
    #can add other extensions here
    #url_list.append(googer(searchterm + " filetype:docm", 'docm'))
    #url_list.append(googer(searchterm + " filetype:doc", 'doc'))
    url_list.append(googer(searchterm + " filetype:docx", 'docx'))
    #print(url_list)
    for url in [item for sublist in url_list for item in sublist]:
        try:   
            r = requests.get(url[0])
            sc = r.status_code
            if sc == 200:
                final_list.append(url)
        except:
            pass
        

    #print(searchterm + ' number of docs = ' + str(len(final_list)))
    return final_list
        

In [24]:
def filenamer(url_link, doc_name):
    #where documents are autodownloded, a file extension is added
    name = ''
    if url_link.endswith(('docm', 'docx')):
        name = doc_name
    else:
        name = doc_name + '.docx'
    return name
    

In [25]:
def save_link(url_link, doc_name):
    #downloads file
    if url_link.endswith(('docm', 'docx')):
        urllib.request.urlretrieve(url_link, doc_name)
    else:
        wget.download(url_link, doc_name + '.docx')

In [26]:
def extension_check(doc, typ):
    #where documents are autodownloded, a file extension is checked
    name = ''
    if str(doc).endswith(('.docx', '.docm')):
        name = doc
    else:
        name = doc + '.' + typ
    return name

In [27]:
def prioritiser(urls):
    #for each url, the document will have the images extracted and the size of the trimmed area measured
    #error prone as some formats aren't processed  
    priorities = {}
    for u in urls:
        
        url = u[0] #url
        typ = u[1] #file extension
        
        if os.path.isdir('img_folder/' + tldextract.extract(u[0]).domain) is False:
            os.mkdir('img_folder/' + tldextract.extract(u[0]).domain)

            
            
        
            save_link(url, url.split('/')[-1]) #saves docx to folder
            
        
        try:
            #saves images into domain labeled folder
            docx2txt.process(filenamer(url, url.split('/')[-1]), 'img_folder/' + tldextract.extract(u[0]).domain + '/')



            # Open docx document
            doc = docx.Document(url.split('/')[-1])


            # Save all 'rId:filenames' relationships in an dictionary named rels
            rels = {}

            for r in doc.part.rels.values():

                if isinstance(r._target, docx.parts.image.ImagePart):
                    #print(docx.parts.image.ImagePart)
                    rels[r.rId] = os.path.basename(r._target.partname)


            # Then process your text
            for paragraph in doc.paragraphs:
                # If you find an image
                if 'Graphic' in paragraph._p.xml:
                    # Get the rId of the image
                    for rId in rels:
                        if rId in paragraph._p.xml:
                            #print(paragraph._p.xml)

                            dom = xml.dom.minidom.parseString(paragraph._p.xml)

                            if dom.getElementsByTagName("a:srcRect") != []:
                                #print('srcRect exists')
                                nodes = dom.getElementsByTagName("a:srcRect")
                                #print(paragraph._p.xml)

                                for node in nodes:

                                    shape_dict = dict(node.attributes.items())

                                    try:
                                        width_change = float(shape_dict['r'])/100000 + float(shape_dict['l'])/100000
                                        height_change = float(shape_dict['t'])/100000 + float(shape_dict['b'])/100000
                                        pct_change = (width_change*height_change) * float(100)

                                        priorities['img_folder/' + tldextract.extract(u[0]).domain + '/' + rels[rId]] = int(pct_change)
                                    except:
                                        pass

                            
                else:
                    continue
                    
        except Exception as e:
            print(e)
            logging.error('OTHER ERROR ' + str(e))
            


    

    # returns ordered dictionary of images and a % cropped area
    return OrderedDict(sorted(priorities.items(), key=lambda t: t[0]))

In [28]:
def docshots(term):
    return prioritiser(get_urls(term))

In [None]:
#search for documents hosed on domains that have a hackerone bug bounty progamme 

'''
def bug_bounty_domains():
    jsn = requests.get('https://raw.githubusercontent.com/disclose/diodb/master/program-list.json').json()
    domainlist = []
    for j in jsn:
        parsed = tldextract.extract(j['policy_url'])
        domainlist.append('.'.join([parsed.domain, parsed.suffix]))
    return domainlist

bug_list = bug_bounty_domains()
random.shuffle(bug_list)

for bug in bug_list:
    try:
        print(bug + ' ' + str(docshots('site:' + bug)))
    except Exception as e:
        print(e)
        logging.error('HTTP ERROR ' + str(e))
'''



In [None]:
#search for documents hosed on .gov.[country] domains  

'''
def tld_getter():
    tld_list = []
    with requests.Session() as s:
        download = s.get('https://gist.githubusercontent.com/derlin/421d2bb55018a1538271227ff6b1299d/raw/3a131d47ca322a1d001f1f79333d924672194f36/country-codes-tlds.csv')

        decoded_content = download.content.decode('utf-8')

        cr = csv.reader(decoded_content.splitlines(), delimiter=',')
        
        for country in list(cr)[1:]:
            tld_list.append(country[1].lstrip())
    return tld_list

ctry_list = tld_getter()
random.shuffle(ctry_list)

for ctry in ctry_list:
    try:
        print(ctry + ' ' + str(docshots('site:' + 'gov' + ctry)))
    except Exception as e:
        print(e)
        logging.error('HTTP ERROR ' + str(e))
'''


In [29]:
#example
#scrapes documents from iresource.gov.sb subdomain
#downloads SoundWaves_MS_all_pages.docx to the pwd
#extracts images to img_folder/iresource
#prints ordered dictionary seen below
#iresource/image1.png is 40% the size of the original image in the SoundWaves_MS_all_pages.docx

print(docshots('site:' + 'iresource.gov.sb'))


OrderedDict([('img_folder/iresource/image1.png', 40), ('img_folder/iresource/image3.png', 3), ('img_folder/iresource/image4.png', 3), ('img_folder/iresource/image5.png', 3)])


In [None]:
#try with your own domain

print(docshots('site:' + 'yourdomain.tld'))