### 1 - Scraping receipts from the PemPem website and saving them as images

#### Add your PemPem username and password to the `payload` dictionary in `scraping` before running

In [30]:
import requests
from lxml import html
import urllib.request
import os.path
from bs4 import BeautifulSoup
from datetime import datetime

In [31]:
def scraping(ID):
    '''Log in to the PemPem backend and fetch the receipt page for the given ID.

    ID is the receipt ID as a string; it is appended to the receipt URL.
    Returns the page text, or None when the page contains 'ErrorException'.
    Raises IndexError when the login page has no '_token' input field.
    '''
    login_url = 'https://backend.pempemapp.org/login'
    payload = {'username': '', 'password': ''} # USERNAME and PASSWORD

    # The context manager closes the session (and its pooled connections)
    # when we are done, instead of leaking one session per call.
    with requests.session() as session_requests:
        # login: the form token must be read from the login page first
        result = session_requests.get(login_url)
        tree = html.fromstring(result.text)
        tokens = tree.xpath("//input[@name='_token']/@value")
        if not tokens:
            raise IndexError('no _token field found on login page ' + login_url)
        # Take the first match in document order so the choice is deterministic
        payload['_token'] = tokens[0]
        session_requests.post(login_url, data=payload, headers=dict(referer=login_url))

        # scraping
        url = 'https://backend.pempemapp.org/receipt/' + ID
        res = session_requests.get(url, headers=dict(referer=url))
        page_text = res.text

    # The backend reports an unknown receipt with an error page
    if 'ErrorException' in page_text:
        return None
    return page_text

In [32]:
def parsing(scraped_text):
    '''Extract the receipt fields from a scraped receipt page as a dictionary.

    Each row of the 'table table-striped' table contributes one entry, keyed
    by the text of its <th> cell, with the text of its <td> cell as value.
    For the 'Price Receipt' and 'Weight Receipt' rows the value is the href
    of the link inside the cell, and 'receipt_type' records which of the two
    rows carried a link; rows of these two kinds without a link are ignored.

    Returns an empty dictionary when the table is not present. Rows lacking
    a <th> or a <td> cell are skipped rather than aborting the whole parse.
    '''
    dic = {}
    # Name the parser explicitly: letting BeautifulSoup guess emits a warning
    # and can give different trees on different machines. lxml is already a
    # dependency of this file.
    parsed_html = BeautifulSoup(scraped_text, 'lxml')
    table = parsed_html.find('table', attrs={'class': 'table table-striped'})
    if table is None:
        return dic

    for tr in table.find_all('tr'):
        header = tr.find('th')
        cell = tr.find('td')
        if header is None or cell is None:
            continue
        th = header.text
        if th == 'Price Receipt' or th == 'Weight Receipt':
            link = cell.find('a')
            # Only record the receipt type when there is a usable image URL
            if link is not None and link.get('href'):
                dic['receipt_type'] = th
                dic[th] = link['href']
        else:
            dic[th] = cell.text
    return dic

In [33]:
def save_image(dic):
    '''Download the receipt image described by dic, unless it is already saved.

    dic is the dictionary built by parsing a receipt page. It must hold the
    keys 'Id', 'User Name', 'Mill Name' and 'Created At'; when 'receipt_type'
    is present, dic[dic['receipt_type']] is the image URL.

    The image is stored under
    ../Images/<receipt_type>/<mill_name>/ with the file name format
    ID_middleManName_millName_dateUpload_receiptType.jpg
    where <receipt_type> is price_receipt or weight_receipt. Missing folders
    are created. If dic has no 'receipt_type' (no image link was found),
    a message is printed and nothing is downloaded.
    '''
    receipt_type = dic.get('receipt_type') # 'Price Receipt' or 'Weight Receipt'
    if receipt_type is None:
        # A receipt without an image link must not abort a whole batch run
        print('No receipt image link for ID ' + dic.get('Id', '')[:-1] + ', nothing saved')
        return

    image_url = dic[receipt_type]
    # Drops the last character of the 'Id' text -- presumably trailing
    # whitespace from the table cell; TODO confirm against the page markup
    ID = dic['Id'][:-1]
    middle_man_name = dic['User Name'].replace('/','').replace(' ','').lower()
    mill_name = dic['Mill Name'].replace('/','').replace(' ','').lower()
    if mill_name=='':
        mill_name = 'no_mill_name'
    date = datetime.strptime(dic['Created At'], '%d %b %Y %I:%M %p').strftime('%y%m%d_%H%M')
    receipt_type_name = receipt_type.replace(' ','_').lower()

    # Create ../Images/<receipt_type>/<mill_name> in one step
    folder_name = '../Images/'+receipt_type_name+'/'+mill_name
    os.makedirs(folder_name, exist_ok=True)

    file_name = folder_name+'/'+ID+'_'+middle_man_name+'_'+mill_name+'_'+date+'_'+receipt_type_name+'.jpg'
    if not os.path.exists(file_name):
        urllib.request.urlretrieve(image_url, file_name)
        print('Saved new image file for ID ' + ID+'/mill '+mill_name)
    else:
        print('Image for ID '+ID+'/mill '+mill_name+' is already saved')

In [34]:
def run(startId, lastId):
    '''Scrape every receipt ID from startId to lastId (both inclusive) and
    save the image of each receipt that is found.'''
    for receipt_id in range(startId, lastId + 1):
        print('Processing ID', receipt_id)
        id_text = str(receipt_id)

        # A page that could not be fetched is treated like an empty parse
        page = scraping(id_text)
        record = parsing(page) if page else None

        if record:
            save_image(record)
        else:
            print('ID ' + id_text + ' not found')

In [1]:
# Range of receipt IDs to process (both ends are included by run)
startId, lastId = 0, 5732
run(startId, lastId)