# Downloading the Images

In [None]:
import os
import requests
import shutil
import pandas as pd
import time
import random as rd
import sys

In [None]:
def read_file(filepath, sheet):
    """Load one worksheet of an Excel workbook.

    Args:
        filepath: location of the Excel file, handed straight to
            ``pd.read_excel``.
        sheet: worksheet selector, forwarded as ``sheet_name``.

    Returns:
        Whatever ``pd.read_excel`` produces for that selector (a dataframe
        for a single sheet).
    """
    return pd.read_excel(filepath, sheet_name=sheet)

In [None]:
def clean_df(df):
    """Normalise the raw job sheet in place and hand it back.

    The seven columns are renamed positionally, every row whose job code is
    not a plain Python ``int`` is discarded, and the frame is re-indexed by
    ``job_code`` in ascending order.  The frame passed in is itself modified
    and is also the return value.
    """
    # positional rename: the sheet must have exactly these seven columns
    df.columns = ['job_code', 'functional_area', 'job_title', 'job_description',
                  'remark', 'start_date', 'expiry_date']

    # rows whose job code is anything other than an int are treated as faulty
    code_types = df['job_code'].map(type)
    faulty_rows = df.index[(code_types != int).to_numpy()]
    df.drop(faulty_rows, inplace=True)

    # job codes become the row labels, sorted ascending
    df.set_index('job_code', inplace=True)
    df.sort_index(axis=0, inplace=True)
    return df

In [None]:
def find_between(s, first, last):
    """Return the text of *s* that lies between two markers.

    The search for *last* begins right after the first occurrence of
    *first*.  If either marker cannot be found, an empty string is returned.
    """
    opener_at = s.find(first)
    if opener_at < 0:
        return ""

    begin = opener_at + len(first)
    stop = s.find(last, begin)
    if stop < 0:
        return ""

    return s[begin:stop]

In [None]:
def gen_url(string):
    """Turn an extracted logo fragment into a topjobs logo url.

    A fragment containing the url-encoded slash ``%2F`` is split on it and
    its first two pieces are placed into the logo url template; any further
    pieces are ignored.  A fragment without ``%2F`` is returned unchanged.
    """
    if '%2F' not in string:
        return string

    pieces = string.split('%2F')
    return "http://topjobs.lk/logo/{}/{}".format(pieces[0], pieces[1])

In [None]:
def create_url_col(df):
    """Add ``image_string`` and ``image_url`` columns to *df* in place.

    ``image_string`` is the part of each ``remark`` (converted with ``str``)
    found between ``Flogo%2F`` and ``%22``; ``image_url`` is that fragment
    passed through ``gen_url``.  Nothing is returned.
    """
    def _logo_fragment(remark):
        # the logo path sits between these two url-encoded markers
        return find_between(str(remark), "Flogo%2F", "%22")

    # raw fragment first, then the url derived from it
    df['image_string'] = df['remark'].apply(_logo_fragment)
    df['image_url'] = df['image_string'].apply(gen_url)

In [None]:
def get_images(df, start=0, end=None, filepath=".", delay=(0.2, 0.4), timeout=30):
    """Download the image of every job in ``df.index[start:end]``.

    For each job code the ``image_string`` / ``image_url`` columns decide
    what happens:

    * no ``%2F`` in ``image_string`` -> no url could be built; the job code
      is appended to ``no_url_codes.txt``.
    * ``+`` in ``image_url`` -> skipped (these produced OS errors); the job
      code is appended to ``os_error_codes.txt``.
    * otherwise the url is fetched and saved as ``<job_code>.<extension>``,
      the extension being the text after the last ``.`` of the url.  If the
      request fails or returns an HTTP error status, no image file is
      written and the job code is appended to ``download_error_codes.txt``.

    Args:
        df: dataframe indexed by job code with ``image_string`` and
            ``image_url`` columns.
        start: first position (not job code) in ``df.index`` to process.
        end: position to stop before; ``None`` means up to the last row.
        filepath: existing directory receiving the images and the log files.
        delay: ``(low, high)`` bounds in seconds of the random pause made
            after each HTTP request.
        timeout: seconds to wait for the server before giving up on a
            request.
    """
    os_error_path = os.path.join(filepath, "os_error_codes.txt")
    no_url_path = os.path.join(filepath, "no_url_codes.txt")
    failed_path = os.path.join(filepath, "download_error_codes.txt")

    # context managers guarantee the logs are flushed and closed even if a
    # download raises half way through the run
    with open(os_error_path, "a+") as os_error, \
            open(no_url_path, "a+") as no_url, \
            open(failed_path, "a+") as failed:
        for job_code in df.index[start:end]:
            image_string = df.loc[job_code, 'image_string']
            image_url = df.loc[job_code, 'image_url']

            # a url won't be created in the absence of this pattern
            if '%2F' not in image_string:
                no_url.write(str(job_code) + "\n")
                continue

            # '+' encoded images generated os type error
            if '+' in image_url:
                os_error.write(str(job_code) + "\n")
                continue

            extension = image_url.split('.')[-1]
            image_path = os.path.join(filepath, "{}.{}".format(job_code, extension))

            try:
                with requests.get(image_url, stream=True, timeout=timeout) as response:
                    # do not save an error page (404, 500, ...) as an image
                    response.raise_for_status()
                    response.raw.decode_content = True
                    with open(image_path, 'wb') as image_file:
                        shutil.copyfileobj(response.raw, image_file)
            except requests.RequestException:
                # record the failure and carry on with the remaining jobs
                failed.write(str(job_code) + "\n")

            # sleep between requests (only when a request was actually made)
            time.sleep(rd.uniform(*delay))

In [None]:
def main(location="", sheet="", start=0, end=None):
    """Run the whole pipeline: read, clean, build urls, download images.

    Args:
        location: path of the Excel workbook.  The empty default is a
            placeholder that has to be replaced (e.g. with ``sys.argv[1]``).
        sheet: worksheet to read.  The empty default is likewise a
            placeholder (e.g. ``sys.argv[2]``).
        start: position in the cleaned dataframe's index at which
            downloading begins (e.g. ``int(sys.argv[3])``).
        end: position at which downloading stops; ``None`` processes the
            rows up to the end (e.g. ``int(sys.argv[4])``).
    """
    # read the file
    df_tj = read_file(location, sheet)

    # clean the dataframe
    df_tj = clean_df(df_tj)

    # generate the urls
    create_url_col(df_tj)

    # download the images; the window is forwarded so that a partial or
    # resumed run only touches the requested rows
    get_images(df_tj, start, end)
    
    

In [None]:
# run the pipeline only when this code is executed directly, not when imported
if __name__ == '__main__':
    main()


---

# Interruption Handling

# Index labels of the dataframe as a plain list, so a label can be mapped
# back to its position.
# NOTE(review): `dft` is not defined in the cells above -- presumably the
# cleaned dataframe (`df_tj` inside main); confirm.
ab = list(dft.index)

# Position of the label 660266 within that list; presumably the job code at
# which a download run was interrupted, so the position can be reused as the
# `start` argument of get_images -- TODO confirm.
ab.index(660266)