In [15]:
import os
import requests
import shutil
import pandas as pd
import time
import random as rd

In [16]:
def read_file(filepath, sheet):
    """Load one worksheet of an Excel workbook.

    Parameters
    ----------
    filepath : path of the workbook, handed straight to ``pd.read_excel``.
    sheet : worksheet selector, forwarded as ``sheet_name``.

    Returns
    -------
    Whatever ``pd.read_excel`` produces for that sheet.
    """
    return pd.read_excel(filepath, sheet_name=sheet)

In [17]:
def clean_df(df):
    """Normalise the raw job sheet in place and hand the same frame back.

    Steps, all applied to ``df`` itself:
      1. rename the columns positionally (the frame must have exactly seven),
      2. drop every row whose ``job_code`` value is not a plain ``int``,
      3. make ``job_code`` the index and sort the rows by it.
    """
    # positional rename of the seven sheet columns
    df.columns = [
        'job_code',
        'functional_area',
        'job_title',
        'job_description',
        'remark',
        'start_date',
        'expiry_date',
    ]

    # a job code of any type other than int marks the row as faulty
    faulty = df['job_code'].map(type) != int
    df.drop(df.loc[faulty].index, inplace=True)

    # job codes become the (ascending) index
    df.set_index('job_code', inplace=True)
    df.sort_index(axis=0, inplace=True)

    return df

In [18]:
def find_between(s, first, last):
    """Return the part of ``s`` enclosed by ``first`` and the next ``last``.

    The search for ``last`` begins right after the first occurrence of
    ``first``. An empty string is returned when either marker is missing.
    """
    marker_pos = s.find(first)
    if marker_pos == -1:
        return ""

    begin = marker_pos + len(first)
    stop = s.find(last, begin)
    if stop == -1:
        return ""

    return s[begin:stop]

In [19]:
def create_url(df):
    """Add ``image_string`` and ``image_url`` columns to ``df`` in place.

    ``image_string`` is the text found in ``remark`` between ``Flogo%2F`` and
    the following ``%22``. ``image_url`` is built from the first two
    ``%2F``-separated parts of that text; when the text holds no ``%2F`` it is
    copied unchanged. Nothing is returned.
    """

    def _image_string(remark):
        # the remark is stringified first, so non-string cells are handled too
        return find_between(str(remark), "Flogo%2F", "%22")

    def _image_url(image_string):
        if '%2F' not in image_string:
            return image_string
        parts = str(image_string).split('%2F')
        return "http://topjobs.lk/logo/{}/{}".format(parts[0], parts[1])

    # extract the encoded logo path
    df['image_string'] = df['remark'].apply(_image_string)

    # turn it into a downloadable address
    df['image_url'] = df['image_string'].apply(_image_url)

In [20]:
def get_images(df, start=110000, out_dir="E:/future_of_work/sample_images_8th_sep", timeout=30):
    """Download the logo of each job posting and save it as ``<job_code>.<ext>``.

    Parameters
    ----------
    df : frame indexed by job code with ``image_string`` and ``image_url``
        columns.
    start : positional offset into ``df.index`` at which to begin; lets an
        interrupted run be resumed without editing the function.
    out_dir : directory that receives the images and the two log files.
    timeout : seconds passed to ``requests.get`` so a stalled connection
        raises instead of blocking forever.

    Behaviour per row:
      * ``image_string`` without ``%2F``: the job code is appended to
        ``no_url_codes_8th.txt``;
      * URL containing ``+``: not requested, the job code is appended to
        ``os_error_codes_8th.txt``;
      * otherwise the response body is streamed to disk. The HTTP status is
        not inspected, so error responses are written out like images.
    A random pause of 0.2-0.4 s follows every row.
    """
    os_error_log = "{}/os_error_codes_8th.txt".format(out_dir)
    no_url_log = "{}/no_url_codes_8th.txt".format(out_dir)

    # context managers flush and close both logs even if a request fails midway
    with open(os_error_log, "a+") as file1, open(no_url_log, "a+") as file2:
        for job_code in df.index[start:]:
            if '%2F' in df.loc[job_code, 'image_string']:
                image_url = df.loc[job_code, 'image_url']
                if '+' not in image_url:
                    # the file extension is whatever follows the last dot of the URL
                    target = "{}/{}.{}".format(out_dir, str(job_code), image_url.split('.')[-1])
                    # closing the response releases the streamed connection
                    with requests.get(image_url, stream=True, timeout=timeout) as response:
                        response.raw.decode_content = True
                        with open(target, 'wb') as image_file:
                            shutil.copyfileobj(response.raw, image_file)
                else:
                    file1.write(str(job_code) + "\n")
            else:
                file2.write(str(job_code) + "\n")

            # throttle the requests
            time.sleep(rd.uniform(0.2, 0.4))

In [13]:
def main():
    """Read the job sheet, clean it and attach the image URL columns."""
    workbook_path = "E:/future_of_work/data/Data2018JanTo2019June.xlsx"
    worksheet = "Data2018JanTo2019June"

    # load the sheet and normalise it
    jobs = clean_df(read_file(workbook_path, worksheet))

    # adds the image_string / image_url columns to the frame
    create_url(jobs)

    # the download step is currently switched off
    #get_images(jobs)
    
    

In [29]:
# run the pipeline only when this code is executed directly, not when imported
if __name__ == '__main__':
    main()


First interruption

index (74802)

job_code (567457)

---

In [9]:
# ab = list(df_tj.index)
# ab.index(592591)
# ab.index(691762)
# dft.index[dft.index==691762]

### 404 errors - images smaller than 5 KB

In [1]:
import os
from os.path import isfile, join
from pathlib import Path

In [7]:
mypath = "E:/future_of_work/sample_images_8th_sep"
# Pick the image files by name ("<job_code>.<ext>") rather than by position:
# os.listdir returns entries in arbitrary order, so the .txt log files in this
# directory cannot be skipped reliably by slicing off the tail of the listing.
all_files = [
    f for f in os.listdir(mypath)
    if isfile(join(mypath, f)) and f.split('.')[0].isdecimal()
]
# job code of every image, parsed from the file name
image_jobcodes = [int(f.split('.')[0]) for f in all_files]

In [8]:
# number of image files found in the download directory
len(image_jobcodes)

73481

In [10]:
error404_jobcodes = []

# Files below 5 KB are treated as failed (404) downloads: their job code is
# logged and the file is deleted. The with-block closes the log even if a
# file operation raises part-way through.
with open("E:/future_of_work/sample_images_8th_sep/error404_codes_8th.txt", "a+") as error404_codes:
    for f in all_files:
        file_path = f"{mypath}/{f}"
        filesize = os.path.getsize(file_path) / 1024  # size in KB
        if filesize < 5:
            code_str = f.split('.')[0]
            error404_jobcodes.append(int(code_str))
            error404_codes.write(code_str + "\n")
            os.remove(file_path)

In [11]:
# number of files that were below the 5 KB threshold and removed
len(error404_jobcodes)

58311

In [21]:
# same workbook and worksheet that main() uses
location = "E:/future_of_work/data/Data2018JanTo2019June.xlsx"
sheet1 = "Data2018JanTo2019June"

# read and clean in one step, then add the image_string / image_url columns;
# df_ec stays at module level for the cells that follow
df_ec = clean_df(read_file(location, sheet1))
create_url(df_ec)

In [28]:
# sum = 0
# for index in error404_jobcodes[-20:]:
#     if '%' not in df_ec.loc[index, "image_url"]:
#         #sum += 1
#         print(df_ec.loc[index, "image_url"])

### Check whether images are corrupted

In [29]:
from PIL import Image


In [48]:
mypath = "E:/future_of_work/sample_images_8th_sep"
# Pick the image files by name ("<job_code>.<ext>") rather than by position:
# os.listdir returns entries in arbitrary order, so the .txt log files in this
# directory cannot be skipped reliably by slicing off the tail of the listing.
all_files = [
    f for f in os.listdir(mypath)
    if isfile(join(mypath, f)) and f.split('.')[0].isdecimal()
]
#image_jobcodes = [int(f.split('.')[0]) for f in all_files]



In [49]:
# file names of images that cannot be opened or fail Pillow's integrity check
broken_images = []
for file in all_files:
    try:
        # the with-block closes the image whether or not verify() succeeds, and
        # nothing is closed when Image.open itself fails
        with Image.open(f"E:/future_of_work/sample_images_8th_sep/{file}") as v_image:
            v_image.verify()
    except Exception:
        # Exception (not a bare except) so that Ctrl-C still stops the loop
        broken_images.append(file)

In [53]:
# peek at the first few file names that failed verification
print(broken_images[:5])

['482784.png', '482857.jpg', '492175.jpg', '492184.jpg', '492189.PNG']


In [54]:
# Download every image that failed verification a second time, into the
# "broken" sub-directory, so the fresh copies can be inspected separately.
os.makedirs("E:/future_of_work/sample_images_8th_sep/broken", exist_ok=True)

for image_number in broken_images:
    job_code = int(image_number.split('.')[0])
    image_url = df_ec.loc[job_code, 'image_url']
    # the file extension is whatever follows the last dot of the URL
    target = "E:/future_of_work/sample_images_8th_sep/broken/{}.{}".format(str(job_code), image_url.split('.')[-1])
    # the timeout stops a stalled connection from blocking the loop forever;
    # the with-blocks release the connection and the file even on errors
    with requests.get(image_url, stream=True, timeout=30) as response:
        response.raw.decode_content = True
        with open(target, 'wb') as image_file:
            shutil.copyfileobj(response.raw, image_file)
    # throttle the requests
    time.sleep(rd.uniform(0.8, 1.2))

In [75]:
#df_ec.loc[job_code, 'image_url']

### 404 errors (repeated after the previous step) - images smaller than 5 KB

In [76]:
# from here on mypath points at the re-downloaded copies
mypath = "E:/future_of_work/sample_images_8th_sep/broken"
# NOTE(review): [:-31] drops the last 31 entries of the listing. Why 31 is not
# evident from this notebook, and os.listdir does not guarantee any order, so
# which files get excluded may vary - confirm the intent before re-running.
all_files = [f for f in os.listdir(mypath) if isfile(join(mypath, f))][:-31]
# job code of every remaining file, parsed from the file name
image_jobcodes = [int(f.split('.')[0]) for f in all_files]

In [77]:
# number of re-downloaded files selected above
len(image_jobcodes)

349

In [78]:
error404_jobcodes_1 = []

# NOTE(review): unlike the first pass there is no size check here - every file
# in all_files is logged and deleted.
# The with-block closes the log even if one of the removals raises.
with open("E:/future_of_work/sample_images_8th_sep/error404_codes_1_8th.txt", "a+") as error404_codes_1:
    for f in all_files:
        code_str = f.split('.')[0]
        error404_jobcodes_1.append(int(code_str))
        error404_codes_1.write(code_str + "\n")
        # delete the re-downloaded copy and the file of the same name one level up
        os.remove(f"{mypath}/{f}")
        os.remove(f"E:/future_of_work/sample_images_8th_sep/{f}")

### Recording missing files in Excel


##### Loading files

In [80]:
# Open the three job-code logs for reading; the handles are consumed and
# closed by the next cell.
os_errors = open("E:/future_of_work/sample_images_8th_sep/os_error_codes_8th.txt","r")
no_url = open("E:/future_of_work/sample_images_8th_sep/no_url_codes_8th.txt", "r")
error_404 = open("E:/future_of_work/sample_images_8th_sep/error404_codes_8th.txt", "r")

In [81]:
# Each log holds one integer job code per line; blank lines are skipped.
# try/finally guarantees the three handles are closed even when a line
# cannot be parsed.
try:
    list_os_errors = [int(line.strip()) for line in os_errors if line.strip()]
    list_no_url = [int(line.strip()) for line in no_url if line.strip()]
    list_error_404 = [int(line.strip()) for line in error_404 if line.strip()]
finally:
    os_errors.close()
    no_url.close()
    error_404.close()



In [82]:
# number of job codes in each log: '+' in URL, no URL, file under 5 KB
print(len(list_os_errors))
print(len(list_no_url))
print(len(list_error_404))

70467
21771
58661


In [83]:
# total number of logged job codes across the three logs
print(len(list_os_errors) + len(list_no_url) + len(list_error_404))

150899


### DEFZZZ pattern request topjobs

In [109]:
# all logged job codes in ascending order (duplicates, if any, are kept)
merged_list = sorted(list_os_errors + list_no_url + list_error_404)

In [108]:
# should equal the sum of the three log lengths
len(merged_list)

150899

In [110]:
# rows of the cleaned sheet whose job code appears in any of the three logs
pf = df_ec[df_ec.index.isin(merged_list)]

In [111]:
# row count matching len(merged_list) means every logged code was found exactly once
pf.shape

(150899, 8)

In [112]:
# inspect the last rows of the selection
pf.tail()

Unnamed: 0_level_0,functional_area,job_title,job_description,remark,start_date,expiry_date,image_string,image_url
job_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
36569,Eng-Mech/Auto/Elec,Executive+-+Engineering+%28Mechanical%29+,%C2%B7+++++++++4-6+Years+of+with+adequate+know...,%3Cp%3E%3Cstrong%3EPerfetti+Van+Melle+Lanka+%2...,2019-02-12 00:00:00.000,2019-02-22 00:00:00.000,,
68518,Eng-Mech/Auto/Elec,Post+of++Technician+-+Electrical%2FMechanical+++,+Age+below+35+%28Male%29%0D%0A+Successful+comp...,%3Cp+style%3D%22text-align%3Acenter%22%3E%3Cst...,2018-06-05 00:00:00.000,2018-06-15 00:00:00.000,,


In [105]:
# does the first selected row carry the DEFZZZ marker in its image string?
'DEFZZZ' in pf['image_string'].iloc[0]

True

In [121]:
# keep only the rows whose image string contains "DEFZZZ"
dfzz = pf[pf['image_string'].str.contains("DEFZZZ")]

In [122]:
# number of logged rows that carry the DEFZZZ marker
dfzz.shape

(66866, 8)

In [123]:
# sample of the DEFZZZ rows with their generated URLs
dfzz.head(2)

Unnamed: 0_level_0,functional_area,job_title,job_description,remark,start_date,expiry_date,image_string,image_url
job_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
489913,IT-Sware/DB/QA/Web/Graphics/GIS,Business+Intelligence+%28BI%29+Consultant+-+Ba...,Please+refer+the+advertisement+,%3Cp+style%3D%22text-align%3Acenter%22%3E%3Cim...,2018-01-03 00:00:00.000,2018-01-08 00:00:00.000,DEFZZZ%2F4603csys.jpg,http://topjobs.lk/logo/DEFZZZ/4603csys.jpg
490576,Accounting/Auditing/Finance,Internship+-+Finance*First+Capital+Holdings+PLC,Please+refer+the+advertisement+,%3Cp+style%3D%22text-align%3A+center%3B%22%3E%...,2018-01-02 00:00:00.000,2018-01-03 00:00:00.000,DEFZZZ%2F8092cDunimas.jpg,http://topjobs.lk/logo/DEFZZZ/8092cDunimas.jpg


In [115]:
# NOTE(review): this repeats the shape check above; its execution count (115)
# is lower than that of the cell defining dfzz (121), so it was run out of order.
dfzz.shape

(66866, 8)

In [124]:
# keep only the image_string column (the job_code index is retained)
dfzz = dfzz[['image_string']]

In [125]:
# one column is left after the projection
dfzz.shape

(66866, 1)

In [126]:
# show the kernel's working directory; `pwd` is not defined in this notebook,
# so it presumably resolves to IPython's %pwd magic
pwd()

'E:\\future_of_work\\code\\eda'

In [127]:
# relative path: the workbook is written into the kernel's current working directory
dfzz.to_excel("defzzz_type_errors.xlsx")