# HOW TO DOWNLOAD FILES AUTOMATICALLY

#### We can download files using the "requests" module. Its "get" function fetches the URL, and the "content" attribute of the response holds the file contents as bytes. We can then use the built-in "open" function to create a file on our system and write those bytes to it.

In [19]:
import requests
import pandas as pd
import numpy as np

In [17]:
url="http://www.risquesprofessionnels.ameli.fr/fileadmin/fiches_statistiques/NAFAPE_2016_C_000_2920Z_SY.pdf"
# The whole body is read through r.content below, so streaming is not needed.
# The timeout keeps the cell from hanging forever on an unresponsive server.
r = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx answer so an HTML error page is never saved as a ".pdf".
r.raise_for_status()
# 'wb' = write in binary mode: the PDF bytes are stored exactly as received.
with open('C:/Users/EL MAHDI/Desktop/blabla/18-19/F3/projet ISA/NAP_TEST3.pdf', 'wb') as f:
    f.write(r.content)


#### In the above script, "open" is called in binary write mode ('wb') to write the downloaded bytes to a local file. If we execute the script and go to the specified directory, we see our newly downloaded PDF file named "NAP_TEST3.pdf".

# Download and save multiple pdf files, according to a list of urls, using a loop

## URLs table

In [3]:
# Load the Excel sheet of APE/NAF codes into a DataFrame.
# The path is relative, so the file must sit in the current working directory.
urls_table = pd.read_excel("Liste-des-codes-APE-NAF-excel-2017.xlsx")

In [4]:
# Preview the first rows to check the column layout.
urls_table.head()

Unnamed: 0,Code APE,partie 2 url,nbcar,partie 4 url,Intitulés de la NAF,url,partie 1 url,partie 3,partie 5
0,SECTION A,A,9,,"AGRICULTURE, SYLVICULTURE ET PÊCHE",,,,
1,01.11Z,A,6,0111Z,"Culture de céréales (à l'exception du riz), de...",http://www.risquesprofessionnels.ameli.fr/file...,http://www.risquesprofessionnels.ameli.fr/file...,_000_,_SY.pdf
2,01.12Z,A,6,0112Z,Culture du riz,http://www.risquesprofessionnels.ameli.fr/file...,http://www.risquesprofessionnels.ameli.fr/file...,_000_,_SY.pdf
3,01.13Z,A,6,0113Z,"Culture de légumes, de melons, de racines et d...",http://www.risquesprofessionnels.ameli.fr/file...,http://www.risquesprofessionnels.ameli.fr/file...,_000_,_SY.pdf
4,01.14Z,A,6,0114Z,Culture de la canne à sucre,http://www.risquesprofessionnels.ameli.fr/file...,http://www.risquesprofessionnels.ameli.fr/file...,_000_,_SY.pdf


### Any missing values

In [5]:
# Select the rows that have a missing value in at least one column and show the first few.
urls_table[urls_table.isnull().any(axis=1)].head()

Unnamed: 0,Code APE,partie 2 url,nbcar,partie 4 url,Intitulés de la NAF,url,partie 1 url,partie 3,partie 5
0,SECTION A,A,9,,"AGRICULTURE, SYLVICULTURE ET PÊCHE",,,,
40,SECTION B,B,9,,INDUSTRIES EXTRACTIVES,,http://www.risquesprofessionnels.ameli.fr/file...,_000_,_SY.pdf
56,SECTION C,C,9,,INDUSTRIE MANUFACTURIÈRE,,http://www.risquesprofessionnels.ameli.fr/file...,_000_,_SY.pdf
316,SECTION D,D,9,,"PRODUCTION ET DISTRIBUTION D'ÉLECTRICITÉ, DE G...",,http://www.risquesprofessionnels.ameli.fr/file...,_000_,_SY.pdf
325,SECTION E,E,9,,PRODUCTION ET DISTRIBUTION D'EAU ; ASSAINISSEM...,,http://www.risquesprofessionnels.ameli.fr/file...,_000_,_SY.pdf


### Drop any row containing a NaN value

In [6]:
# Keep only the rows in which every column has a value
# (axis=0: drop rows; how='any': a single NaN is enough to drop the row).
urls_table=urls_table.dropna(axis=0, how='any')

In [7]:
# Number of non-null entries in the 'url' column, i.e. how many PDFs there are to download.
urls_table['url'].count()

732

In [8]:
# List the exact column names; note that ' Intitulés de la NAF' starts with a space.
urls_table.columns

Index(['Code APE', 'partie 2 url', 'nbcar', 'partie 4 url',
       ' Intitulés de la NAF', 'url', 'partie 1 url', 'partie 3', 'partie 5'],
      dtype='object')

In [9]:
# Display the full content of every column so that long URLs are not truncated.
# None means "no limit"; the former -1 sentinel is deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)

# Drop the columns that are not needed to download the files.
urls_table = urls_table.drop([' Intitulés de la NAF', 'partie 1 url'], axis=1)
urls_table.head()

Unnamed: 0,Code APE,partie 2 url,nbcar,partie 4 url,url,partie 3,partie 5
1,01.11Z,A,6,0111Z,http://www.risquesprofessionnels.ameli.fr/fileadmin/fiches_statistiques/NAFAPE_2016_A_000_0111Z_SY.pdf,_000_,_SY.pdf
2,01.12Z,A,6,0112Z,http://www.risquesprofessionnels.ameli.fr/fileadmin/fiches_statistiques/NAFAPE_2016_A_000_0112Z_SY.pdf,_000_,_SY.pdf
3,01.13Z,A,6,0113Z,http://www.risquesprofessionnels.ameli.fr/fileadmin/fiches_statistiques/NAFAPE_2016_A_000_0113Z_SY.pdf,_000_,_SY.pdf
4,01.14Z,A,6,0114Z,http://www.risquesprofessionnels.ameli.fr/fileadmin/fiches_statistiques/NAFAPE_2016_A_000_0114Z_SY.pdf,_000_,_SY.pdf
5,01.15Z,A,6,0115Z,http://www.risquesprofessionnels.ameli.fr/fileadmin/fiches_statistiques/NAFAPE_2016_A_000_0115Z_SY.pdf,_000_,_SY.pdf


### Test on the first 4 URLs

In [10]:
# Take the first four URLs of the cleaned table as a small test sample.
urls_table['url'][0:4]

1    http://www.risquesprofessionnels.ameli.fr/fileadmin/fiches_statistiques/NAFAPE_2016_A_000_0111Z_SY.pdf
2    http://www.risquesprofessionnels.ameli.fr/fileadmin/fiches_statistiques/NAFAPE_2016_A_000_0112Z_SY.pdf
3    http://www.risquesprofessionnels.ameli.fr/fileadmin/fiches_statistiques/NAFAPE_2016_A_000_0113Z_SY.pdf
4    http://www.risquesprofessionnels.ameli.fr/fileadmin/fiches_statistiques/NAFAPE_2016_A_000_0114Z_SY.pdf
Name: url, dtype: object

## create a loop to download and save the files 

### 1- split the url to a list of two elements

In [11]:
# [0:4] takes the first four URLs; [1] then picks the entry whose index label is 1.
# NOTE(review): this relies on label 1 being present (label 0 was removed with the
# NaN rows), so it is the first remaining URL here — confirm if the table changes.
url_test = urls_table['url'][0:4][1]
url_test.rsplit('/', 1)   ## maxsplit=1: split once, at the last '/', giving [base_url, file_name]

['http://www.risquesprofessionnels.ameli.fr/fileadmin/fiches_statistiques',
 'NAFAPE_2016_A_000_0111Z_SY.pdf']

### 2- now we extract the PDF's name from the split list

In [12]:
url2 = urls_table['url'][0:10]  # test on the first 10 URLs
for url5 in url2:
    # str.find() returns -1 (truthy) when '/' is absent and 0 (falsy) when it is
    # the first character, so it cannot serve as a membership test; use `in`.
    if '/' in url5:
        # Everything after the last '/' is the PDF file name.
        print(url5.rsplit('/', 1)[1])

NAFAPE_2016_A_000_0111Z_SY.pdf
NAFAPE_2016_A_000_0112Z_SY.pdf
NAFAPE_2016_A_000_0113Z_SY.pdf
NAFAPE_2016_A_000_0114Z_SY.pdf
NAFAPE_2016_A_000_0115Z_SY.pdf
NAFAPE_2016_A_000_0116Z_SY.pdf
NAFAPE_2016_A_000_0119Z_SY.pdf
NAFAPE_2016_A_000_0121Z_SY.pdf
NAFAPE_2016_A_000_0122Z_SY.pdf
NAFAPE_2016_A_000_0123Z_SY.pdf


### 3- at this point we can automatically download and save the PDFs with the correct names

In [13]:
#from multiprocessing.pool import ThreadPool
from time import time as timer

#### without using a multiprocessing library

In [14]:
#def down_save(entry):
    #for url in entry:
    ## SEND REQUEST OF DOWNLOADING
        #r = requests.get(url, stream=True)
    ## SAVE AND WRITE THE CONTENT IN TNE CORRECT DIRECTORY WITH CORRECT FILE NAME
        #with open( 'NAP_FILES/'+ url.rsplit('/', 1)[1] , 'wb') as f:    
          #  f.write(r.content)

In [15]:
def down_save2(link):
    """Download ``link`` and save its content under ``NAP_FILES/``.

    The local file name is ``'new'`` followed by the part of ``link`` after
    its last ``'/'``. The ``NAP_FILES`` directory must already exist.

    A response with an error status (4xx/5xx) is reported and skipped instead
    of being written to disk, so an HTML error page is never stored as a PDF
    and a single bad URL does not abort a bulk download.

    NOTE(review): ``'NAP_FILES/new'`` is concatenated directly with the file
    name, so files are saved as ``NAP_FILES/new<name>`` and not inside a
    ``new/`` sub-directory — confirm this is intended.

    Args:
        link: URL of the file to download.

    Returns:
        None.
    """
    # The whole body is read through r.content, so streaming is not needed.
    # The timeout prevents one unresponsive request from blocking the loop forever.
    r = requests.get(link, timeout=30)
    if not r.ok:
        print(f"Skipping {link}: HTTP {r.status_code}")
        return
    # Save the content in the target directory under the name taken from the URL.
    with open('NAP_FILES/new' + link.rsplit('/', 1)[1], 'wb') as f:
        f.write(r.content)

In [16]:
# Download every PDF listed in the table, one after the other, and time the run.
url732 = urls_table['url']  # all PDF URLs
start = timer()
for url in url732:
    down_save2(url)
elapsed = timer() - start  # wall-clock seconds spent in the loop

print(f"Elapsed Time: {elapsed}")

Elapsed Time: 146.16459393501282


# CONVERT PDF TO CSV 

In [2]:
import glob

In [13]:
## Collect the path of every file matching *.pdf directly inside NAP_FILES/
## (the pattern is relative to the current working directory).
pdf_list = glob.glob('NAP_FILES/*.pdf')

## Function that reduces a list of PDF paths to their bare file names
def find_pdfs(directory):
    """Return the file name (last path component) of every path in ``directory``.

    Both ``'\\'`` and ``'/'`` are accepted as separators, and a path without
    any separator is returned unchanged. Splitting on ``'\\'`` alone only
    worked for Windows-style paths and raised ``IndexError`` on anything else.

    Args:
        directory: iterable of path strings (despite its name, not a
            directory), e.g. the result of ``glob.glob``.

    Returns:
        A list of file names, in the same order as the input.
    """
    # ntpath understands both separators on every platform; os.path.basename
    # would ignore '\\' when running on Linux/macOS.
    import ntpath

    return [ntpath.basename(path) for path in directory]

# Build the list of file names once and reuse it for the three checks.
pdf_names = find_pdfs(pdf_list)
print("Number of pdfs in depository: %s" % len(pdf_names))

first_pdf = pdf_names[0]
print(first_pdf)        # full file name
print(first_pdf[0:26])  # first 26 characters, used later as the CSV base name

Number of pdfs in depository: 732
NAFAPE_2016_A_000_0111Z_SY.pdf
NAFAPE_2016_A_000_0111Z_SY


In [None]:
def pasre_pdf(alist):
    """Convert each PDF in ``alist`` to CSV and collect its core statistics table.

    For every file name, the first table found by camelot (``flavor='stream'``)
    in ``NAP_FILES/<name>`` is written to ``NAP_FILES_CSV/new/<name[0:26]>.csv``
    and read back with pandas. Rows 14 to 94 of that CSV are kept, and columns
    with fewer than ``len(rows) - 16`` non-null values are dropped. The result
    is expected to have exactly ten columns, which are renamed.

    Args:
        alist: sequence of PDF file names (no directory part), e.g.
            ``'NAFAPE_2016_A_000_0111Z_SY.pdf'``.

    Returns:
        dict mapping ``name[18:23]`` (``'0111Z'`` for the example above) to the
        cleaned DataFrame of that file.
    """
    column_names = ["index1","nb_AT_1er_regle1","nb_new_IP1","nb_death1","nb_lost_days1","index2","nb_AT_1er_regle2","nb_new_IP2","nb_death2","nb_lost_days2"]
    core_relevant_df_collection = {}
    for pdf_name in alist:
        csv_path = 'NAP_FILES_CSV/new/' + pdf_name[0:26] + '.csv'

        # Extract the first table of the PDF and round-trip it through a CSV file.
        table = camelot.read_pdf('NAP_FILES/' + pdf_name, flavor='stream')
        table[0].to_csv(csv_path)
        file = pd.read_csv(csv_path, encoding="utf-8")

        # Keep the block of rows holding the core figures, then drop sparse columns.
        rows = file[14:95]
        core_relevant_df = rows.dropna(thresh=len(rows) - 16, axis=1)
        core_relevant_df.columns = column_names

        core_relevant_df_collection[pdf_name[18:23]] = core_relevant_df
    return core_relevant_df_collection

# Run the conversion on every PDF found; the returned dict is not stored,
# so in a notebook it is only displayed as the cell output.
pasre_pdf(find_pdfs(pdf_list))    

In [None]:
# Step-by-step extraction on a single PDF (the third file name in the list).
sample_pdf = find_pdfs(pdf_list)[2]
sample_csv = sample_pdf[0:26] + '.csv'  # written to the current working directory

tables2 = camelot.read_pdf('NAP_FILES/' + sample_pdf, flavor='stream')

tables2[0].to_csv(sample_csv)
test_file1 = pd.read_csv(sample_csv, encoding="utf-8")

# Core table: rows 14-94, keeping columns with at least len - 16 non-null values.
# (An earlier version dropped the "Unnamed: N" columns by name instead.)
core_relevant_df2 = test_file1[14:95]
core_relevant_df2 = core_relevant_df2.dropna(thresh=len(core_relevant_df2) - 16, axis=1)
core_relevant_df2.columns = ["index1","nb_AT_1er_regle1","nb_new_IP1","nb_death1","nb_lost_days1","index2","nb_AT_1er_regle2","nb_new_IP2","nb_death2","nb_lost_days2"]

# Header table: rows 2-7, keeping columns with at least len - 2 non-null values.
head_relevant_df2 = test_file1[2:8]
head_relevant_df2 = head_relevant_df2.dropna(thresh=len(head_relevant_df2) - 2, axis=1)
head_relevant_df2.columns = ["index1","stats1","index2","stats2"]
#core_relevant_df2
#head_relevant_df2