In [212]:
# IMPORTS
import pandas as pd
import textract
import glob
from collections import Counter
# To parse pdfs
import io
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.layout import LAParams
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfpage import PDFPage

### Read first CSV and append the rest to it to form the general training dataset

In [213]:
total_df = pd.read_csv("data/train1.csv")
df2 = pd.read_csv("data/train2.csv", sep = ";")
print((total_df.columns == df2.columns).all())
total_df = pd.concat((total_df, df2), ignore_index = True)
print(total_df.shape)
total_df.head()

True
(37127, 21)


Unnamed: 0,countryName,eprtrSectorName,EPRTRAnnexIMainActivityLabel,FacilityInspireID,facilityName,City,targetRelease,pollutant,reportingYear,MONTH,...,CONTINENT,max_wind_speed,avg_wind_speed,min_wind_speed,max_temp,avg_temp,min_temp,DAY WITH FOGS,REPORTER NAME,CITY ID
0,Germany,Mineral industry,Installations for the production of cement cli...,https://registry.gdi-de.org/id/de.ni.mu/062217...,Holcim (Deutschland) GmbH Werk Höver,Sehnde,AIR,Carbon dioxide (CO2),2015,10,...,EUROPE,15.118767,14.312541,21.419106,2.864895,4.924169,9.688206,2,Mr. Jacob Ortega,7cdb5e74adcb2ffaa21c1b61395a984f
1,Italy,Mineral industry,Installations for the production of cement cli...,IT.CAED/240602021.FACILITY,Stabilimento di Tavernola Bergamasca,TAVERNOLA BERGAMASCA,AIR,Nitrogen oxides (NOX),2018,9,...,EUROPE,19.66155,19.368166,21.756389,5.462839,7.864403,12.023521,1,Ashlee Serrano,cd1dbabbdba230b828c657a9b19a8963
2,Spain,Waste and wastewater management,Landfills (excluding landfills of inert waste ...,ES.CAED/001966000.FACILITY,COMPLEJO MEDIOAMBIENTAL DE ZURITA,PUERTO DEL ROSARIO,AIR,Methane (CH4),2019,2,...,EUROPE,12.729453,14.701985,17.10393,1.511201,4.233438,8.632193,2,Vincent Kemp,5011e3fa1436d15b34f1287f312fbada
3,Czechia,Energy sector,Thermal power stations and other combustion in...,CZ.MZP.U422/CZ34736841.FACILITY,Elektrárny Prunéřov,Kadaň,AIR,Nitrogen oxides (NOX),2012,8,...,EUROPE,11.856417,16.122584,17.537184,10.970301,10.298348,15.179215,0,Carol Gray,37a6d7a71c4f7c2469e4f01b70dd90c2
4,Finland,Waste and wastewater management,Urban waste-water treatment plants,http://paikkatiedot.fi/so/1002031/pf/Productio...,"TAMPEREEN VESI LIIKELAITOS, VIINIKANLAHDEN JÄT...",Tampere,AIR,Methane (CH4),2018,12,...,EUROPE,17.11193,20.201604,21.536012,11.772039,11.344078,16.039004,2,Blake Ford,471fe554e1c62d1b01cc8e4e5076c61a


### Download data from API and concat to the csv data

In [215]:
API_urls = "http://schneiderapihack-env.eba-3ais9akk.us-east-2.elasticbeanstalk.com/"
API_urls = [API_urls + "first", API_urls + "second", API_urls + "third"]

for i, url in enumerate(API_urls):
    data = pd.read_json(url)
    print("API", i + 1 , "concatenaing", len(data), "rows")
    data.drop(columns = ["", "EPRTRAnnexIMainActivityCode", "EPRTRSectorCode"], inplace = True) # Drop redundant columns
    # Concat to total df
    total_df = pd.concat((total_df, data), ignore_index = True)

print("Final shape after concatenating APIs", total_df.shape)

API 1 concatenaing 9500 rows
API 2 concatenaing 9500 rows
API 3 concatenaing 9501 rows
Final shape after concatenating APIs (65628, 21)


In [216]:
# Save dataframe
total_df.to_csv("total_df.csv", index = False)

### Parse PDFs

In [217]:
def pdf_to_text(path):
    with open(path, 'rb') as fp:
        rsrcmgr = PDFResourceManager()
        outfp = io.StringIO()
        laparams = LAParams()
        device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
    text = outfp.getvalue()
    return text

In [218]:
def select_meaningful_lines(text, print_raw=False):
    # Remove some specific lines that induce problems
    for i in range(len(text)):
        if text[i] == 'nº:': n_ind = i
    text = text[:n_ind-1]+text[n_ind+5:]

    if print_raw: print([(i, text[i]) for i in range(len(text))]) # Print each line with the corresponding index
    text = [text[i] for i in [2, 3, 8, 9, 13, 17, 19, 21, 26, 28, 32, 36, 40, 42, 46, 50, 54, 58, 60, 64, 66, 70]]
    return text

def clean_lines(text):
    text.append(text[15].split(" ")[0])
    text[12], text[17], text[15] = text[12].split(" ")[0], text[17].split(" ")[0], text[15].split(":")[1]
    text = [text[i].split(":")[-1].lstrip() for i in range(len(text))] # Eliminate the name of the field like "name:" if there is any
    return text

['countryName', 'eprtrSectorName', 'EPRTRAnnexIMainActivityLabel',
       'FacilityInspireID', 'facilityName', 'City', 'targetRelease',
       'pollutant', 'reportingYear', 'MONTH', 'DAY', 'CONTINENT',
       'max_wind_speed', 'avg_wind_speed', 'min_wind_speed', 'max_temp',
       'avg_temp', 'min_temp', 'DAY WITH FOGS', 'REPORTER NAME', 'CITY ID',
       'EPRTRAnnexIMainActivityCode', 'EPRTRSectorCode']

def create_df(fields):
    """ Add column names to each of the extracted fields"""
    # Field names in the order they are received
    field_names = ["facilityName", "FacilityInspireID", "countryName", "City", "CONTINENT", "EPRTRSectorCode", "eprtrSectorName",
            "EPRTRAnnexIMainActivityCode", "targetRelease", "emissions", "pollutant", "DAY", "MONTH", "reportingYear", "max_wind_speed",
            "avg_wind_speed", "max_temp", "min_temp", "avg_temp", "DAY WITH FOGS", "REPORTER NAME", "CITY ID", "min_wind_speed"]
    return pd.DataFrame(data=dict(zip(field_names,fields)), index = [0])

pdf_filenames = glob.glob("data/train6/*") # Read filenames

for i, pdf_filename in enumerate(pdf_filenames):
    text = pdf_to_text(pdf_filename) # Get the text from the pdf
    text = text.split("\n") 
    if len(text) != 79:
        continue
    fields = select_meaningful_lines(text)
    # Split some fields that have been merged in the decoding
    clean_fields = clean_lines(fields)
    df = create_df(clean_fields)
    #if i == 0: # Print process for the first pdf
    if i==0:
        pdf_dfs = df
        print("Select fields with relevant info")
        print(fields)
        print("Clean fields")
        print(clean_fields)
        print("Extraxted df")
    else:
        if df.loc[0, "countryName"] != "":
            pdf_dfs = pd.concat((pdf_dfs, df), ignore_index = True)

display(pdf_dfs)

print("From", len(pdf_filenames), "PDFs,", len(pdf_dfs), "could be parsed")
total_df = pd.concat((total_df, pdf_dfs), ignore_index = True) # Concat to general dataframe
print("Final shape after concatenating APIs", total_df.shape)

Select fields with relevant info
['FACILITY NAME: Greenoakhill Landfill Site, Glasgow', 'FacilityInspireID: UK.SEPA/200000110.Facility', 'United Kingdom', 'Mount Vernon', 'EUROPE', '5', 'eprtrSectorName: Waste and wastewater management', 'MainActivityCode: 5(d)', 'AIR', '1910000', 'Methane (CH4)', '14', '3', '2019', 'max_wind_speed: 1,44E+16', ' 1,47E+16', '1,61E+16', '1,77E+15', '1,61E+16', '7', 'REPORTER NAME:Jodi Holden', '5a9cc0e1663ad226675e57387c5e24e6', '1,7E+15']
Clean fields
['Greenoakhill Landfill Site, Glasgow', 'UK.SEPA/200000110.Facility', 'United Kingdom', 'Mount Vernon', 'EUROPE', '5', 'Waste and wastewater management', '5(d)', 'AIR', '1910000', 'Methane (CH4)', '14', '3', '2019', '1,44E+16', '1,47E+16', '1,61E+16', '1,77E+15', '1,61E+16', '7', 'Jodi Holden', '5a9cc0e1663ad226675e57387c5e24e6', '1,7E+15']
Extraxted df


Unnamed: 0,facilityName,FacilityInspireID,countryName,City,CONTINENT,EPRTRSectorCode,eprtrSectorName,EPRTRAnnexIMainActivityCode,targetRelease,emissions,...,reportingYear,max_wind_speed,avg_wind_speed,max_temp,min_temp,avg_temp,DAY WITH FOGS,REPORTER NAME,CITY ID,min_wind_speed
0,"Greenoakhill Landfill Site, Glasgow",UK.SEPA/200000110.Facility,United Kingdom,Mount Vernon,EUROPE,5,Waste and wastewater management,5(d),AIR,1910000,...,2019,"1,44E+16","1,47E+16","1,61E+16","1,77E+15","1,61E+16",7,Jodi Holden,5a9cc0e1663ad226675e57387c5e24e6,"1,7E+15"
1,"Auchencarroch Landfill, Jamestown",UK.SEPA/200000083.Facility,United Kingdom,Alexandria,EUROPE,5,Waste and wastewater management,5(d),AIR,1300000,...,2019,"1,69E+16","1,79E+15","5,38E+15","1,18E+15","7,29E+15",12,Tracy Powell,a6bc8f85a33112c5beea4d357caad4cd,"2,07E+16"
2,"Garlaff Landfill Site, Cumnock",UK.SEPA/200000084.Facility,United Kingdom,Cumnock,EUROPE,5,Waste and wastewater management,5(d),999000,AIR,...,2019,"1,49E+15","2,04E+16","1,58E+16","8,9E+15","4,32E+14",11,Rebecca Brooks,b6ca640b7121ae2bf206ea088f6a4618,"2,37E+16"
3,Auchenlosh Landfill Site,UK.SEPA/200000082.Facility,United Kingdom,Dalbeattie,EUROPE,5,Waste and wastewater management,5(d),101000,AIR,...,2019,"1,46E+16","1,47E+16","9,74E+15","1,1E+16","9,23E+15",2,Brian Sims,d52bec466f0edb76a6fc14c109598ba2,"1,7E+16"
4,"Auchinlea Landfill Site, Motherwell",UK.SEPA/200000109.Facility,United Kingdom,Clelland,EUROPE,5,Waste and wastewater management,5(d),922000,AIR,...,2019,"1,76E+15","1,72E+16","1,21E+16","1,66E+16","1,24E+16",14,Reginald Fisher,82403770c9ed79e39a23ca735c8d945e,"2,26E+15"
5,Nether Dallachy Landfill Site,UK.SEPA/200000106.Facility,United Kingdom,Spey Bay,EUROPE,5,Waste and wastewater management,5(d),945000,AIR,...,2019,"1,64E+16","1,82E+15","1,98E+14","2,04E+16","1,91E+16",7,Emily Martinez,a05c92d590027b51608394c16aedc477,"2,57E+16"
6,Alloa Glass Factory,UK.SEPA/200000073.Facility,United Kingdom,Alloa,EUROPE,3,Mineral industry,3(e),990000,AIR,...,2019,"1,74E+16","1,89E+16","5,82E+14","9,39E+15","6,09E+15",12,Lance Hart,2cc8f54182c37b8907f534011ea01e6f,"2,23E+15"
7,Dalmacoulter Landfill Site,UK.SEPA/200002314.Facility,United Kingdom,Dalmacouter,EUROPE,5,Waste and wastewater management,5(d),104000,AIR,...,2019,"9,89E+15","1,52E+16","1,25E+16","1,75E+16","1,48E+16",5,Keith Holmes,20b790f3c89efe9244f4b79071f3d7c5,"1,63E+16"
8,"Dalinlongart Landfill Site, Sandbank",UK.SEPA/200001325.Facility,United Kingdom,Dunoon,EUROPE,5,Waste and wastewater management,5(d),148000,AIR,...,2019,"1,58E+16","1,89E+15","1,04E+16","1,7E+16","1,23E+16",2,Joshua Oliver,bc9dc084a859e6b840b69d6146c646df,"2,4E+16"
9,"Greengairs Landfill Site, Airdrie",UK.SEPA/200000116.Facility,United Kingdom,Airdrie,EUROPE,5,Waste and wastewater management,5(d),972000,AIR,...,2019,"2,11E+15","2,16E+16","1,7E+15",2E+16,"1,87E+15",19,Zachary Randolph,12ab217b40114f8deef2237c7a4d37ae,"2,73E+16"


From 82 PDFs, 20 could be parsed
Final shape after concatenating APIs (65648, 24)


### Save final df

In [219]:
total_df.columns

Index(['countryName', 'eprtrSectorName', 'EPRTRAnnexIMainActivityLabel',
       'FacilityInspireID', 'facilityName', 'City', 'targetRelease',
       'pollutant', 'reportingYear', 'MONTH', 'DAY', 'CONTINENT',
       'max_wind_speed', 'avg_wind_speed', 'min_wind_speed', 'max_temp',
       'avg_temp', 'min_temp', 'DAY WITH FOGS', 'REPORTER NAME', 'CITY ID',
       'EPRTRSectorCode', 'EPRTRAnnexIMainActivityCode', 'emissions'],
      dtype='object')