In [1]:
import pandas as pd
from tabula import read_pdf
from tabulate import tabulate
import re

In [2]:
# Load the USDA Plants Database download into a DataFrame
plants_df = pd.read_csv(
    filepath_or_buffer='../data/USDA/USDA Plants Database Download.txt',
)

In [3]:
#clean scientific name, add genus and species columns
# Split every name on whitespace exactly once. The .str accessor yields NaN
# (instead of raising IndexError/AttributeError) for a missing name or a name
# with fewer than two tokens, so one malformed row cannot abort the cell.
name_tokens = plants_df['Scientific Name with Author'].str.split()
plants_df['Genus'] = name_tokens.str[0]    # first token
plants_df['Species'] = name_tokens.str[1]  # second token
plants_df.head()

Unnamed: 0,Symbol,Synonym Symbol,Scientific Name with Author,Common Name,Family,Genus,Species
0,ABAB,,Abutilon abutiloides (Jacq.) Garcke ex Hochr.,shrubby Indian mallow,Malvaceae,Abutilon,abutiloides
1,ABAB,ABAM5,Abutilon americanum (L.) Sweet,,,Abutilon,americanum
2,ABAB,ABJA,Abutilon jacquinii G. Don,,,Abutilon,jacquinii
3,ABAB,ABLI,Abutilon lignosum (Cav.) G. Don,,,Abutilon,lignosum
4,ABAB70,,Abietinella abietina (Hedw.) Fleisch.,abietinella moss,Thuidiaceae,Abietinella,abietina


In [4]:
# Distinct genus names from plants_df, sorted, as a plain Python list
plants_genera = (
    plants_df['Genus']
    .drop_duplicates()
    .sort_values()
    .tolist()
)

In [5]:
# Run tabula over every page of the air-pollutants PDF
airpdf = read_pdf(
    "../data/WildOnes/PHYTOREMEDIATION OF AIR POLLUTANTS.pdf",
    pages="all",
    multiple_tables=True,
)

In [6]:
# Run tabula over every page of the chlorinated-solvents PDF
chlorpdf = read_pdf(
    "../data/WildOnes/PHYTOREMEDIATION OF CHLORINATED SOLVENTS.pdf",
    pages="all",
    multiple_tables=True,
)

In [7]:
# Run tabula over every page of the pesticides PDF
pesticidespdf = read_pdf(
    "../data/WildOnes/PHYTOREMEDIATION OF PESTICIDES.pdf",
    pages="all",
    multiple_tables=True,
)

In [8]:
# Run tabula over every page of the petroleum PDF
petrolpdf = read_pdf(
    "../data/WildOnes/PHYTOREMEDIATION OF PETROLEUM.pdf",
    pages="all",
    multiple_tables=True,
)

In [9]:
#create dictionary of species present in pdf
#genus:[species]
#
# For each known genus, collect every "<genus> <word>" occurrence in the text
# form of the extracted tables. Matching is case-insensitive and has no word
# boundary on the left, so a genus name can also match inside a longer word;
# that behaviour is kept because the cleaning cells that follow rely on the
# exact keys and list positions produced here.

air_dict = {}
air_text = str(airpdf)  # stringify the extracted tables once, not per genus

for genus in plants_genera:
    # Raw f-string keeps \s, \w, \W as regex escapes (no invalid-escape
    # warning); re.escape treats the genus as literal text.
    matches = re.findall(rf'({re.escape(genus)}\s\w*?)\W', air_text, re.IGNORECASE)
    if matches:
        air_dict[genus] = matches

In [10]:
#clean
# Drop the 'Acacia' key (its only match in the recorded output is not a
# binomial). The None default makes re-running this cell a no-op instead of
# raising KeyError once the key is already gone.
air_dict.pop('Acacia', None)

['acacia Full']

In [11]:
# Display the genus -> matched-names mapping built from the air-pollutants PDF
air_dict

{'Amelanchier': ['Amelanchier arborea'],
 'Fraxinus': ['Fraxinus pennsylvanica'],
 'Juniperus': ['Juniperus virginiana', 'Juniperus virginiana'],
 'Physocarpus': ['Physocarpus opulifolius'],
 'Pinus': ['Pinus echinata'],
 'Quercus': ['Quercus rubra'],
 'Robinia': ['Robinia pseudoacacia'],
 'Tilia': ['Tilia americana'],
 'Ulmus': ['Ulmus alata', 'Ulmus americana', 'Ulmus rubra']}

In [12]:
#create dictionary of species present in pdf
#genus:[species]
#
# For each known genus, collect every "<genus> <word>" occurrence in the text
# form of the extracted tables. Matching is case-insensitive and has no word
# boundary on the left, so a genus name can also match inside a longer word;
# that behaviour is kept because the cleaning cells that follow rely on the
# exact keys and list positions produced here.

chlor_dict = {}
chlor_text = str(chlorpdf)  # stringify the extracted tables once, not per genus

for genus in plants_genera:
    # Raw f-string keeps \s, \w, \W as regex escapes (no invalid-escape
    # warning); re.escape treats the genus as literal text.
    matches = re.findall(rf'({re.escape(genus)}\s\w*?)\W', chlor_text, re.IGNORECASE)
    if matches:
        chlor_dict[genus] = matches

In [13]:
#clean
# Overwrite specific matches, addressed by genus key and list index,
# with hand-entered names.
chlor_fixes = [
    ('Liquidambar', 0, 'Liquidambar styraciflua'),
    ('Platanus', 0, 'Platanus occidentalis'),
    ('Solidago', 6, 'Solidago missouriensis'),
    ('Tripsacum', 0, 'Tripsacum dactyloides'),
]
for fix_genus, fix_index, fix_name in chlor_fixes:
    chlor_dict[fix_genus][fix_index] = fix_name

In [14]:
#clean
# Drop the 'Radula' key (presumably a false positive: 'radula' also occurs as
# the epithet in 'Solidago radula'). The None default makes re-running this
# cell a no-op instead of raising KeyError once the key is already gone.
chlor_dict.pop('Radula', None)

['radula ']

In [15]:
# Display the genus -> matched-names mapping built from the chlorinated-solvents PDF
chlor_dict

{'Acer': ['Acer saccharinum'],
 'Betula': ['Betula nigra'],
 'Cercis': ['Cercis canadensis'],
 'Elodea': ['Elodea canadensis'],
 'Liquidambar': ['Liquidambar styraciflua'],
 'Platanus': ['Platanus occidentalis'],
 'Populus': ['Populus deltoides'],
 'Quercus': ['Quercus palustris'],
 'Salix': ['Salix nigra'],
 'Solidago': ['Solidago altissima',
  'Solidago arguta',
  'Solidago caesia',
  'Solidago flexicaulis',
  'Solidago gigantea',
  'Solidago hispida',
  'Solidago missouriensis',
  'Solidago nemoralis',
  'Solidago odora',
  'Solidago petiolaris',
  'Solidago radula',
  'Solidago rigida',
  'Solidago rugosa',
  'Solidago speciosa',
  'Solidago ulmifolia'],
 'Tripsacum': ['Tripsacum dactyloides'],
 'Typha': ['Typha domingensis', 'Typha latifolia']}

In [16]:
#create dictionary of species present in pdf
#genus:[species]
#
# For each known genus, collect every "<genus> <word>" occurrence in the text
# form of the extracted tables. Matching is case-insensitive and has no word
# boundary on the left, so a genus name can also match inside a longer word;
# that behaviour is kept because the cleaning cell that follows relies on the
# exact keys and list positions produced here.

pesticides_dict = {}
pesticides_text = str(pesticidespdf)  # stringify the extracted tables once, not per genus

for genus in plants_genera:
    # Raw f-string keeps \s, \w, \W as regex escapes (no invalid-escape
    # warning); re.escape treats the genus as literal text.
    matches = re.findall(rf'({re.escape(genus)}\s\w*?)\W', pesticides_text, re.IGNORECASE)
    if matches:
        pesticides_dict[genus] = matches

In [17]:
#clean
# Overwrite the first match of each listed genus with a hand-entered name.
pesticides_fixes = [
    ('Andropogon', 0, 'Andropogon gerardii'),
    ('Ceratophyllum', 0, 'Ceratophyllum demersum'),
    ('Elodea', 0, 'Elodea canadensis'),
    ('Panicum', 0, 'Panicum virgatum'),
    ('Populus', 0, 'Populus deltoides'),
]
for fix_genus, fix_index, fix_name in pesticides_fixes:
    pesticides_dict[fix_genus][fix_index] = fix_name

In [18]:
# Display the genus -> matched-names mapping built from the pesticides PDF
pesticides_dict

{'Andropogon': ['Andropogon gerardii'],
 'Betula': ['Betula nigra'],
 'Ceratophyllum': ['Ceratophyllum demersum'],
 'Elodea': ['Elodea canadensis'],
 'Juncus': ['Juncus effusus'],
 'Lemna': ['Lemna minor'],
 'Morus': ['Morus rubra'],
 'Panicum': ['Panicum virgatum'],
 'Populus': ['Populus deltoides']}

In [19]:
#create dictionary of species present in pdf
#genus:[species]
#
# For each known genus, collect every "<genus> <word>" occurrence in the text
# form of the extracted tables. Matching is case-insensitive and has no word
# boundary on the left, so a genus name can also match inside a longer word;
# that behaviour is kept because the cleaning cells that follow rely on the
# exact keys and list positions produced here.

petrol_dict = {}
petrol_text = str(petrolpdf)  # stringify the extracted tables once, not per genus

for genus in plants_genera:
    # Raw f-string keeps \s, \w, \W as regex escapes (no invalid-escape
    # warning); re.escape treats the genus as literal text.
    matches = re.findall(rf'({re.escape(genus)}\s\w*?)\W', petrol_text, re.IGNORECASE)
    if matches:
        petrol_dict[genus] = matches

In [20]:
#clean
# Overwrite specific matches, addressed by genus key and list index,
# with hand-entered names.
petrol_fixes = [
    ('Bouteloua', 0, 'Bouteloua curtipendula'),
    ('Fraxinus', 0, 'Fraxinus pennsylvanica'),
    ('Robinia', 0, 'Robinia pseudoacacia'),
    ('Schizachyrium', 0, 'Schizachyrium scoparium'),
    ('Solidago', 6, 'Solidago missouriensis'),
    ('Tripsacum', 0, 'Tripsacum dactyloides'),
]
for fix_genus, fix_index, fix_name in petrol_fixes:
    petrol_dict[fix_genus][fix_index] = fix_name

In [21]:
#clean
# Drop genus keys that are presumably false positives (e.g. 'hystrix' and
# 'radula' also occur as species epithets in this PDF). The None default makes
# re-running this cell a no-op instead of raising KeyError on the first key
# that is already gone.
petrol_dict.pop('Acacia', None)
petrol_dict.pop('Erinus', None)
petrol_dict.pop('Hystrix', None)
petrol_dict.pop('Radula', None)

['radula TPH']

In [22]:
# Display the genus -> matched-names mapping built from the petroleum PDF
petrol_dict

{'Andropogon': ['Andropogon gerardii'],
 'Bouteloua': ['Bouteloua curtipendula', 'Bouteloua gracilis'],
 'Carex': ['Carex cephalophora', 'Carex stricta'],
 'Celtis': ['Celtis occidentalis'],
 'Cercis': ['Cercis canadensis'],
 'Elymus': ['Elymus canadensis', 'Elymus hystrix'],
 'Fraxinus': ['Fraxinus pennsylvanica'],
 'Gleditsia': ['Gleditsia triacanthos'],
 'Helianthus': ['Helianthus annuus'],
 'Juncus': ['Juncus effusus'],
 'Juniperus': ['Juniperus virginiana'],
 'Morus': ['Morus rubra'],
 'Panicum': ['Panicum virgatum'],
 'Pinus': ['Pinus echinata'],
 'Populus': ['Populus deltoides'],
 'Quercus': ['Quercus macrocarpa', 'Quercus phellos'],
 'Robinia': ['Robinia pseudoacacia'],
 'Sagittaria': ['Sagittaria latifolia'],
 'Salix': ['Salix caroliniana',
  'Salix eriocephala',
  'Salix humilis',
  'Salix interior',
  'Salix nigra'],
 'Schizachyrium': ['Schizachyrium scoparium'],
 'Scirpus': ['Scirpus atrovirens',
  'Scirpus cyperinus',
  'Scirpus georgianus',
  'Scirpus pendulus'],
 'Senna'

In [23]:
#heavy metals pdf special case
# The same PDF is read several times: once in full, then once per page range,
# with each result stored under its own (metal-abbreviation) variable.
file = "../data/WildOnes/PHYTOREMEDIATION OF HEAVY METALS.pdf"


def _read_metals_pages(pages):
    """Extract tables from the heavy-metals PDF in lattice mode for `pages`."""
    return read_pdf(file, lattice=True, pages=pages)


metalsall_df = _read_metals_pages("all")
cd_df = _read_metals_pages([1, 2, 3])
cr_df = _read_metals_pages(4)
cu_df = _read_metals_pages(5)
pb_df = _read_metals_pages(6)
ni_df = _read_metals_pages(7)
zn_df = _read_metals_pages(8)

In [24]:
#create dictionary of species present in pdf
#distinguish which heavy metal each species remediates
#genus:[species]

In [None]:
# One independent, empty genus -> names dict per heavy metal
cd_dict, cr_dict, cu_dict, pb_dict, ni_dict, zn_dict = ({} for _ in range(6))

In [45]:
# Collect "<Genus> <lowercase epithet>" matches from cr_df (page 4 of the
# heavy-metals PDF). Unlike the other PDFs this search is case-sensitive.
cr_dict = {}  # initialised here so the cell does not rely on a separately-run init cell
cr_text = str(cr_df)  # stringify the extracted tables once, not per genus

for genus in plants_genera:
    # Raw f-string keeps \s and \W as regex escapes (no invalid-escape
    # warning); re.escape treats the genus as literal text.
    matches = re.findall(rf'({re.escape(genus)}\s[a-z]*?)\W', cr_text)
    if matches:
        cr_dict[genus] = matches

In [46]:
# Display the genus -> matched-names mapping built from cr_df
cr_dict

{'Lemna': ['Lemna minor', 'Lemna minor', 'Lemna minor'],
 'Salix': ['Salix interior',
  'Salix humilis',
  'Salix nigra',
  'Salix humilis',
  'Salix interior',
  'Salix humilis',
  'Salix nigra']}