In [1]:
import pandas as pd
from tabula import read_pdf
from tabulate import tabulate
import re

In [2]:
#create dataframe from USDA Plants Database Download
plants_df = pd.read_csv('../data/USDA/USDA Plants Database Download.txt')

In [3]:
#clean scientific name, add genus and species columns
plants_df['Genus'] = [x.split()[0] for x in plants_df['Scientific Name with Author']]
plants_df['Species'] = [x.split()[1] for x in plants_df['Scientific Name with Author']]
plants_df.head()

Unnamed: 0,Symbol,Synonym Symbol,Scientific Name with Author,Common Name,Family,Genus,Species
0,ABAB,,Abutilon abutiloides (Jacq.) Garcke ex Hochr.,shrubby Indian mallow,Malvaceae,Abutilon,abutiloides
1,ABAB,ABAM5,Abutilon americanum (L.) Sweet,,,Abutilon,americanum
2,ABAB,ABJA,Abutilon jacquinii G. Don,,,Abutilon,jacquinii
3,ABAB,ABLI,Abutilon lignosum (Cav.) G. Don,,,Abutilon,lignosum
4,ABAB70,,Abietinella abietina (Hedw.) Fleisch.,abietinella moss,Thuidiaceae,Abietinella,abietina


In [4]:
#create list of unique plant genera
plants_genera = plants_df.Genus.drop_duplicates().sort_values().to_list()

In [5]:
airpdf = read_pdf("../data/WildOnes/PHYTOREMEDIATION OF AIR POLLUTANTS.pdf", pages = "all", multiple_tables=True)

In [6]:
chlorpdf = read_pdf("../data/WildOnes/PHYTOREMEDIATION OF CHLORINATED SOLVENTS.pdf", pages = "all", multiple_tables=True)

In [7]:
pesticidespdf = read_pdf("../data/WildOnes/PHYTOREMEDIATION OF PESTICIDES.pdf", pages = "all", multiple_tables=True)

In [8]:
petrolpdf = read_pdf("../data/WildOnes/PHYTOREMEDIATION OF PETROLEUM.pdf", pages = "all", multiple_tables=True)

In [33]:
#create dictionary of species present in pdf
#genus:[species]

air_dict = {}

for genus in plants_genera:
    if len(re.findall(f'({genus}\\s\w*?)\W', str(airpdf), re.IGNORECASE)) > 0:
        air_dict[genus] = re.findall(f'({genus}\\s\w*?)\W', str(airpdf), re.IGNORECASE)   

In [34]:
#clean
air_dict.pop('Acacia')
air_dict.pop('Amelanchier')
air_dict.pop('Tilia')
air_dict.pop('Ulmus')
air_dict['Juniperus'] = ['Juniperus virginiana']
air_dict

{'Fraxinus': ['Fraxinus pennsylvanica'],
 'Juniperus': ['Juniperus virginiana'],
 'Physocarpus': ['Physocarpus opulifolius'],
 'Pinus': ['Pinus echinata'],
 'Quercus': ['Quercus rubra'],
 'Robinia': ['Robinia pseudoacacia']}

In [12]:
#create dictionary of species present in pdf
#genus:[species]

chlor_dict = {}

for genus in plants_genera:
    if len(re.findall(f'({genus}\\s\w*?)\W', str(chlorpdf), re.IGNORECASE)) > 0:
        chlor_dict[genus] = re.findall(f'({genus}\\s\w*?)\W', str(chlorpdf), re.IGNORECASE)          

In [13]:
#clean
chlor_dict['Liquidambar'][0] = 'Liquidambar styraciflua'
chlor_dict['Platanus'][0] = 'Platanus occidentalis'
chlor_dict['Solidago'][6] = 'Solidago missouriensis'
chlor_dict['Tripsacum'][0] = 'Tripsacum dactyloides'

In [14]:
#clean
chlor_dict.pop('Radula')

['radula ']

In [16]:
#create dictionary of species present in pdf
#genus:[species]

pesticides_dict = {}

for genus in plants_genera:
    if len(re.findall(f'({genus}\\s\w*?)\W', str(pesticidespdf), re.IGNORECASE)) > 0:
        pesticides_dict[genus] = re.findall(f'({genus}\\s\w*?)\W', str(pesticidespdf), re.IGNORECASE)   

In [17]:
#clean
pesticides_dict['Andropogon'][0] = 'Andropogon gerardii'
pesticides_dict['Ceratophyllum'][0] = 'Ceratophyllum demersum'
pesticides_dict['Elodea'][0] = 'Elodea canadensis'
pesticides_dict['Panicum'][0] = 'Panicum virgatum'
pesticides_dict['Populus'][0] = 'Populus deltoides'
pesticides_dict['Salix'] = ['Salix nigra']
pesticides_dict['Sorghastrum'] = ['Sorghastrum nutans']
pesticides_dict['Tripsacum'] = ['Tripsacum dactyloides']
pesticides_dict['Typha'] = ['Typha domingensis', 'Typha latifolia']
pesticides_dict

{'Andropogon': ['Andropogon gerardii'],
 'Betula': ['Betula nigra'],
 'Ceratophyllum': ['Ceratophyllum demersum'],
 'Elodea': ['Elodea canadensis'],
 'Juncus': ['Juncus effusus'],
 'Lemna': ['Lemna minor'],
 'Morus': ['Morus rubra'],
 'Panicum': ['Panicum virgatum'],
 'Populus': ['Populus deltoides'],
 'Salix': ['Salix nigra'],
 'Sorghastrum': ['Sorghastrum nutans'],
 'Tripsacum': ['Tripsacum dactyloides'],
 'Typha': ['Typha domingensis', 'Typha latifolia']}

In [18]:
#create dictionary of species present in pdf
#genus:[species]

petrol_dict = {}

for genus in plants_genera:
    if len(re.findall(f'({genus}\\s\w*?)\W', str(petrolpdf), re.IGNORECASE)) > 0:
        petrol_dict[genus] = re.findall(f'({genus}\\s\w*?)\W', str(petrolpdf), re.IGNORECASE)  

In [19]:
#clean
petrol_dict['Bouteloua'][0] = 'Bouteloua curtipendula'
petrol_dict['Fraxinus'][0] = 'Fraxinus pennsylvanica'
petrol_dict['Robinia'][0] = 'Robinia pseudoacacia'
petrol_dict['Schizachyrium'][0] = 'Schizachyrium scoparium'
petrol_dict['Solidago'][6] = 'Solidago missouriensis'
petrol_dict['Tripsacum'][0] = 'Tripsacum dactyloides'

In [20]:
#clean
petrol_dict.pop('Acacia')
petrol_dict.pop('Erinus')
petrol_dict.pop('Hystrix')
petrol_dict.pop('Radula')

['radula TPH']

In [21]:
petrol_dict

{'Andropogon': ['Andropogon gerardii'],
 'Bouteloua': ['Bouteloua curtipendula', 'Bouteloua gracilis'],
 'Carex': ['Carex cephalophora', 'Carex stricta'],
 'Celtis': ['Celtis occidentalis'],
 'Cercis': ['Cercis canadensis'],
 'Elymus': ['Elymus canadensis', 'Elymus hystrix'],
 'Fraxinus': ['Fraxinus pennsylvanica'],
 'Gleditsia': ['Gleditsia triacanthos'],
 'Helianthus': ['Helianthus annuus'],
 'Juncus': ['Juncus effusus'],
 'Juniperus': ['Juniperus virginiana'],
 'Morus': ['Morus rubra'],
 'Panicum': ['Panicum virgatum'],
 'Pinus': ['Pinus echinata'],
 'Populus': ['Populus deltoides'],
 'Quercus': ['Quercus macrocarpa', 'Quercus phellos'],
 'Robinia': ['Robinia pseudoacacia'],
 'Sagittaria': ['Sagittaria latifolia'],
 'Salix': ['Salix caroliniana',
  'Salix eriocephala',
  'Salix humilis',
  'Salix interior',
  'Salix nigra'],
 'Schizachyrium': ['Schizachyrium scoparium'],
 'Scirpus': ['Scirpus atrovirens',
  'Scirpus cyperinus',
  'Scirpus georgianus',
  'Scirpus pendulus'],
 'Senna'

In [22]:
# heavy metals pdf special case - manually created dictionaries
# example of code that was initially attempted before deciding manual

# file = "../data/WildOnes/PHYTOREMEDIATION OF HEAVY METALS.pdf"
# metalsall_df = read_pdf(file, lattice = True, pages="all")
# cd_df = read_pdf(file, lattice = True, pages=[1,2,3])
# cr_df = read_pdf(file, lattice = True, pages=4)
# cu_df = read_pdf(file, lattice = True, pages=5)
# pb_df = read_pdf(file, lattice = True, pages=6)
# ni_df = read_pdf(file, lattice = True, pages=7)
# zn_df = read_pdf(file, lattice = True, pages=8)

In [24]:
# example of code that was initially attempted before deciding manual
# for genus in plants_genera:
#        if len(re.findall(f'({genus}\\s[a-z]*?)\W', str(cr_df))) > 0:
#         cr_dict[genus] = re.findall(f'({genus}\\s[a-z]*?)\W', str(cr_df))  

In [25]:
cd_dict = {}
cd_dict['Achillea'] = ['Achillea millefolium']
cd_dict['Carex'] = ['Carex vulpinoidea']
cd_dict['Conyza'] = ['Conyza canadensis']
cd_dict['Elodea'] = ['Elodea canadensis']
cd_dict['Eupatorium'] = ['Eupatorium capillifolium']
cd_dict['Helianthus'] = ['Helianthus Annuus', 'Helianthus tuberosus']
cd_dict['Ilex'] = ['Ilex decidua', 'Ilex opaca']
cd_dict['Panicum'] = ['Panicum virgatum']
cd_dict['Salix'] = ['Salix caroliniana', 'Salix eriocephala', 'Salix interior', 'Salix humilis', 'Salix nigra']
cd_dict['Spartina'] = ['Spartina petinata']
cd_dict['Spirodela'] = ['Spirodela polyrhiza']
cd_dict['Viola'] = ['Viola bicolor', 'Viola lanceolata', 'Viola missouriensis', 'Viola nephrophylla', 'Viola palmata', 'Viola pedata', 'Viola pubescens', 'Viola sagittata', 'Viola sororia', 'Viola striata', 'Viola subsinuata', 'Viola villosa']
cd_dict

{'Achillea': ['Achillea millefolium'],
 'Carex': ['Carex vulpinoidea'],
 'Conyza': ['Conyza canadensis'],
 'Elodea': ['Elodea canadensis'],
 'Eupatorium': ['Eupatorium capillifolium'],
 'Helianthus': ['Helianthus Annuus', 'Helianthus tuberosus'],
 'Ilex': ['Ilex decidua', 'Ilex opaca'],
 'Panicum': ['Panicum virgatum'],
 'Salix': ['Salix caroliniana',
  'Salix eriocephala',
  'Salix interior',
  'Salix humilis',
  'Salix nigra'],
 'Spartina': ['Spartina petinata'],
 'Spirodela': ['Spirodela polyrhiza'],
 'Viola': ['Viola bicolor',
  'Viola lanceolata',
  'Viola missouriensis',
  'Viola nephrophylla',
  'Viola palmata',
  'Viola pedata',
  'Viola pubescens',
  'Viola sagittata',
  'Viola sororia',
  'Viola striata',
  'Viola subsinuata',
  'Viola villosa']}

In [26]:
cr_dict = {}
cr_dict['Lemna'] = ['Lemna minor']
cr_dict['Nymphaea'] = ['Nymphaea odorata']
cr_dict['Persicaria'] = ['Persicaria lapathifolia']
cr_dict['Salix'] = ['Salix interior',
  'Salix humilis',
  'Salix nigra',
  'Salix humilis',
  'Salix interior',
  'Salix humilis',
  'Salix nigra']
cr_dict

{'Lemna': ['Lemna minor'],
 'Nymphaea': ['Nymphaea odorata'],
 'Persicaria': ['Persicaria lapathifolia'],
 'Salix': ['Salix interior',
  'Salix humilis',
  'Salix nigra',
  'Salix humilis',
  'Salix interior',
  'Salix humilis',
  'Salix nigra']}

In [27]:
cu_dict = {}
cu_dict['Amorpha'] = ['Amorpha fruticosa']
cu_dict['Andropogon'] = ['Andropogon gerardii']
cu_dict['Bouteloua'] = ['Bouteloua curtipendula']
cu_dict['Carex'] = ['Carex vulpinoidea']
cu_dict['Elodea'] = ['Elodea canadensis']
cu_dict['Helianthus'] = ['Helianthus annuus']
cu_dict['Lemna'] = ['Lemna minor']
cu_dict['Nymphaea'] = ['Nymphaea odorata']
cu_dict['Persicaria'] = ['Persicaria lapathifolia']
cu_dict['Salix'] = ['Salix caroliniana', 'Salix eriocephala', 'Salix interior', 'Salix humilis', 'Salix nigra']
cu_dict['Schizachyrium'] = ['Schizachyrium scoparium']
cu_dict['Spartina'] = ['Spartina pectinata']
cu_dict

{'Amorpha': ['Amorpha fruticosa'],
 'Andropogon': ['Andropogon gerardii'],
 'Bouteloua': ['Bouteloua curtipendula'],
 'Carex': ['Carex vulpinoidea'],
 'Elodea': ['Elodea canadensis'],
 'Helianthus': ['Helianthus annuus'],
 'Lemna': ['Lemna minor'],
 'Nymphaea': ['Nymphaea odorata'],
 'Persicaria': ['Persicaria lapathifolia'],
 'Salix': ['Salix caroliniana',
  'Salix eriocephala',
  'Salix interior',
  'Salix humilis',
  'Salix nigra'],
 'Schizachyrium': ['Schizachyrium scoparium'],
 'Spartina': ['Spartina pectinata']}

In [28]:
pb_dict = {}
pb_dict['Ambrosia'] = ['Abrosia artemisiifolia']
pb_dict['Amorpha'] = ['Amorpha fruticosa']
pb_dict['Bouteloua'] = ['Bouteloua curtipendula']
pb_dict['Carex'] = ['Carex vulpinoidea']
pb_dict['Elodea'] = ['Elodea canadensis']
pb_dict['Gleditsia'] = ['Gleditsia triacanthos']
pb_dict['Lemna'] = ['Lemna minor']
pb_dict['Persicaria'] = ['Persicaria lapathifolia']
pb_dict['Panicum'] = ['Panicum virgatum']
pb_dict['Tripsacum'] = ['Tripsacum dactyloides']
pb_dict['Typha'] = ['Typa latifolia']
pb_dict

{'Ambrosia': ['Abrosia artemisiifolia'],
 'Amorpha': ['Amorpha fruticosa'],
 'Bouteloua': ['Bouteloua curtipendula'],
 'Carex': ['Carex vulpinoidea'],
 'Elodea': ['Elodea canadensis'],
 'Gleditsia': ['Gleditsia triacanthos'],
 'Lemna': ['Lemna minor'],
 'Persicaria': ['Persicaria lapathifolia'],
 'Panicum': ['Panicum virgatum'],
 'Tripsacum': ['Tripsacum dactyloides'],
 'Typha': ['Typa latifolia']}

In [29]:
ni_dict = {}
ni_dict['Conyza'] = ['Conyza canadensis']
ni_dict['Eupatorium'] = ['Eupatorium capillifolium']
ni_dict['Helianthus'] = ['Helianthus annuus']
ni_dict['Lemna'] = ['Lemna minor']
ni_dict['Robinia'] = ['Robinia pseudoacacia']
ni_dict['Salix'] = ['Salix caroliniana', 'Salix eriocephala', 'Salix interior', 'Salix humilis', 'Salix nigra']
ni_dict['Packera'] = ['Packera pauperculus']
ni_dict['Solidago'] = ['Solidago hispida']
ni_dict

{'Conyza': ['Conyza canadensis'],
 'Eupatorium': ['Eupatorium capillifolium'],
 'Helianthus': ['Helianthus annuus'],
 'Lemna': ['Lemna minor'],
 'Robinia': ['Robinia pseudoacacia'],
 'Salix': ['Salix caroliniana',
  'Salix eriocephala',
  'Salix interior',
  'Salix humilis',
  'Salix nigra'],
 'Packera': ['Packera pauperculus'],
 'Solidago': ['Solidago hispida']}

In [30]:
zn_dict = {}
zn_dict['Bouteloua'] = ['Bouteloua curtipendula']
zn_dict['Conyza'] = ['Conyza canadensis']
zn_dict['Helianthus'] = ['Helianthus annuus']
zn_dict['Salix'] = ['Salix caroliniana', 'Salix eriocephala', 'Salix interior', 'Salix humilis', 'Salix nigra']
zn_dict['Tripsacum'] = ['Tripsacum dactyloides']
zn_dict

{'Bouteloua': ['Bouteloua curtipendula'],
 'Conyza': ['Conyza canadensis'],
 'Helianthus': ['Helianthus annuus'],
 'Salix': ['Salix caroliniana',
  'Salix eriocephala',
  'Salix interior',
  'Salix humilis',
  'Salix nigra'],
 'Tripsacum': ['Tripsacum dactyloides']}

In [None]:
#  List of dictionaries:
#  'air_dict',
#  'cd_dict',
#  'chlor_dict',
#  'cr_dict',
#  'cu_dict',
#  'ni_dict',
#  'pb_dict',
#  'pesticides_dict',
#  'petrol_dict',
#  'zn_dict'