In [1]:
import pandas as pd
from tabula import read_pdf
from tabulate import tabulate
import re

In [2]:
#create dataframe from USDA Plants Database Download
#USDA Plants Database Download includes comprehensive and general list of plants scientific and common names
#used as reference list to identify plant names in parsed pdfs and to pull in common names
# The path is relative to the notebook's working directory; the file is read with
# pandas' default CSV settings (comma-separated, first row as header).
plants_df = pd.read_csv('../data/USDA/USDA Plants Database Download.txt')

In [3]:
# Derive genus / species columns from the full scientific name.
# Each name is split on whitespace once: token 0 is taken as the genus and
# token 1 as the species; any remaining tokens (author text) are ignored.
name_tokens = [full_name.split() for full_name in plants_df['Scientific Name with Author']]
plants_df['Genus'] = [tokens[0] for tokens in name_tokens]
plants_df['Species'] = [tokens[1] for tokens in name_tokens]
# Two-word "Genus species" key used later to join against the parsed species
plants_df['Genus Species'] = plants_df['Genus'] + " " + plants_df['Species']
plants_df.head()

Unnamed: 0,Symbol,Synonym Symbol,Scientific Name with Author,Common Name,Family,Genus,Species,Genus Species
0,ABAB,,Abutilon abutiloides (Jacq.) Garcke ex Hochr.,shrubby Indian mallow,Malvaceae,Abutilon,abutiloides,Abutilon abutiloides
1,ABAB,ABAM5,Abutilon americanum (L.) Sweet,,,Abutilon,americanum,Abutilon americanum
2,ABAB,ABJA,Abutilon jacquinii G. Don,,,Abutilon,jacquinii,Abutilon jacquinii
3,ABAB,ABLI,Abutilon lignosum (Cav.) G. Don,,,Abutilon,lignosum,Abutilon lignosum
4,ABAB70,,Abietinella abietina (Hedw.) Fleisch.,abietinella moss,Thuidiaceae,Abietinella,abietina,Abietinella abietina


In [4]:
# Lookup table of "Genus Species" -> common name, merged in later on.
# Rows without a common name are dropped first, then only the first remaining
# row for each "Genus Species" value is kept.
plants_common = (
    plants_df.loc[:, ['Genus Species', 'Common Name']]
    .dropna(subset=['Common Name'])
    .drop_duplicates(subset=['Genus Species'])
)
plants_common

Unnamed: 0,Genus Species,Common Name
0,Abutilon abutiloides,shrubby Indian mallow
4,Abietinella abietina,abietinella moss
7,Abronia alpina,Ramshaw Meadows sand verbena
8,Abies alba,silver fir
9,Abies amabilis,Pacific silver fir
...,...,...
93141,Zygophlebia Bishop,octopus fern
93142,Zygodon gracilis,zygodon moss
93143,Zygodon reinwardtii,Reinwardt's zygodon moss
93145,Zygophlebia sectifrons,octopus fern


In [5]:
# Sorted list of the distinct genus names found in the USDA table
plants_genera = (
    plants_df['Genus']
    .drop_duplicates()
    .sort_values()
    .tolist()
)

In [6]:
# Parse the tables of the air-pollutants PDF with tabula's read_pdf, requesting every page.
# The result is later converted with str() and searched for genus names.
# NOTE(review): presumably a list of DataFrames (one per detected table) - confirm against the tabula docs.
airpdf = read_pdf("../data/WildOnes/PHYTOREMEDIATION OF AIR POLLUTANTS.pdf", pages = "all", multiple_tables=True)

In [7]:
# Parse the tables of the chlorinated-solvents PDF with tabula's read_pdf, requesting every page.
# The result is later converted with str() and searched for genus names.
# NOTE(review): presumably a list of DataFrames (one per detected table) - confirm against the tabula docs.
chlorpdf = read_pdf("../data/WildOnes/PHYTOREMEDIATION OF CHLORINATED SOLVENTS.pdf", pages = "all", multiple_tables=True)

In [8]:
# Parse the tables of the pesticides PDF with tabula's read_pdf, requesting every page.
# The result is later converted with str() and searched for genus names.
# NOTE(review): presumably a list of DataFrames (one per detected table) - confirm against the tabula docs.
pesticidespdf = read_pdf("../data/WildOnes/PHYTOREMEDIATION OF PESTICIDES.pdf", pages = "all", multiple_tables=True)

In [9]:
# Parse the tables of the petroleum PDF with tabula's read_pdf, requesting every page.
# The result is later converted with str() and searched for genus names.
# NOTE(review): presumably a list of DataFrames (one per detected table) - confirm against the tabula docs.
petrolpdf = read_pdf("../data/WildOnes/PHYTOREMEDIATION OF PETROLEUM.pdf", pages = "all", multiple_tables=True)

In [10]:
#create dictionary of species present in pdf by using regex to find word following plant genus name
#genus:[species]
#
# For every USDA genus the text form of the parsed PDF is searched
# case-insensitively. The capture keeps the genus, exactly one whitespace
# character and the (possibly empty) run of word characters that follows, up
# to the next non-word character. Genera with no match get no entry.
#
# NOTE(review): str() of the parsed tables goes through their display repr,
# which may abbreviate long tables/cells - confirm no species are lost there.

# Convert the parsed PDF to text once instead of once (twice) per genus
air_text = str(airpdf)

air_dict = {}

for genus in plants_genera:
    # Raw f-string so \s, \w and \W reach the regex engine without relying on
    # invalid string escapes; re.escape keeps the genus text literal.
    matches = re.findall(rf'({re.escape(genus)}\s\w*?)\W', air_text, re.IGNORECASE)
    if matches:
        air_dict[genus] = matches

In [11]:
# Manual clean-up of the regex results for the air PDF:
# remove four genus entries entirely (raises KeyError if one is missing) ...
for unwanted_genus in ('Acacia', 'Amelanchier', 'Tilia', 'Ulmus'):
    del air_dict[unwanted_genus]
# ... and replace whatever was captured for Juniperus with a single species.
air_dict['Juniperus'] = ['Juniperus virginiana']

In [12]:
#create dictionary of species present in pdf
#genus:[species]
#
# For every USDA genus the text form of the parsed PDF is searched
# case-insensitively. The capture keeps the genus, exactly one whitespace
# character and the (possibly empty) run of word characters that follows, up
# to the next non-word character. Genera with no match get no entry.
#
# NOTE(review): str() of the parsed tables goes through their display repr,
# which may abbreviate long tables/cells - confirm no species are lost there.

# Convert the parsed PDF to text once instead of once (twice) per genus
chlor_text = str(chlorpdf)

chlor_dict = {}

for genus in plants_genera:
    # Raw f-string so \s, \w and \W reach the regex engine without relying on
    # invalid string escapes; re.escape keeps the genus text literal.
    matches = re.findall(rf'({re.escape(genus)}\s\w*?)\W', chlor_text, re.IGNORECASE)
    if matches:
        chlor_dict[genus] = matches

In [13]:
# Manual clean-up of the regex results for the chlorinated-solvents PDF:
# overwrite individual captured strings (by list position) with the full
# species name. The positions refer to the match lists built above.
for genus_key, position, species_name in (
    ('Liquidambar', 0, 'Liquidambar styraciflua'),
    ('Platanus', 0, 'Platanus occidentalis'),
    ('Solidago', 6, 'Solidago missouriensis'),
    ('Tripsacum', 0, 'Tripsacum dactyloides'),
):
    chlor_dict[genus_key][position] = species_name

In [14]:
#clean
# Remove the 'Radula' entry from the dictionary. As the last expression of the
# cell, the removed list is displayed as the cell output.
# Raises KeyError if 'Radula' is not present (e.g. when the cell is run a second time).
chlor_dict.pop('Radula')

['radula ']

In [15]:
#create dictionary of species present in pdf by using regex to find word following plant genus name
#genus:[species]
#
# For every USDA genus the text form of the parsed PDF is searched
# case-insensitively. The capture keeps the genus, exactly one whitespace
# character and the (possibly empty) run of word characters that follows, up
# to the next non-word character. Genera with no match get no entry.
#
# NOTE(review): str() of the parsed tables goes through their display repr,
# which may abbreviate long tables/cells - confirm no species are lost there.

# Convert the parsed PDF to text once instead of once (twice) per genus
pesticides_text = str(pesticidespdf)

pesticides_dict = {}

for genus in plants_genera:
    # Raw f-string so \s, \w and \W reach the regex engine without relying on
    # invalid string escapes; re.escape keeps the genus text literal.
    matches = re.findall(rf'({re.escape(genus)}\s\w*?)\W', pesticides_text, re.IGNORECASE)
    if matches:
        pesticides_dict[genus] = matches

In [16]:
# Manual clean-up of the regex results for the pesticides PDF.
# First: overwrite the first captured string of these genera with the full species name.
for genus_key, species_name in (
    ('Andropogon', 'Andropogon gerardii'),
    ('Ceratophyllum', 'Ceratophyllum demersum'),
    ('Elodea', 'Elodea canadensis'),
    ('Panicum', 'Panicum virgatum'),
    ('Populus', 'Populus deltoides'),
):
    pesticides_dict[genus_key][0] = species_name

# Second: replace the whole match list of these genera with a hand-written list.
pesticides_dict.update({
    'Salix': ['Salix nigra'],
    'Sorghastrum': ['Sorghastrum nutans'],
    'Tripsacum': ['Tripsacum dactyloides'],
    'Typha': ['Typha domingensis', 'Typha latifolia'],
})

In [17]:
#create dictionary of species present in pdf by using regex to find word following plant genus name
#genus:[species]
#
# For every USDA genus the text form of the parsed PDF is searched
# case-insensitively. The capture keeps the genus, exactly one whitespace
# character and the (possibly empty) run of word characters that follows, up
# to the next non-word character. Genera with no match get no entry.
#
# NOTE(review): str() of the parsed tables goes through their display repr,
# which may abbreviate long tables/cells - confirm no species are lost there.

# Convert the parsed PDF to text once instead of once (twice) per genus
petrol_text = str(petrolpdf)

petrol_dict = {}

for genus in plants_genera:
    # Raw f-string so \s, \w and \W reach the regex engine without relying on
    # invalid string escapes; re.escape keeps the genus text literal.
    matches = re.findall(rf'({re.escape(genus)}\s\w*?)\W', petrol_text, re.IGNORECASE)
    if matches:
        petrol_dict[genus] = matches

In [18]:
# Manual clean-up of the regex results for the petroleum PDF:
# overwrite individual captured strings (by list position) with the full
# species name. The positions refer to the match lists built above.
for genus_key, position, species_name in (
    ('Bouteloua', 0, 'Bouteloua curtipendula'),
    ('Fraxinus', 0, 'Fraxinus pennsylvanica'),
    ('Robinia', 0, 'Robinia pseudoacacia'),
    ('Schizachyrium', 0, 'Schizachyrium scoparium'),
    ('Solidago', 6, 'Solidago missouriensis'),
    ('Tripsacum', 0, 'Tripsacum dactyloides'),
):
    petrol_dict[genus_key][position] = species_name

In [19]:
# Remove genus entries from the petroleum dictionary (KeyError if one is missing).
for unwanted_genus in ('Acacia', 'Erinus', 'Hystrix'):
    del petrol_dict[unwanted_genus]
# Last removal uses pop so the removed list is displayed as the cell output.
petrol_dict.pop('Radula')

['radula TPH']

In [20]:
# Heavy metals PDF is a special case: it covers several metals, which are better kept as distinct categories.
# The per-metal dictionaries were therefore created manually; the commented-out code below is an example of what was attempted first.

# file = "../data/WildOnes/PHYTOREMEDIATION OF HEAVY METALS.pdf"
# metalsall_df = read_pdf(file, lattice = True, pages="all")
# cd_df = read_pdf(file, lattice = True, pages=[1,2,3])
# cr_df = read_pdf(file, lattice = True, pages=4)
# cu_df = read_pdf(file, lattice = True, pages=5)
# pb_df = read_pdf(file, lattice = True, pages=6)
# ni_df = read_pdf(file, lattice = True, pages=7)
# zn_df = read_pdf(file, lattice = True, pages=8)

In [21]:
# example of code that was initially attempted before deciding manual
# for genus in plants_genera:
#        if len(re.findall(f'({genus}\\s[a-z]*?)\W', str(cr_df))) > 0:
#         cr_dict[genus] = re.findall(f'({genus}\\s[a-z]*?)\W', str(cr_df))  

In [22]:
# Manually curated genus -> species lists for cadmium (Cd)
cd_dict = {
    'Achillea': ['Achillea millefolium'],
    'Carex': ['Carex vulpinoidea'],
    'Conyza': ['Conyza canadensis'],
    'Elodea': ['Elodea canadensis'],
    'Eupatorium': ['Eupatorium capillifolium'],
    'Helianthus': ['Helianthus annuus', 'Helianthus tuberosus'],
    'Ilex': ['Ilex decidua', 'Ilex opaca'],
    'Panicum': ['Panicum virgatum'],
    'Salix': [
        'Salix caroliniana',
        'Salix eriocephala',
        'Salix interior',
        'Salix humilis',
        'Salix nigra',
    ],
    'Spartina': ['Spartina pectinata'],
    'Spirodela': ['Spirodela polyrhiza'],
    'Viola': [
        'Viola bicolor',
        'Viola lanceolata',
        'Viola missouriensis',
        'Viola nephrophylla',
        'Viola palmata',
        'Viola pedata',
        'Viola pubescens',
        'Viola sagittata',
        'Viola sororia',
        'Viola striata',
        'Viola subsinuata',
        'Viola villosa',
    ],
}

In [23]:
# Manually curated genus -> species lists for chromium (Cr).
# Each species is listed once: the genus-level table counts list entries with
# len(), so repeated names would inflate the Cr species count for Salix.
cr_dict = {
    'Lemna': ['Lemna minor'],
    'Nymphaea': ['Nymphaea odorata'],
    'Persicaria': ['Persicaria lapathifolia'],
    'Salix': ['Salix interior', 'Salix humilis', 'Salix nigra'],
}

In [24]:
# Manually curated genus -> species lists for copper (Cu)
cu_dict = {
    'Amorpha': ['Amorpha fruticosa'],
    'Andropogon': ['Andropogon gerardii'],
    'Bouteloua': ['Bouteloua curtipendula'],
    'Carex': ['Carex vulpinoidea'],
    'Elodea': ['Elodea canadensis'],
    'Helianthus': ['Helianthus annuus'],
    'Lemna': ['Lemna minor'],
    'Nymphaea': ['Nymphaea odorata'],
    'Persicaria': ['Persicaria lapathifolia'],
    'Salix': [
        'Salix caroliniana',
        'Salix eriocephala',
        'Salix interior',
        'Salix humilis',
        'Salix nigra',
    ],
    'Schizachyrium': ['Schizachyrium scoparium'],
    'Spartina': ['Spartina pectinata'],
}

In [25]:
# Manually curated genus -> species lists for lead (Pb)
pb_dict = {
    'Ambrosia': ['Ambrosia artemisiifolia'],
    'Amorpha': ['Amorpha fruticosa'],
    'Bouteloua': ['Bouteloua curtipendula'],
    'Carex': ['Carex vulpinoidea'],
    'Elodea': ['Elodea canadensis'],
    'Gleditsia': ['Gleditsia triacanthos'],
    'Lemna': ['Lemna minor'],
    'Persicaria': ['Persicaria lapathifolia'],
    'Panicum': ['Panicum virgatum'],
    'Tripsacum': ['Tripsacum dactyloides'],
    'Typha': ['Typha latifolia'],
}

In [26]:
# Manually curated genus -> species lists for nickel (Ni).
# Species strings are later joined against the USDA "Genus Species" names to
# pull in common names, so they must use the USDA spelling
# ('Packera paupercula', not 'Packera pauperculus').
ni_dict = {
    'Conyza': ['Conyza canadensis'],
    'Eupatorium': ['Eupatorium capillifolium'],
    'Helianthus': ['Helianthus annuus'],
    'Lemna': ['Lemna minor'],
    'Robinia': ['Robinia pseudoacacia'],
    'Salix': [
        'Salix caroliniana',
        'Salix eriocephala',
        'Salix interior',
        'Salix humilis',
        'Salix nigra',
    ],
    'Packera': ['Packera paupercula'],
    'Solidago': ['Solidago hispida'],
}

In [27]:
# Manually curated genus -> species lists for zinc (Zn)
zn_dict = {
    'Bouteloua': ['Bouteloua curtipendula'],
    'Conyza': ['Conyza canadensis'],
    'Helianthus': ['Helianthus annuus'],
    'Salix': [
        'Salix caroliniana',
        'Salix eriocephala',
        'Salix interior',
        'Salix humilis',
        'Salix nigra',
    ],
    'Tripsacum': ['Tripsacum dactyloides'],
}

In [28]:
# One list of genus names (the dictionary keys, in insertion order) per category
air_keys = list(air_dict)
cd_keys = list(cd_dict)
chlor_keys = list(chlor_dict)
cr_keys = list(cr_dict)
cu_keys = list(cu_dict)
ni_keys = list(ni_dict)
pb_keys = list(pb_dict)
pesticides_keys = list(pesticides_dict)
petrol_keys = list(petrol_dict)
zn_keys = list(zn_dict)

In [29]:
#create list of all genera in all categories to create dataframe
# Pool the genus names of every category into one set so that each membership
# test is a single hash lookup instead of up to ten list scans.
remediating_genera = set().union(
    air_keys,
    cd_keys,
    chlor_keys,
    cr_keys,
    cu_keys,
    ni_keys,
    pb_keys,
    pesticides_keys,
    petrol_keys,
    zn_keys,
)

# Walk the USDA genus list so the result keeps its order; a genus is included
# once if it appears in at least one category.
phyto_genera = [genus for genus in plants_genera if genus in remediating_genera]

In [30]:
#dataframe of all genera in all phytoremediation categories
# Name the single column at construction time; assigning .columns afterwards
# fails with a length mismatch when the genus list is empty.
phyto_df = pd.DataFrame(phyto_genera, columns=["genera"])
phyto_df.head(3)

Unnamed: 0,genera
0,Acer
1,Achillea
2,Ambrosia


In [31]:
# Add one empty-string placeholder column per pollution category, followed by
# the column that will hold the number of categories per genus.
for column_name in (
    'air',
    'cd',
    'chlor',
    'cr',
    'cu',
    'ni',
    'pb',
    'pesticides',
    'petrol',
    'zn',
    'known_uses_count',
):
    phyto_df[column_name] = ""

In [32]:
#species list of lists
# Starts empty; the genus counting loop appends one species list per
# (genus, category) hit, and the result is flattened afterwards.
species_lol = []

In [33]:
#count number of species known to remediate each pollution category
#count number of categories each genus is known to remediate
#
# For every genus row and every category:
#   * if the genus is a key of that category's dictionary, store the length of
#     its species list in the category column, count the category as a known
#     use and append the species list to species_lol;
#   * otherwise store 0.
# The per-genus number of categories goes into 'known_uses_count'.
# Note: len() counts list entries, so a species listed twice is counted twice.

# Column name -> dictionary, in the column order of phyto_df
genus_category_dicts = {
    'air': air_dict,
    'cd': cd_dict,
    'chlor': chlor_dict,
    'cr': cr_dict,
    'cu': cu_dict,
    'ni': ni_dict,
    'pb': pb_dict,
    'pesticides': pesticides_dict,
    'petrol': petrol_dict,
    'zn': zn_dict,
}

for row_label, row in phyto_df.iterrows():
    genus_name = row['genera']
    count = 0

    for category, species_by_genus in genus_category_dicts.items():
        if genus_name in species_by_genus:
            genus_species = species_by_genus[genus_name]
            phyto_df.at[row_label, category] = len(genus_species)
            count += 1
            species_lol.append(genus_species)
        else:
            phyto_df.at[row_label, category] = 0

    # Write through the DataFrame itself: the row yielded by iterrows() may be
    # a copy, in which case assigning to it would not change phyto_df.
    phyto_df.at[row_label, 'known_uses_count'] = count

In [34]:
#rename
# Plain assignment: phyto_genera_df is a second name for the same DataFrame
# object as phyto_df (no copy is made), so changes made through either name
# are visible through both.
phyto_genera_df = phyto_df
phyto_genera_df.head()

Unnamed: 0,genera,air,cd,chlor,cr,cu,ni,pb,pesticides,petrol,zn,known_uses_count
0,Acer,0,0,1,0,0,0,0,0,0,0,1
1,Achillea,0,1,0,0,0,0,0,0,0,0,1
2,Ambrosia,0,0,0,0,0,0,1,0,0,0,1
3,Amorpha,0,0,0,0,1,0,1,0,0,0,2
4,Andropogon,0,0,0,0,1,0,0,1,1,0,3


In [35]:
#create flat list of unique species from list of lists
species_list = [item for sublist in species_lol for item in sublist]
# dict.fromkeys keeps the first occurrence of each name in its original order,
# replacing a side-effect-only list comprehension with quadratic `not in` scans.
phyto_species = list(dict.fromkeys(species_list))
phyto_species[0:3]

['Acer saccharinum', 'Achillea millefolium', 'Ambrosia artemisiifolia']

In [36]:
#create genus list
# Take the text before the first whitespace of each species string. The regex
# that produced these strings accepts any whitespace character (\s) after the
# genus, so splitting on whitespace in general avoids the ValueError that
# searching for a literal space raises on e.g. a tab or newline separator.
sl2 = [species_name.split(None, 1)[0] for species_name in phyto_species]

In [37]:
# One row per unique species, with its genus alongside
species_df = pd.DataFrame({'genus': sl2, 'species': phyto_species})

# Add one empty-string placeholder column per pollution category, followed by
# the column that will hold the number of categories per species.
for column_name in (
    'air',
    'cd',
    'chlor',
    'cr',
    'cu',
    'ni',
    'pb',
    'pesticides',
    'petrol',
    'zn',
    'known_uses_count',
):
    species_df[column_name] = ""

In [38]:
#flag the pollution categories each species is listed under
#count number of categories each species is listed under
#
# A species is flagged for a category when its genus is a key of that
# category's dictionary and the species string is in the genus' list. The
# flag is a short text label written into the category column; columns of
# categories that do not apply keep their empty-string placeholder.

# Column name -> (dictionary, label written into the column)
species_category_dicts = {
    'air': (air_dict, 'air'),
    'cd': (cd_dict, 'cd'),
    'chlor': (chlor_dict, 'chlor'),
    'cr': (cr_dict, 'cr'),
    'cu': (cu_dict, 'cu'),
    'ni': (ni_dict, 'ni'),
    'pb': (pb_dict, 'pb'),
    'pesticides': (pesticides_dict, 'pest'),
    'petrol': (petrol_dict, 'petrol'),
    'zn': (zn_dict, 'zn'),
}

for row_label, row in species_df.iterrows():
    genus_name = row['genus']
    species_name = row['species']
    count = 0

    for column_name, (species_by_genus, label) in species_category_dicts.items():
        # A genus that is not a key contributes an empty tuple, i.e. no match
        if species_name in species_by_genus.get(genus_name, ()):
            species_df.at[row_label, column_name] = label
            count += 1

    # Write through the DataFrame itself: the row yielded by iterrows() may be
    # a copy, in which case assigning to it would not change species_df.
    species_df.at[row_label, 'known_uses_count'] = count

species_df.head()

Unnamed: 0,genus,species,air,cd,chlor,cr,cu,ni,pb,pesticides,petrol,zn,known_uses_count
0,Acer,Acer saccharinum,,,chlor,,,,,,,,1
1,Achillea,Achillea millefolium,,cd,,,,,,,,,1
2,Ambrosia,Ambrosia artemisiifolia,,,,,,,pb,,,,1
3,Amorpha,Amorpha fruticosa,,,,,cu,,pb,,,,2
4,Andropogon,Andropogon gerardii,,,,,cu,,,pest,petrol,,3


In [39]:
# Attach the USDA common name to every species row. A right join keeps all
# rows of species_df; species without a USDA match get a missing common name.
plants_merge = plants_common.merge(
    species_df,
    how='right',
    left_on='Genus Species',
    right_on='species',
)

In [40]:
# Keep only the output columns, in output order, and lower-case the common-name header
output_columns = [
    'genus',
    'species',
    'Common Name',
    'air',
    'cd',
    'chlor',
    'cr',
    'cu',
    'ni',
    'pb',
    'pesticides',
    'petrol',
    'zn',
    'known_uses_count',
]
phyto_species_df = plants_merge[output_columns].rename(columns={'Common Name': 'common name'})

In [41]:
# Remove leading/trailing whitespace from the name columns of both output tables
for frame, column_name in (
    (phyto_genera_df, 'genera'),
    (phyto_species_df, 'genus'),
    (phyto_species_df, 'species'),
):
    frame[column_name] = frame[column_name].str.strip()

In [42]:
#final products:
#phyto_species_df
#phyto_genera_df

# Preview the first rows of the species-level table
phyto_species_df.head()

Unnamed: 0,genus,species,common name,air,cd,chlor,cr,cu,ni,pb,pesticides,petrol,zn,known_uses_count
0,Acer,Acer saccharinum,silver maple,,,chlor,,,,,,,,1
1,Achillea,Achillea millefolium,common yarrow,,cd,,,,,,,,,1
2,Ambrosia,Ambrosia artemisiifolia,annual ragweed,,,,,,,pb,,,,1
3,Amorpha,Amorpha fruticosa,false indigo bush,,,,,cu,,pb,,,,2
4,Andropogon,Andropogon gerardii,big bluestem,,,,,cu,,,pest,petrol,,3


In [43]:
# Preview the first rows of the genus-level table
phyto_genera_df.head()

Unnamed: 0,genera,air,cd,chlor,cr,cu,ni,pb,pesticides,petrol,zn,known_uses_count
0,Acer,0,0,1,0,0,0,0,0,0,0,1
1,Achillea,0,1,0,0,0,0,0,0,0,0,1
2,Ambrosia,0,0,0,0,0,0,1,0,0,0,1
3,Amorpha,0,0,0,0,1,0,1,0,0,0,2
4,Andropogon,0,0,0,0,1,0,0,1,1,0,3


In [44]:
#commenting out since files already generated
#phyto_genera_df.to_csv('../data/output csv/phyto_genera_trim.csv')
#phyto_species_df.to_csv('../data/output csv/phyto_species_trim.csv')