# Goal

The goal is to scrape Wildflower.org for both all it's information and to fill in the missing data in our source data

# Imports and Data

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests 
import re
import numpy as np

In [105]:
full_data = pd.read_excel("../../Data/ERA_Alabama.xlsx",sheet_name="All Plants")

In [4]:
full_data.head()

Unnamed: 0,Plant Type,Scientific Name,Common Name,Plant Family,USDA Symbol,Native Status,Distribution in USA,Workhorse,Flower Color,Showy,...,Butterflies,Monarchs,Nesting and Structure (Bees),Larval Host (Monarch),Larval Host (Butterfly),Larval Host (Moth),Larval Species (Lepidoptera),Hummingbirds,Bats,Wind
0,"Tree, Shrub",Acacia farnesiana,sweet acacia,Fabaceae,ACFA,"L48 (NI), HI (I), PR (N), VI (N)","AL, AZ, CA, FL, GA, HI, LA, MS, NM, TX",Pollinator,,,...,,,,,,Yes,"Amyelois transitella, Ectomyelois ceratoniae, ...",,Yes,
1,Herb (annual),Acalypha rhomboidea,common threeseed mercury,Euphorbiaceae,ACRH,"L48 (N), CAN (N)","AL, AR, CT, DC, DE, FL, GA, IA, IL, IN, KS, KY...",,,Yes,...,,,,,,Yes,Orthonama obstipata,,,
2,Herb (annual),Acalypha virginica,Virginia threeseed mercury,Euphorbiaceae,ACVI,L48 (N),"AL, AR, CT, DC, DE, GA, IA, IL, IN, KS, KY, LA...",,Red,Yes,...,,,,,,,,,,
3,Tree,Acer floridanum,southern sugar maple,Aceraceae,ACFL,L48 (N),"AL, AR, FL, GA, IL, KY, LA, MO, MS, NC, OK, SC...",,Yellow,No,...,,,,,,,,,,Yes
4,Tree,Acer negundo,boxelder,Aceraceae,ACNE2,"L48 (N), CAN (N)","AL, AR, AZ, CA, CO, CT, DC, DE, FL, GA, IA, ID...",Workhorse/Pollinator,White,No,...,,,Yes,,Yes,Yes,"Abagrotis barnesi, Abagrotis orbis, Abagrotis ...",,,Yes


In [5]:
sample_data = full_data.sample(10)

In [6]:
sample_data.head()

Unnamed: 0,Plant Type,Scientific Name,Common Name,Plant Family,USDA Symbol,Native Status,Distribution in USA,Workhorse,Flower Color,Showy,...,Butterflies,Monarchs,Nesting and Structure (Bees),Larval Host (Monarch),Larval Host (Butterfly),Larval Host (Moth),Larval Species (Lepidoptera),Hummingbirds,Bats,Wind
1104,Herb (annual or perennial),Pluchea camphorata,camphor pluchea,Asteraceae,PLCA7,L48 (N),"AL, AR, DE, FL, GA, IL, IN, KS, KY, LA, MD, MO...",,,No,...,,,,,,,,,,
1045,Herb (annual),Phacelia ranunculacea,oceanblue phacelia,Hydrophyllaceae,PHRA3,L48 (N),"AL, AR, IL, IN, KY, MO, OH, TN, WV",,,,...,,,,,,Yes,Ethmia macelhosiella,,,
609,Tree,Fraxinus americana,white ash,Oleaceae,FRAM2,"L48 (N), HI (I), CAN (N)","AL, AR, CO, CT, DC, DE, FL, GA, HI, IA, IL, IN...",Pollinator,Yellow,No,...,,,,,Yes,Yes,"Achatia distincta, Adita chionanthi, Alsophila...",,,
101,Herb (perennial),Aruncus dioicus,bride's feathers,Rosaceae,ARDI8,"L48 (NI), AK (N), CAN (N)","AK, AL, AR, CA, DC, GA, IA, IL, IN, KY, MA, MD...",Workhorse/Pollinator,White,,...,Yes,,,,Yes,Yes,"Celastrina ebenina, Celastrina nigra, Endopiza...",Yes,,
1124,Herb (perennial),Pontederia cordata,pickerelweed,Pontederiaceae,POCO14,"L48 (N), CAN (N)","AL, AR, CT, DC, DE, FL, GA, IA, IL, IN, KS, KY...",Workhorse/Pollinator,Purple,Yes,...,Yes,,,,,Yes,"Bellura densa, Bellura gortynoides, Bellura ob...",Yes,,


In [7]:
sample_condensed = sample_data[["Scientific Name","Common Name","USDA Symbol"]]
sample_condensed.to_csv("test.csv")

# Approach/Notes

Each plant is hosted on a url following a uniform structure of base + USDA symbol - https://www.wildflower.org/plants/result.php?id_plant=EUCY. The initial plan to navigate the site is to iterate through the symbol in our test data, appending each to 
the base url, and use the requests module to pull up the plant specific page. Then, we'll use beautiful soup to scrape the page. 

The focus is on grabbing information under the Synonyms, Plant Characteristics, Bloom Information, Growing Conditions and Benefit headers.

# Building out approach

In [8]:
BASE_URL = "https://www.wildflower.org/plants/result.php?id_plant="
HEADERS = {
    "Accept-Language": "en-US,en;q=0.9",
    "User-Agent": "Chrome/87.0.4280.141"
}

In [9]:
# Get the Page text for the symbol using http request module
full_url = BASE_URL + "EUCY"
response = requests.get(url = full_url, headers = HEADERS)
page = response.text 
soup = BeautifulSoup(page,features='lxml')

The synonyms are in a span tag with each synonym in an italicized tag. They are the only objects of this structure on the page.

In [10]:
soup.select('span i')

[<i>Euphorbia barbellata</i>,
 <i>Euphorbia graminifolia</i>,
 <i>Euphorbia havanensis</i>,
 <i>Euphorbia heterophylla</i>,
 <i>barbellata</i>,
 <i>Euphorbia heterophylla</i>,
 <i>cyathophora</i>,
 <i>Euphorbia heterophylla</i>,
 <i>graminifolia</i>,
 <i>Poinsettia barbellata</i>,
 <i>Poinsettia cyathophora</i>,
 <i>Poinsettia cyathophora</i>,
 <i>graminifolia</i>,
 <i>Poinsettia graminifolia</i>,
 <i>Poinsettia havanensis</i>]

The header (i.e. Plant Characteristics) name is in an h4 tag, the subheader (i.e. Habit) is in a strong tag. The subheaders values (i.e. Herb) are either in an anchor tag or unstructured text

In [11]:
soup.select('div h4')[:5]

[<h4>From the Image Gallery</h4>,
 <h4>Plant Characteristics</h4>,
 <h4>Bloom Information</h4>,
 <h4>Distribution</h4>,
 <h4>Growing Conditions</h4>]

In [12]:
all_sections = soup.find_all('div',class_="section")

In [13]:
list_of_relevant_sections = ["Plant Characteristics", "Bloom Information", "Growing Conditions", "Benefit"]

In [14]:
#Create a dictionary with key = section_name, value = BS object corresponding to that section on the page. 

named_section_dict = {}
for section in all_sections:
    section_name = section.select_one('h4').text
    if section_name in list_of_relevant_sections:
        named_section_dict[section_name] = section



In [15]:
named_section_dict["Benefit"]

<div class="section" style="float:left;width:97.3%;"><h4>Benefit</h4><strong>Use Ornamental:</strong> Accent, Garden.
A good filler plant but can be invasive. <br/><strong>Conspicuous Flowers:</strong> yes<br/><strong>Interesting Foliage:</strong> yes<br/></div>

All text in a strong tag is a subheader, text between the closing strong tag and the br tag are each subheaders value. I'll use regex to grab these

In [16]:
#Subheaders
subheaders = [subheader.text.replace(":","") for subheader in  named_section_dict["Benefit"].select('strong')]
subheaders

['Use Ornamental', 'Conspicuous Flowers', 'Interesting Foliage']

In [17]:
# Subheader Values
subheader_values = re.findall("<\/strong>[\w\s\\.\-\,]+<br\/>",str(named_section_dict["Benefit"]))
subheader_values

['</strong> Accent, Garden.\nA good filler plant but can be invasive. <br/>',
 '</strong> yes<br/>',
 '</strong> yes<br/>']

Now we need to write some code to clean these strings and deal with scenarios when the data is in an anchor tag. 

In [18]:
test_section = str(named_section_dict["Plant Characteristics"])

In [19]:
test_section

'<div class="section" style="float:left;width:97.3%;"><h4>Plant Characteristics</h4><strong>Duration:</strong> <a class="glossary_link" onclick="glossary(this)" title="Annual (A species that grows from seed, flowers, fruits and dies within one year\'s time.  See also, Winter Annual.                                )">Annual</a> <br/><strong>Habit:</strong> <a class="glossary_link" onclick="glossary(this)" title="Herb (A plant species lacking woody tissue when mature.                      )">Herb</a> <br/><strong>Leaf Shape:</strong> <a class="glossary_link" onclick="glossary(this)" title="Oblanceolate (Inversely lanceolate with top wider than bottom.                      )">Oblanceolate</a> <br/><strong>Leaf Pubescence:</strong> <a class="glossary_link" onclick="glossary(this)" title="Glabrous (Smooth; hairless.)">Glabrous</a> <br/><strong>Leaf Texture:</strong> Smooth <br/><strong>Leaf:</strong> The upper or bracteal leaves usually red toward the base. <br/><strong>Flower:</strong> Flo

In [20]:
# Approach to catch text between anchor tags.

string = re.sub("<a.*?>","",test_section) #Remove all characters within starting a tag
string = re.sub("</a>","",string) #Remove all characters within closing a tag

In [21]:
string

'<div class="section" style="float:left;width:97.3%;"><h4>Plant Characteristics</h4><strong>Duration:</strong> Annual <br/><strong>Habit:</strong> Herb <br/><strong>Leaf Shape:</strong> Oblanceolate <br/><strong>Leaf Pubescence:</strong> Glabrous <br/><strong>Leaf Texture:</strong> Smooth <br/><strong>Leaf:</strong> The upper or bracteal leaves usually red toward the base. <br/><strong>Flower:</strong> Flowers 3-5 mm <br/><strong>Fruit:</strong> 4.5-5 mm <br/><strong>Size Class:</strong> 1-3 ft. <br/></div>'

In [22]:
def clean_subheader_values(string):
    string = str(string).replace("</strong>","").replace("<br/>","")
    new_string = ",".join(string.split(" , ")) #Remove spacing with csv's
    return new_string

string = re.sub("<a.*?>","",str(test_section))
string = re.sub("</a>","",string)
subheader_values = [clean_subheader_values(val) for val in re.findall("<\/strong>[\w\s\\.\-\,]+<br\/>",string)]


In [23]:
subheaders = [subheader.text.replace(":","") for subheader in  named_section_dict["Plant Characteristics"].select('strong')]


for head,val in list(zip(subheaders,subheader_values)):
    print(head,":",val)

Duration :  Annual 
Habit :  Herb 
Leaf Shape :  Oblanceolate 
Leaf Pubescence :  Glabrous 
Leaf Texture :  Smooth 
Leaf :  The upper or bracteal leaves usually red toward the base. 
Flower :  Flowers 3-5 mm 
Fruit :  4.5-5 mm 
Size Class :  1-3 ft. 


Repeat process for each section

In [24]:
lst_of_section_subheaders = [[subheader.text.replace(":","") for subheader in  named_section_dict[section].select('strong')] for section in named_section_dict.keys()]

list_of_section_values = [
    [clean_subheader_values(val) for val in re.findall("<\/strong>[\w\s\\.\-\,]+<br\/>",re.sub("</a>","",re.sub("<a.*?>","",str(named_section_dict[section]))))] for section in named_section_dict.keys()]


In [25]:
for section_subheaders_lst, section_values_lst in list(zip(lst_of_section_subheaders,list_of_section_values)):
    for head,val in list(zip(section_subheaders_lst,section_values_lst)):
        print(head,":",val)

Duration :  Annual 
Habit :  Herb 
Leaf Shape :  Oblanceolate 
Leaf Pubescence :  Glabrous 
Leaf Texture :  Smooth 
Leaf :  The upper or bracteal leaves usually red toward the base. 
Flower :  Flowers 3-5 mm 
Fruit :  4.5-5 mm 
Size Class :  1-3 ft. 
Bloom Color :  Yellow,Green 
Bloom Time :  May,Jun,Jul,Aug,Sep,Oct,Nov 
Water Use :  Medium 
Light Requirement :  Sun 
Soil Moisture :  Moist 
Soil Description :  Sandy, Sandy Loam, Medium Loam, Clay Loam 
Use Ornamental :  Accent, Garden.
A good filler plant but can be invasive. 
Conspicuous Flowers :  yes
Interesting Foliage :  yes


Run solution on sample data 

In [26]:
list_of_page_soups = []

for symbol in sample_condensed["USDA Symbol"].to_list():
        
    full_url = BASE_URL + symbol
    response = requests.get(url = full_url, headers = HEADERS)
    list_of_page_soups.append(BeautifulSoup(response.text,features='lxml'))



In [27]:
for page_soup in list_of_page_soups:
    print(page_soup.select('div h4')[:5])

[<h4>From the Image Gallery</h4>, <h4>Plant Characteristics</h4>, <h4>Bloom Information</h4>, <h4>Distribution</h4>, <h4>National Wetland Indicator Status</h4>]
[<h4>From the Image Gallery</h4>, <h4>Plant Characteristics</h4>, <h4>Bloom Information</h4>, <h4>Distribution</h4>, <h4>National Wetland Indicator Status</h4>]
[<h4>From the Image Gallery</h4>, <h4>Plant Characteristics</h4>, <h4>Bloom Information</h4>, <h4>Distribution</h4>, <h4>Growing Conditions</h4>]
[<h4>From the Image Gallery</h4>, <h4>Plant Characteristics</h4>, <h4>Bloom Information</h4>, <h4>Distribution</h4>, <h4>Growing Conditions</h4>]
[<h4>From the Image Gallery</h4>, <h4>Plant Characteristics</h4>, <h4>Bloom Information</h4>, <h4>Distribution</h4>, <h4>Growing Conditions</h4>]
[<h4>From the Image Gallery</h4>, <h4>Plant Characteristics</h4>, <h4>Bloom Information</h4>, <h4>Distribution</h4>, <h4>Growing Conditions</h4>]
[<h4>From the Image Gallery</h4>, <h4>Plant Characteristics</h4>, <h4>Bloom Information</h4>, 

In [29]:
# list_of_relevant_sections = ["Plant Characteristics", "Bloom Information", "Growing Conditions", "Benefit","Value to Beneficial Insects"] Insects section rare and does not fit regex pattern
list_of_relevant_sections = ["Plant Characteristics", "Bloom Information", "Growing Conditions", "Benefit","Propagation"]

for page_soup in list_of_page_soups:

    all_sections = soup.find_all('div',class_="section")

    named_section_dict = {}
    for section in all_sections:
        section_name = section.select_one('h4').text
        if section_name in list_of_relevant_sections:
            named_section_dict[section_name] = section
    
    lst_of_section_subheaders = [[subheader.text.replace(":","") for subheader in  named_section_dict[section].select('strong')] for section in named_section_dict.keys()]

    list_of_section_values = [
        [clean_subheader_values(val) for val in re.findall("<\/strong>[\w\s\\.\-\,]+<br\/>",re.sub("</a>","",re.sub("<a.*?>","",str(named_section_dict[section]))))] for section in named_section_dict.keys()]


    for section_subheaders_lst, section_values_lst in list(zip(lst_of_section_subheaders,list_of_section_values)):
        for head,val in list(zip(section_subheaders_lst,section_values_lst)):
            print(head,":",val)

Duration :  Annual 
Habit :  Herb 
Leaf Shape :  Oblanceolate 
Leaf Pubescence :  Glabrous 
Leaf Texture :  Smooth 
Leaf :  The upper or bracteal leaves usually red toward the base. 
Flower :  Flowers 3-5 mm 
Fruit :  4.5-5 mm 
Size Class :  1-3 ft. 
Bloom Color :  Yellow,Green 
Bloom Time :  May,Jun,Jul,Aug,Sep,Oct,Nov 
Water Use :  Medium 
Light Requirement :  Sun 
Soil Moisture :  Moist 
Soil Description :  Sandy, Sandy Loam, Medium Loam, Clay Loam 
Use Ornamental :  Accent, Garden.
A good filler plant but can be invasive. 
Conspicuous Flowers :  yes
Interesting Foliage :  yes
Duration :  Annual 
Habit :  Herb 
Leaf Shape :  Oblanceolate 
Leaf Pubescence :  Glabrous 
Leaf Texture :  Smooth 
Leaf :  The upper or bracteal leaves usually red toward the base. 
Flower :  Flowers 3-5 mm 
Fruit :  4.5-5 mm 
Size Class :  1-3 ft. 
Bloom Color :  Yellow,Green 
Bloom Time :  May,Jun,Jul,Aug,Sep,Oct,Nov 
Water Use :  Medium 
Light Requirement :  Sun 
Soil Moisture :  Moist 
Soil Description : 

Now we want to find a way to store a single plants data

In [30]:
all_sections = soup.find_all('div',class_="section")

named_section_dict = {}
for section in all_sections:
    section_name = section.select_one('h4').text
    if section_name in list_of_relevant_sections:
        named_section_dict[section_name] = section


lst_of_section_subheaders = []

for section in named_section_dict.keys():
    for subheader in  named_section_dict[section].select('strong'):
        lst_of_section_subheaders.append(subheader.text.replace(":",""))

list_of_section_values = []
for section in named_section_dict.keys():
    full_section_cleaned = re.sub("</a>","",re.sub("<a.*?>","",str(named_section_dict[section]))).replace("\n"," ")

    for val in re.findall("<\/strong>[\w\s\\.\-\,]+<br\/>",full_section_cleaned):
        list_of_section_values.append(clean_subheader_values(val))

plant_dict = {}
for head, val in list(zip(lst_of_section_subheaders,list_of_section_values)):
    plant_dict[head] = val

print(plant_dict)

{'Duration': ' Annual ', 'Habit': ' Herb ', 'Leaf Shape': ' Oblanceolate ', 'Leaf Pubescence': ' Glabrous ', 'Leaf Texture': ' Smooth ', 'Leaf': ' The upper or bracteal leaves usually red toward the base. ', 'Flower': ' Flowers 3-5 mm ', 'Fruit': ' 4.5-5 mm ', 'Size Class': ' 1-3 ft. ', 'Bloom Color': ' Yellow,Green ', 'Bloom Time': ' May,Jun,Jul,Aug,Sep,Oct,Nov ', 'Water Use': ' Medium ', 'Light Requirement': ' Sun ', 'Soil Moisture': ' Moist ', 'Soil Description': ' Sandy, Sandy Loam, Medium Loam, Clay Loam ', 'Use Ornamental': ' Accent, Garden. A good filler plant but can be invasive. ', 'Conspicuous Flowers': ' yes', 'Interesting Foliage': ' yes'}


Now grab and join data for all plants in sample

In [31]:
for symbol in sample_condensed["USDA Symbol"]:
    print(symbol)

PLCA7
PHRA3
FRAM2
ARDI8
POCO14
ASHU3
HETU
ACNI5
SIRO4
CATO4


In [32]:
labeled_plant_dicts = {}
list_of_unique_subheaders = []

for symbol in sample_condensed["USDA Symbol"]:

    full_url = BASE_URL + symbol
    response = requests.get(url = full_url, headers = HEADERS)
    soup = BeautifulSoup(response.text,features='lxml')

    all_sections = soup.find_all('div',class_="section")

    named_section_dict = {}
    for section in all_sections:
        section_name = section.select_one('h4').text
        if section_name in list_of_relevant_sections:
            named_section_dict[section_name] = section


    lst_of_section_subheaders = []
    for section in named_section_dict.keys():
        for subheader in  named_section_dict[section].select('strong'):
            subheader = subheader.text.replace(":","")
            if subheader not in list_of_unique_subheaders:
                list_of_unique_subheaders.append(subheader)
            lst_of_section_subheaders.append(subheader)

    list_of_section_values = []
    for section in named_section_dict.keys():
        full_section_cleaned = re.sub("</a>","",re.sub("<a.*?>","",str(named_section_dict[section]))).replace("\n"," ")

        for val in re.findall("<\/strong>[\w\s\\.\-\,]+<br\/>",full_section_cleaned):
            list_of_section_values.append(clean_subheader_values(val))

    plant_dict = {}
    for head, val in list(zip(lst_of_section_subheaders,list_of_section_values)):
        plant_dict[head] = val
    
    labeled_plant_dicts.update({symbol:plant_dict})



In [33]:
labeled_plant_dicts

{'PLCA7': {'Duration': ' Annual ',
  'Habit': ' Herb ',
  'Size Class': ' 3-6 ft. ',
  'Bloom Color': ' Pink,Purple ',
  'Bloom Time': ' Aug,Sep,Oct ',
  'Bloom Notes': ' Blooms year-round in South. '},
 'PHRA3': {'Duration': ' Annual ',
  'Habit': ' Herb ',
  'Fruit Type': ' Capsule ',
  'Size Notes': ' Up to about 10 inches tall. ',
  'Bloom Color': ' White,Violet ',
  'Bloom Time': ' Mar,Apr,May ',
  'Bloom Notes': ' Pale violet to white. '},
 'FRAM2': {'Duration': ' Perennial ',
  'Habit': ' Tree ',
  'Leaf Retention': ' Deciduous ',
  'Breeding System': ' Flowers Unisexual,Monoecious ',
  'Fruit Type': ' Samara ',
  'Size Notes': ' Up to about 120 feet tall. ',
  'Leaf': ' Green ',
  'Autumn Foliage': ' yes',
  'Fruit': ' Green ',
  'Bloom Color': ' Yellow,Purple ',
  'Bloom Time': ' Apr,May ',
  'Water Use': ' High ',
  'Light Requirement': ' Sun,Part Shade,Shade ',
  'Soil Moisture': ' Dry,Moist ',
  'Soil pH': ' Medium ',
  'CaCO3 Tolerance': ' yes',
  'Cold Tolerant': ' Deep, 

In [35]:
list_of_unique_subheaders

['Duration',
 'Habit',
 'Size Class',
 'Bloom Color',
 'Bloom Time',
 'Bloom Notes',
 'Fruit Type',
 'Size Notes',
 'Leaf Retention',
 'Breeding System',
 'Leaf',
 'Autumn Foliage',
 'Fruit',
 'Water Use',
 'Light Requirement',
 'Soil Moisture',
 'Soil pH',
 'CaCO3 Tolerance',
 'Cold Tolerant',
 'Soil Description',
 'Conditions Comments',
 'Use Ornamental',
 'Use Wildlife',
 'Use Other',
 'Attracts',
 'Larval Host',
 'Propagation Material',
 'Description',
 'Seed Treatment',
 'Commercially Avail',
 'Root Type',
 'Leaf Margin',
 'Flower',
 'Conspicuous Flowers',
 'Seed Collection',
 'Use Food',
 'Deer Resistant',
 'Leaf Arrangement',
 'Leaf Complexity',
 'Leaf Venation',
 'Inflorescence',
 'Leaf Pubescence',
 'Leaf Shape',
 'Leaf Apex',
 'Leaf Base']

In [37]:
for symbol,plant_dict in labeled_plant_dicts.items():
    for subheader in list_of_unique_subheaders:
        if subheader not in plant_dict.keys():
            plant_dict.update({subheader:np.nan})


In [38]:
for val in labeled_plant_dicts.values():
    print(len(val))

46
46
46
46
46
46
46
46
46
46


In [None]:
index = labeled_plant_dicts.keys()
print(index)

dict_keys(['PONO3', 'HEHI', 'HESU3', 'PYAL', 'SEPO2', 'PRTR', 'ROVI2', 'DAAL2', 'LOUN', 'CAFL3'])


In [None]:
from collections import defaultdict


In [None]:
dd = defaultdict(list)

for d in labeled_plant_dicts.values(): # you can list as many input dicts as you want here
    for key, value in d.items():
        dd[key].append(value)

In [None]:
pd.DataFrame(dd,index=index)

Unnamed: 0,Duration,Habit,Size Notes,Bloom Color,Bloom Time,The Xerces Society for Invertebrate Conservation,Root Type,Leaf Retention,Leaf Arrangement,Leaf Complexity,...,Leaf Pubescence,Inflorescence,Fruit,Conspicuous Flowers,Leaf,CaCO3 Tolerance,Use Food,Warning,Flower,Leaf Shape
PONO3,Annual,Herb,Up to about 3 feet tall.,Yellow,"Jun,Jul,Aug,Sep",,,,,,...,,,,,,,,,,
HEHI,Annual,Herb,,"Blue,Purple","May,Jun,Jul",,,,,,...,,,,,,,,,,
HESU3,Annual,Herb,,Yellow,"Jul,Aug,Sep,Oct,Nov",,,,,,...,,,,,,,,,,
PYAL,Perennial,Herb,yes,"Jul,Aug,Sep",Dense clusters of small white flowers with pu...,,Fibrous,Deciduous,Opposite,Simple,...,,,,,,,,,,
SEPO2,Perennial,Vine,"Stems trailing, much-branched, sometimes form...","Pink,Purple","Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec",,,,Opposite,,...,Glabrous,Axillary,"Seeds are black, smooth and lustrous",,,,,,,
PRTR,Perennial,Herb,,"White,Yellow","Aug,Sep,Oct",,,,,,...,,,,yes,,,,,,
ROVI2,Perennial,Shrub,,"Pink,Yellow,Purple","Jun,Jul,Aug",,,,,,...,,,Red,,Green,Low,Plant has thorns or prickles.,yes,,
DAAL2,Perennial,"Herb,Subshrub",White,,,,,,Alternate,,...,,,,,"Jun,Jul,Aug,Sep,Oct,Nov",,,,,
LOUN,Annual,Herb,,"White,Pink","Jun,Jul,Aug",,,,,,...,,,,,,,,,,
CAFL3,Perennial,Oblong,,"May,Jun",,,,,,,...,Spike,"White,Green",,,,,,,,Glabrous


Need to write code to handle if the plant isn't in the db

In [None]:
full_url = BASE_URL + "UUUU"
response = requests.get(url = full_url, headers = HEADERS)
print(response.status_code)

200


# Overview of data

In [4]:
with open("WildflowerColumns.txt","w") as f:
    for col in pd.read_csv("WildflowerFull.csv").columns:
        f.write(col + "\n")

# Join with source data - Test 

In [88]:
wildflower = pd.read_csv("WildflowerFull.csv",index_col=0)

In [89]:
full_data.columns

Index(['Plant Type', 'Scientific Name', 'Common Name', 'Plant Family',
       'USDA Symbol', 'Native Status', 'Distribution in USA', 'Workhorse',
       'Flower Color', 'Showy', 'Flowering Months', 'Height (feet)',
       'Lifespan', 'Growth Form', 'Shape and Orientation', 'Fall Conspicuous',
       'Leaf Retention', 'Sun Exposure', 'Soil Moisture', 'Moisture Use',
       'Soil Texture', 'Salt Tolerance', 'pH (Range)', 'Fertility Requirement',
       'Growth Rate', 'Active Growth Period', 'Hedge Tolerance',
       'Resprout Ability', 'Drought Tolerance', 'Fire Tolerance',
       'Palatability (Browsing/Grazing)', 'Propagation',
       'Commercially Available', 'Pollinator Value', 'Benefits To Pollinators',
       'Pollinators', 'Native Bees (except Bombus)', 'Bombus', 'Honey Bees',
       'Beetles, Wasps, Flies', 'Moths', 'Butterflies', 'Monarchs',
       'Nesting and Structure (Bees)', 'Larval Host (Monarch)',
       'Larval Host (Butterfly)', 'Larval Host (Moth)',
       'Larval Spec

In [90]:
wildflower.columns

Index(['Duration', 'Habit', 'Root Type', 'Leaf Arrangement', 'Leaf Shape',
       'Leaf Apex', 'Fruit Type', 'Size Notes', 'Bloom Color', 'Bloom Time',
       'Bloom Notes', 'Light Requirement', 'Soil Moisture', 'Leaf',
       'Water Use', 'CaCO3 Tolerance', 'Conspicuous Flowers', 'Leaf Retention',
       'Leaf Complexity', 'Leaf Margin', 'Autumn Foliage', 'Flower', 'Fruit',
       'Cold Tolerant', 'Soil Description', 'Conditions Comments',
       'Propagation Material', 'Description', 'Seed Collection',
       'Seed Treatment', 'Commercially Avail', 'Leaf Venation',
       'Leaf Pubescence', 'Leaf Base', 'Breeding System', 'Inflorescence',
       'Drought Tolerance', 'Use Ornamental', 'Use Wildlife', 'Use Food',
       'Use Other', 'Interesting Foliage', 'Larval Host', 'Deer Resistant',
       'Use Medicinal', 'Fragrant Foliage', 'Fragrant Flowers', 'Maintenance',
       'Nectar Source', 'Aquatic', 'Poisonous', 'Size Class', ' toxic ',
       'aggressive'],
      dtype='object')

flowering_months = bloom_time, 

In [91]:
wildflower.rename(columns={'Bloom Time':'Flowering Months','Bloom Color':'Flower Color'},inplace=True)

In [92]:
columns_to_update = list(set(wildflower.columns).intersection(full_data.columns))
print(columns_to_update)

['Soil Moisture', 'Drought Tolerance', 'Flower Color', 'Leaf Retention', 'Flowering Months']


In [93]:
wildflower = wildflower[columns_to_update]

In [94]:
temp = full_data[["USDA Symbol"] + columns_to_update][:100].set_index("USDA Symbol")


In [95]:
for symbol in wildflower.index:
    if symbol not in temp.index:
        wildflower.drop(symbol,inplace=True)

In [96]:
temp

Unnamed: 0_level_0,Soil Moisture,Drought Tolerance,Flower Color,Leaf Retention,Flowering Months
USDA Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ACFA,Dry,,,,
ACRH,"Moist, Dry",,,,
ACVI,"Moist, Dry",Medium,Red,Deciduous,Jun–Jul
ACFL,"Wet, Moist",Low,Yellow,Deciduous,Apr–May
ACNE2,"Wet, Moist",High,White,Deciduous,Mar–Apr
...,...,...,...,...,...
ARPL4,"Wet, Moist, Dry",,,,
ARRE6,,,,,
ARPR2,,,,,
ARAR7,"Wet, Moist",Low,White,Deciduous,Mar–May


In [97]:
wildflower

Unnamed: 0,Soil Moisture,Drought Tolerance,Flower Color,Leaf Retention,Flowering Months
ACRH,"Dry,Moist",,Green,,"Jun,Jul,Aug,Sep,Oct,Nov"
ACVI,,,"Red,Yellow,Green",,"Jun,Jul,Aug,Sep,Oct,Nov"
ACFL,Dry,,"Yellow,Green",Deciduous,Apr
ACNE2,Moist,High,"Yellow,Green,Brown",Deciduous,"Mar,Apr"
ACNI5,Moist,,"Yellow,Green",Deciduous,"Apr,May"
...,...,...,...,...,...
ARPL4,Moist,,"White,Green",,"May,Jun,Jul,Aug"
ARRE6,,,"White,Green",,"May,Jun,Jul,Aug,Sep"
ARPR2,,,"White,Pink",,
ARAR7,Moist,,"White,Pink",Deciduous,"Feb,Mar,Apr,May"


In [98]:
dict_wildflower = wildflower.to_dict()
dict_temp = temp.to_dict()

In [99]:
for col in columns_to_update:
    for symbol in temp.index:
        if dict_temp[col][symbol] is np.nan:
            if (symbol in wildflower.index) and (dict_wildflower[col][symbol] is not np.nan):
                print("Replaced")
                dict_temp[col][symbol] = dict_wildflower[col][symbol]


Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced
Replaced


In [100]:
temp = pd.DataFrame(dict_temp)

In [101]:
temp.isna().sum()

Soil Moisture        11
Drought Tolerance    52
Flower Color          1
Leaf Retention       49
Flowering Months      2
dtype: int64

# Join - Implemenation

In [107]:
wildflower = pd.read_csv("WildflowerFullOriginal.csv",index_col=0)
era = full_data.set_index("USDA Symbol")

In [108]:
wildflower.rename(columns={'Bloom Time':'Flowering Months','Bloom Color':'Flower Color'},inplace=True)
columns_to_update = list(set(wildflower.columns).intersection(era.columns))


In [109]:
wildflower = wildflower[columns_to_update]
era_condensed = era[columns_to_update]

In [110]:
era_condensed.isna().sum()

Soil Moisture        293
Drought Tolerance    974
Flower Color         749
Leaf Retention       972
Flowering Months     697
dtype: int64

In [111]:
for symbol in wildflower.index:
    if symbol not in era_condensed.index:
        wildflower.drop(symbol,inplace=True)

In [112]:
dict_wildflower = wildflower.to_dict()
dict_era_condensed = era_condensed.to_dict()

In [113]:
counter = 0
for col in columns_to_update:
    for symbol in era_condensed.index:
        if dict_era_condensed[col][symbol] is np.nan:
            if (symbol in wildflower.index) and (dict_wildflower[col][symbol] is not np.nan):
                counter += 1
                dict_era_condensed[col][symbol] = dict_wildflower[col][symbol]
print(counter)

1558


In [115]:
df = pd.DataFrame(dict_era_condensed)
df.isna().sum()

Soil Moisture        212
Drought Tolerance    932
Flower Color          29
Leaf Retention       861
Flowering Months      93
dtype: int64