# Purpose.
This notebook allows me to works through the different aspects of webscrapping using the requests library and parsing through that information using Beautiful Soup 4 in order to collect information on plants native to my region. The next phase of this project is to use SQLite to upload that data into a database.

Version 1 used wildflower.org in order to find the name of plants native to Virginia that are comercially available. Those names were then used to scrape Wikipedia for limited information. Wildflower.org was a little too inconsistent for me to build a routine to webscrape.

Version 2 looks at https://www.nativeplantcenter.net/ by first going through the search results for plants native to my specific region of the Chesapeake Bay and storing the hyperlinks used to access more information about the plants. Those links are then used to get all the information the website displays about the plants which is stored as a list of lists, with each index representing a different plant. 

In [1]:
import requests
import bs4

In [2]:
# Go through the search results for plants native to my region and grab the links to websites specifc to those plants
# npc stands for native plant center, the name of the website
npc_links = []
for i in range(27):
    url = 'https://www.nativeplantcenter.net/plants/page/'+str(i+1)+'/?s&regions%5B0%5D=coastal-plain#038;regions%5B0%5D=coastal-plain'
    res = requests.get(url)
    soup = bs4.BeautifulSoup(res.text,"lxml")
    
    # get the links attached to the search results
    hrefs = soup.select('body')[0].find_all('div',class_="row plants-images")[0].find_all("a", href=True)
    for link in hrefs:
        npc_links.append(link['href'])

In [3]:
# We will create a dictionary of all the information found on the links. This does two things
# 1) Organize the data how we see fit
# 2) Circumvent problems that arrise from inconsistent information categories. Some plants include a section about
#    "Ground Cover" while also omitting "Flower color" because those plants don't flower.

# The following is a constant based on all possible sections of information.

CATEGORIES = ['Binomial','Common Names','Family','Regions','States','Plant Types','Height','Flower Color','Fruit',
              'Sun Exposure', 'Soil Texture','Soil Moisture','Blooms','Fall Color','Ground Cover','Habitat','Notes']

In [4]:
plant_info = []


for url in npc_links:
    res = requests.get(url)
    soup = bs4.BeautifulSoup(res.text,"lxml")
    
    #binomial name
    binomial_name = soup.find('body').find_all('h1')[1].get_text()
    
    #common names
    common_names = soup.find('body').select('strong')[0].get_text()
    
    #all the other info
    body_info = soup.find('body').find('div', class_ = 'row nomarginrow').select('p')
    cleaned_body_info = []
    for entry in body_info:
        if len(entry.get_text()) > 0:
            cleaned_body_info.append(entry.get_text())
    
    #create a dictionary of all the information
    plant_dict = {"Binomial" : binomial_name, "Common Names" : common_names}
    for i, entry in enumerate(cleaned_body_info):
        try:
            titles = cleaned_body_info[i][:cleaned_body_info[i].index(':')]    
            info = cleaned_body_info[i][cleaned_body_info[i].index(':')+2:]
            plant_dict.update({titles:info})
        except:
            #sometimes sections of the information we want is empty, this filters them out
            pass
        
    # List of the information we want in the order we want
    info_list = []
    for i,entry in enumerate(CATEGORIES):
        try:
            info_list.append(plant_dict[entry])
        except:
            info_list.append('NA')
        

    #now putting it all in the list we want
    plant_info.extend([info_list])

In [None]:
# Data validation
# The length of plant_info should be 319. There are 12 results for each of he 26 pages and 7 on the last page.

len(plant_info)

In [12]:
# Append the categories to the beginning of the plant_info list in order to know what all the indices mean
plant_info.insert(0, CATEGORIES)

[['Binomial',
  'Common Names',
  'Family',
  'Regions',
  'States',
  'Plant Types',
  'Height',
  'Flower Color',
  'Fruit',
  'Sun Exposure',
  'Soil Texture',
  'Soil Moisture',
  'Blooms',
  'Fall Color',
  'Ground Cover',
  'Habitat',
  'Notes'],
 ['Binomial',
  'Common Names',
  'Family',
  'Regions',
  'States',
  'Plant Types',
  'Height',
  'Flower Color',
  'Fruit',
  'Sun Exposure',
  'Soil Texture',
  'Soil Moisture',
  'Blooms',
  'Fall Color',
  'Ground Cover',
  'Habitat',
  'Notes'],
 ['Acer negundo',
  ' box elder, ash leaf maple, Manitoba maple',
  'Aceraceae',
  'Coastal Plain, Mountain, Piedmont',
  'DE, DC, MD, NY, PA, VA, WV',
  'Small/Medium Tree (Understory)',
  '30 - 60ft; Spread: 30 - 60ft',
  'yellow-green',
  'Winged;          tan brown',
  'Full Sun, Partial Sun',
  'Clay, Loamy, Sandy',
  'Moist, Wet',
  'April -          May',
  'yellow, red',
  'NA',
  'grows best in lowland sites along rivers, streams, ponds, and seasonally flooded areas; tolerates a v

In [20]:
# Data Validation
# quickly scanning the list to find any potential abnormalities.
plant_info

[['Binomial',
  'Common Names',
  'Family',
  'Regions',
  'States',
  'Plant Types',
  'Height',
  'Flower Color',
  'Fruit',
  'Sun Exposure',
  'Soil Texture',
  'Soil Moisture',
  'Blooms',
  'Fall Color',
  'Ground Cover',
  'Habitat',
  'Notes'],
 ['Acer negundo',
  ' box elder, ash leaf maple, Manitoba maple',
  'Aceraceae',
  'Coastal Plain, Mountain, Piedmont',
  'DE, DC, MD, NY, PA, VA, WV',
  'Small/Medium Tree (Understory)',
  '30 - 60ft; Spread: 30 - 60ft',
  'yellow-green',
  'Winged;          tan brown',
  'Full Sun, Partial Sun',
  'Clay, Loamy, Sandy',
  'Moist, Wet',
  'April -          May',
  'yellow, red',
  'NA',
  'grows best in lowland sites along rivers, streams, ponds, and seasonally flooded areas; tolerates a variety of soil types',
  'brittle wood; thicket-forming'],
 ['Acer rubrum',
  ' red maple, scarlet maple, swamp maple, soft maple',
  'Aceraceae',
  'Coastal Plain, Mountain, Piedmont',
  'DE, DC, MD, NY, PA, VA, WV',
  'Tall Tree (Canopy)',
  '40 - 100

In [17]:
# Store the list as a file for now
import json

with open("plant_infofile_v2.txt","w") as f:
    json.dump(plant_info, f)

In [18]:
# Can check out the file created above by running the following

with open("plant_infofile_v2.txt","r") as f:
    plant_list = json.load(f)
    
plant_list

[['Binomial',
  'Common Names',
  'Family',
  'Regions',
  'States',
  'Plant Types',
  'Height',
  'Flower Color',
  'Fruit',
  'Sun Exposure',
  'Soil Texture',
  'Soil Moisture',
  'Blooms',
  'Fall Color',
  'Ground Cover',
  'Habitat',
  'Notes'],
 ['Acer negundo',
  ' box elder, ash leaf maple, Manitoba maple',
  'Aceraceae',
  'Coastal Plain, Mountain, Piedmont',
  'DE, DC, MD, NY, PA, VA, WV',
  'Small/Medium Tree (Understory)',
  '30 - 60ft; Spread: 30 - 60ft',
  'yellow-green',
  'Winged;          tan brown',
  'Full Sun, Partial Sun',
  'Clay, Loamy, Sandy',
  'Moist, Wet',
  'April -          May',
  'yellow, red',
  'NA',
  'grows best in lowland sites along rivers, streams, ponds, and seasonally flooded areas; tolerates a variety of soil types',
  'brittle wood; thicket-forming'],
 ['Acer rubrum',
  ' red maple, scarlet maple, swamp maple, soft maple',
  'Aceraceae',
  'Coastal Plain, Mountain, Piedmont',
  'DE, DC, MD, NY, PA, VA, WV',
  'Tall Tree (Canopy)',
  '40 - 100