# Imports and Data

In [3]:
import json
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Root of the plant pages; a hyphenated, lower-case scientific name is appended to it.
BASE_URL = "https://plants.ces.ncsu.edu/plants/"

# HTTP headers passed to every requests.get call in this notebook.
HEADERS = dict([
    ("Accept-Language", "en-US,en;q=0.9"),
    ("User-Agent", "Chrome/87.0.4280.141"),
])

In [4]:
# Plant list whose "Scientific Name" column supplies the names to scrape.
source_data = pd.read_excel(
    io="../../Data/ERA_Alabama.xlsx",
    sheet_name="All Plants",
)

# Scrape one plant

In [5]:
# Markup of one trait group on a plant page (reference for the selectors used below):
# <ul class="list-group brick">
#      <li class="list-group-item">
#          <dl>
#              <span class="group_name ">Attributes:</span>
             
#                  <dt>Genus:</dt>
                 
                     
#                          <dd><span class="detail_display_attribute">Camellia</span>

In [6]:
# Download and parse the Camellia sasanqua page; the following cells query `soup`.

full_url = BASE_URL + "camellia-sasanqua"
# Without a timeout requests can wait on a stalled connection indefinitely.
response = requests.get(url=full_url, headers=HEADERS, timeout=30)
# Stop here on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
page = response.text
soup = BeautifulSoup(page, features='lxml')


In [7]:
# List the heading of every trait group found on the page.
group_headers = soup.select("span.group_name")
for header in group_headers:
    print(header.text)

Attributes:
Whole Plant Traits:
Cultural Conditions:
Fruit:
Flowers:
Leaves:
Stem:
Landscape:


In [8]:
# Split each trait group's text into lines and drop the leading line.
# Every pass overwrites test_list, so after the loop it holds the last group's lines.
for definition_list in soup.select("li.list-group-item dl"):
    text_list = definition_list.text.split("\n")[1:]
    test_list = text_list

In [9]:
# Show the lines kept from the final trait group processed by the loop above.
test_list

['Landscape:',
 'Landscape Location:',
 'Container',
 'Recreational Play Area',
 'Landscape Theme:',
 'Asian Garden',
 "Children's Garden",
 'Cottage Garden',
 'Drought Tolerant Garden',
 'English Garden',
 'Garden for the Blind',
 'Pollinator Garden',
 'Winter Garden',
 'Design Feature:',
 'Accent',
 'Border',
 'Flowering Tree',
 'Foundation Planting',
 'Hedge',
 'Screen/Privacy',
 'Small Tree',
 'Attracts:',
 'Pollinators',
 'Songbirds',
 'Resistance To Challenges:',
 'Deer',
 'Drought',
 'Salt',
 'Problems:',
 'Frequent Disease Problems',
 'Frequent Insect Problems',
 '']

In [10]:
def create_dict(lst):
    '''Group the flat text lines of one trait section into a nested dictionary.

    The first element of ``lst`` is the group name (e.g. "Attributes:") and
    becomes the single outer key. Every later element containing a colon
    starts a new inner key; the elements that follow it, up to the next
    element with a colon, are that key's values and are joined with "|".

    Special cases:
      * Elements starting with "Height:" or "Width:" contain a colon but are
        kept as values of the current key rather than opening a new key.
      * Empty strings are ignored.
      * Values that appear before the first key have no key to belong to and
        are dropped.
      * A key that occurs twice keeps the values of its last occurrence.

    Parameters
    ----------
    lst : list of str
        Lines of one group, group name first.

    Returns
    -------
    dict
        ``{group_name: {key: "value1|value2|..."}}``. A group without any
        key yields ``{group_name: {}}``; an empty ``lst`` yields ``{}``.
    '''
    # Nothing to parse: return an empty mapping instead of failing on lst[0].
    if not lst:
        return {}

    # Set outer key as the group name (i.e. "Attributes:")
    group_name = lst[0]
    fields = {}

    current_key = None
    current_value = []
    # Only the first element is the group name, so skip it by position.
    for item in lst[1:]:
        if ':' in item:
            # Edge case: dimension lines look like keys but are values.
            if item.startswith(("Height:", "Width:")):
                current_value.append(item)
                continue

            # Store the values collected for the previous key, if there was one.
            if current_key is not None:
                fields[current_key] = current_value

            # Reset key and value list
            current_key = item
            current_value = []
        elif item != "":
            # NOTE: a real value containing ':' would be mistaken for a key above.
            current_value.append(item)

    # Flush the last key; skip when no key was ever seen to avoid a None key.
    if current_key is not None:
        fields[current_key] = current_value

    # Join each list of values into a single "|"-separated string.
    return {group_name: {key: "|".join(values) for key, values in fields.items()}}

# Try the parser on test_list, the lines kept from the last trait group.
plant_dict = create_dict(test_list)



In [11]:
# Parse every trait group on the page and print the resulting nested dictionary.
for definition_list in soup.select("li.list-group-item dl"):
    text_list = definition_list.text.split("\n")[1:]  # drop the leading line
    plant_dict = create_dict(text_list)
    print(plant_dict)

{'Attributes:': {'Genus:': 'Camellia', 'Species:': 'sasanqua', 'Family:': 'Theaceae', 'Life Cycle:': 'Woody', 'Country Or Region Of Origin:': 'Japan', 'Play Value:': 'Attractive Flowers|Attracts Pollinators|Fragrance|Wildlife Cover/Habitat|Wildlife Food Source', 'Particularly Resistant To (Insects/Diseases/Other Problems):': 'Resistant to Phytophthora root rot.', 'Dimensions:': 'Height: 6 ft. 0 in. - 14 ft. 0 in.|Width: 5 ft. 0 in. - 7 ft. 0 in.'}}
{'Whole Plant Traits:': {'Plant Type:': 'Shrub|Tree', 'Woody Plant Leaf Characteristics:': 'Broadleaf Evergreen', 'Habit/Form:': 'Arching|Erect|Open|Oval|Pyramidal', 'Growth Rate:': 'Rapid', 'Maintenance:': 'Low|Medium', 'Texture:': 'Medium'}}
{'Cultural Conditions:': {'Light:': 'Full sun (6 or more hours of direct sunlight a day)|Partial Shade (Direct sunlight only part of the day, 2-6 hours)', 'Soil Texture:': 'Clay|High Organic Matter|Loam (Silt)|Sand', 'Soil pH:': 'Acid (<6.0)|Neutral (6.0-8.0)', 'Soil Drainage:': 'Good Drainage|Moist|Oc

In [40]:
# Parse every trait group, then merge the per-group fields into one flat
# record for the plant (the group level is discarded here).
all_groups = soup.select("li.list-group-item dl")
plant_dicts = [create_dict(group.text.split("\n")[1:]) for group in all_groups]

full_plant_dict = {"Camellia sasanqua": {}}
merged_fields = full_plant_dict["Camellia sasanqua"]
for plant_dict in plant_dicts:
    for kv_dict in plant_dict.values():
        merged_fields.update(kv_dict)
print(full_plant_dict)

{'Camellia sasanqua': {'Genus:': 'Camellia', 'Species:': 'sasanqua', 'Family:': 'Theaceae', 'Life Cycle:': 'Woody', 'Country Or Region Of Origin:': 'Japan', 'Play Value:': 'Attractive Flowers|Attracts Pollinators|Fragrance|Wildlife Cover/Habitat|Wildlife Food Source', 'Particularly Resistant To (Insects/Diseases/Other Problems):': 'Resistant to Phytophthora root rot.', 'Dimensions:': 'Height: 6 ft. 0 in. - 14 ft. 0 in.|Width: 5 ft. 0 in. - 7 ft. 0 in.', 'Plant Type:': 'Shrub|Tree', 'Woody Plant Leaf Characteristics:': 'Broadleaf Evergreen', 'Habit/Form:': 'Arching|Erect|Open|Oval|Pyramidal', 'Growth Rate:': 'Rapid', 'Maintenance:': 'Low|Medium', 'Texture:': 'Medium', 'Light:': 'Full sun (6 or more hours of direct sunlight a day)|Partial Shade (Direct sunlight only part of the day, 2-6 hours)', 'Soil Texture:': 'Clay|High Organic Matter|Loam (Silt)|Sand', 'Soil pH:': 'Acid (<6.0)|Neutral (6.0-8.0)', 'Soil Drainage:': 'Good Drainage|Moist|Occasionally Dry|Occasionally Wet', 'Available Sp

# Take approach on sample of 20

In [77]:
# Turn each scientific name into the hyphenated, lower-case form appended to BASE_URL.
names = []
for scientific_name in source_data["Scientific Name"]:
    hyphenated = "-".join(scientific_name.split())
    names.append(hyphenated.lower())

# Work on the first 20 names only.
sample = names[:20]
print(sample)

['acacia-farnesiana', 'acalypha-rhomboidea', 'acalypha-virginica', 'acer-floridanum', 'acer-negundo', 'acer-nigrum', 'acer-rubrum', 'acer-saccharinum', 'acer-saccharum', 'acer-spicatum', 'achillea-millefolium', 'acmella-oppositifolia', 'aconitum-uncinatum', 'actaea-pachypoda', 'actaea-racemosa', 'adiantum-capillus-veneris', 'adiantum-pedatum', 'aeschynomene-americana', 'aeschynomene-viscidula', 'aesculus-flava']


In [1]:
# Scrape each plant page in `sample`, flatten its trait groups into a single
# {name: {field: value}} record, then combine all records into one table.
REQUEST_TIMEOUT_SECONDS = 30  # cap each request so a stalled connection cannot hang the run
REQUEST_DELAY_SECONDS = 1     # pause between consecutive requests to go easy on the server

all_full_plant_dicts = []

for position, name in enumerate(sample):
    if position:
        time.sleep(REQUEST_DELAY_SECONDS)

    full_url = BASE_URL + name
    try:
        response = requests.get(url=full_url, headers=HEADERS, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as error:
        # One unreachable page should not abort the whole batch.
        print(f"Skipping {name}: {error}")
        continue

    # Loop-local names keep the single-plant `soup`, `plant_dicts` and
    # `full_plant_dict` built earlier in the notebook intact for the later
    # cells that still read them.
    plant_soup = BeautifulSoup(response.text, features='lxml')
    plant_groups = plant_soup.select("li.list-group-item dl")

    # Pages without any trait group are skipped (presumably names the site has no page for).
    if not plant_groups:
        continue

    group_dicts = [create_dict(group.text.split("\n")[1:]) for group in plant_groups]

    # Merge the fields of every group into one flat record for this plant.
    plant_record = {name: {}}
    for group_dict in group_dicts:
        for kv_dict in group_dict.values():
            plant_record[name].update(kv_dict)

    print(len(plant_record[name]))

    all_full_plant_dicts.append(plant_record)

    # Rewrite the backup after every plant so an interrupted run keeps what was scraped so far.
    with open("Backup.txt", 'w') as f:
        json.dump(all_full_plant_dicts, f)


# One single-row frame per plant (indexed by name); concat aligns the differing columns.
list_of_dfs = [
    pd.DataFrame.from_dict(plant_record, orient='index')
    for plant_record in all_full_plant_dicts
]
if not list_of_dfs:
    raise RuntimeError("No plant pages with trait groups were scraped; nothing to export.")

full_df = pd.concat(list_of_dfs)
full_df.to_csv("Test2.csv")


In [81]:
# Number of distinct fields collected across the sampled plants.
print(full_df.shape[1])

86


# Run on all

# Multi Index Columns

Goal: keep the trait-group headings (e.g. "Attributes:") alongside each field by giving the DataFrame Multi-Index columns of (group, field) pairs.

Note: This code is left in as a reference for later. To make this work, only append a (group, field) tuple when its field is one of the columns of the target DataFrame, e.g. `df = pd.read_csv("NCSU_Formatted.csv")`.

In [30]:
# One row per plant, one column per field. This can be a df read in from NCSU_unedited.
df = pd.DataFrame.from_dict(data=full_plant_dict, orient="index")

In [31]:
# Preview the first four columns.
first_columns = df.iloc[:, 0:4]
print(first_columns)

                     Genus:  Species:   Family: Life Cycle:
Camellia sasanqua  Camellia  sasanqua  Theaceae       Woody


In [42]:
# Rebuild the plant record, this time keeping the group level:
# {plant: {group: {field: value}}}.
full_plant_dict_by_groups = {"Camellia sasanqua": {}}
for plant_dict in plant_dicts:
    full_plant_dict_by_groups["Camellia sasanqua"].update(plant_dict)

temp = full_plant_dict_by_groups["Camellia sasanqua"]

# One (group, field) pair for every field whose value is not empty.
multi_index_tuples = [
    (outer_key, inner_key)
    for outer_key, outer_value in temp.items()
    for inner_key, inner_value in outer_value.items()
    if inner_value != ""
]

print(len(multi_index_tuples))



54


In [58]:
# Keep only the first (group, field) pair for each field name.
# A new list is built instead of calling remove() on the list being iterated:
# removing during iteration makes the loop skip the element right after each
# removal, so consecutive duplicates would survive.
seen = []
deduplicated_tuples = []
for pair in multi_index_tuples:
    if pair[1] in seen:
        print(pair)  # report the duplicate that is dropped
        continue
    seen.append(pair[1])
    deduplicated_tuples.append(pair)
multi_index_tuples = deduplicated_tuples
print(seen)

('Whole Plant Traits:', 'Woody Plant Leaf Characteristics:')
['Flower Color:', 'Flower Inflorescence:', 'Flower Value To Gardener:', 'Flower Bloom Time:', 'Flower Shape:', 'Flower Petals:', 'Flower Size:', 'Flower Description:', 'Landscape Location:', 'Landscape Theme:', 'Design Feature:', 'Attracts:', 'Resistance To Challenges:', 'Problems:', 'Display/Harvest Time:', 'Fruit Type:', 'Fruit Length:', 'Fruit Description:', 'Genus:', 'Species:', 'Family:', 'Life Cycle:', 'Country Or Region Of Origin:', 'Play Value:', 'Particularly Resistant To (Insects/Diseases/Other Problems):', 'Dimensions:', 'Woody Plant Leaf Characteristics:', 'Leaf Color:', 'Leaf Feel:', 'Leaf Value To Gardener:', 'Leaf Type:', 'Leaf Arrangement:', 'Leaf Shape:', 'Leaf Margin:', 'Hairs Present:', 'Leaf Length:', 'Leaf Width:', 'Leaf Description:', 'Light:', 'Soil Texture:', 'Soil pH:', 'Soil Drainage:', 'Available Space To Plant:', 'NC Region:', 'USDA Plant Hardiness Zone:', 'Stem Color:', 'Stem Is Aromatic:', 'Stem 

In [59]:
# Number of (group, field) pairs left after removing duplicates.
pair_count = len(multi_index_tuples)
print(pair_count)

53


In [60]:
# Replace df's flat column labels with (group, field) pairs, in place.
# NOTE(review): presumably requires one tuple per existing column, in column order - confirm.
df.columns = pd.MultiIndex.from_tuples(multi_index_tuples)

In [27]:
# Preview the first four columns under the new two-level header.
first_columns = df.iloc[:, 0:4]
print(first_columns)

                  Attributes:                                
                       Genus:  Species:   Family: Life Cycle:
Camellia sasanqua    Camellia  sasanqua  Theaceae       Woody


In [None]:
# Save the (group, field) pairs, one tuple repr per line.
with open("GC_pairings.txt", "w") as f:
    f.writelines(str(tup) + "\n" for tup in multi_index_tuples)