In [68]:
import pandas as pd
import os

#The source database
era_pa = pd.read_excel(r"C:\Users\CPL17\OneDrive\Desktop\PA Wildflower Database 10_27.xlsx",sheet_name="ERA_PA")
era_pa = era_pa[["Scientific Name","Common Name","USDA Symbol"]]

for col in era_pa:
    era_pa[col] = era_pa[col].str.lower()


count_dict = {}
list_of_dfs = []

#For each file -> read file into string,check common and scientific names for each plant against the string. Using a copy of the 
# source dataframe, remove all non-matched values,  and append to a list

for file in os.listdir("./Data/TextFiles"):

    full_path = "./Data/TextFiles/" + file

    with open(full_path,encoding='unicode_escape') as f:
        string = f.read()
    
    df = era_pa.copy()
    df["Match"] = 0

    com_names = df["Common Name"].to_list()
    scientific_names = df["Scientific Name"].to_list()


    #Check the string for the common name and scientific name. 
    # If there is a match, replace match with a 1
    for i,tup in enumerate(list(zip(com_names,scientific_names))):
        common_name = tup[0]
        scientific_name = tup[1]
        if ((common_name in string) | (scientific_name in string)):
            df.loc[i,"Match"] = 1

    #Create a nursery column
    nursery = file.split(".")[0]
    df["Nursery"] = nursery
    
    #Reduce the df to only where there is a match
    df = df[df.Match == 1]

    #Find the number of matches and append to the count dict
    count = df.Match.sum()
    count_dict.update({nursery:count})

    df.drop("Match",axis=1,inplace=True)
    list_of_dfs.append(df)

In [69]:
#A long df with entries SYMBOL Catalog URLS. Note: URLS are the Catalog URLSs NOT the 
# urls used for the http requests. 

In [70]:
nursery_matches = pd.concat(list_of_dfs)
nursery_matches.columns = ["Scientific Name","Common Name","USDA_Symbol","SOURCE"]
nursery_matches["USDA_Symbol"] = nursery_matches["USDA_Symbol"].str.upper()
nursery_matches["Common Name"] = nursery_matches["Common Name"].str.title()
nursery_matches["Scientific Name"] = nursery_matches["Scientific Name"].str.title()
nursery_matches.head()

Unnamed: 0,Scientific Name,Common Name,USDA_Symbol,SOURCE
8,Acorus Americanus,Sweetflag,ACAM,Arcadia Natives Washington
13,Adiantum Pedatum,Northern Maidenhair,ADPE,Arcadia Natives Washington
20,Agastache Foeniculum,Blue Giant Hyssop,AGFO,Arcadia Natives Washington
21,Agastache Nepetoides,Yellow Giant Hyssop,AGNE2,Arcadia Natives Washington
22,Agastache Scrophulariifolia,Purple Giant Hyssop,AGSC,Arcadia Natives Washington


In [71]:
nursery_info = pd.read_excel("./Data/Local_Catalog_URLs.xlsx",sheet_name="Sheet1")
nursery_info = nursery_info[["Nursery","Root_URL"]].drop_duplicates(subset="Nursery")
mapping = nursery_info.set_index("Nursery").to_dict()["Root_URL"]
for k,v in mapping.items():
    print(k,v)

Arcadia Natives Washington https://arcadianatives.com/
Archewild Quakertown https://archewild.com/
Butterfly Oasis Lancaster http://www.butterflyoasis.net/
David Brothers Norristown https://www.davidbrothers.com
Diakon Boiling Springs https://www.diakon.org/wilderness-greenhouse/
Edge of the Woods Orefield https://edgeofthewoodsnursery.com/
Gino's Newtown https://www.ginosnursery.com/
Go Native Trees https://www.gonativetrees.com/
Good Host Plants Philly https://www.goodhostplants.com/
Hungry Hook Bainbridge http://www.hungryhookfarm.com/
Keystone Robesonia https://www.keystonewildflowers.com/
Meadowsweet Native York https://www.meadowsweetnative.com
Musser Indiana https://www.musserforests.com/
Northbrook Natives West Chester https://www.northbrooknatives.com/
Perennial New Bloomfield http://www.perennialgardens.name/contact
Heartwood Felton https://www.heartwoodnurseryinc.com
Calyx Native Nursery https://calyxnativenursery.com/
Bruce Nursery https://brucenurseries.com/native-plants/


In [72]:
nursery_matches["SOURCE_URL"] = nursery_matches["SOURCE"].map(mapping)

In [73]:
nursery_matches.head()

Unnamed: 0,Scientific Name,Common Name,USDA_Symbol,SOURCE,SOURCE_URL
8,Acorus Americanus,Sweetflag,ACAM,Arcadia Natives Washington,https://arcadianatives.com/
13,Adiantum Pedatum,Northern Maidenhair,ADPE,Arcadia Natives Washington,https://arcadianatives.com/
20,Agastache Foeniculum,Blue Giant Hyssop,AGFO,Arcadia Natives Washington,https://arcadianatives.com/
21,Agastache Nepetoides,Yellow Giant Hyssop,AGNE2,Arcadia Natives Washington,https://arcadianatives.com/
22,Agastache Scrophulariifolia,Purple Giant Hyssop,AGSC,Arcadia Natives Washington,https://arcadianatives.com/


In [74]:
df = nursery_matches.drop(["Scientific Name","Common Name"],axis=1)
df.columns = ["USDA_SYMBOL","SOURCE","SOURCE_URL"]
df.to_csv("./Data/Local_Long.csv",index=False)

In [75]:
#Aggegrate along symbol to get SYMBOL URLS COUNT df. 

In [122]:
f = lambda x: ', '.join(map(str, set(x)))
local_agg = nursery_matches.groupby("USDA_Symbol").agg({"SOURCE_URL":[f,len],"SOURCE":f})

In [123]:
local_agg.reset_index(inplace=True)
local_agg.columns = ["USDA_Symbol","SOURCE_URLS","COUNT","SOURCE"]

In [124]:
local_agg

Unnamed: 0,USDA_Symbol,SOURCE_URLS,COUNT,SOURCE
0,ABBA,"http://www.perennialgardens.name/contact, http...",2,"Musser Indiana, Perennial New Bloomfield"
1,ACAM,"https://arcadianatives.com/, http://foxhillgar...",2,"Fox Hill Gardens, Arcadia Natives Washington"
2,ACMI2,"https://www.ginosnursery.com/, https://www.tai...",3,"Fox Hill Gardens, Gino's Newtown, Tait Farm"
3,ACNE2,"https://www.ginosnursery.com/, https://archewi...",3,"Gino's Newtown, Archewild Quakertown, Heartwoo..."
4,ACPA,"https://www.ginosnursery.com/, https://www.key...",3,"Hungry Hook Bainbridge, Gino's Newtown, Keysto..."
...,...,...,...,...
545,WIFR,"https://arcadianatives.com/, https://redbudnat...",3,"Gino's Newtown, Redbud Media, Arcadia Natives ..."
546,YUFI,http://foxhillgardens.com/native-perennials/,1,Fox Hill Gardens
547,ZAAM,https://www.davidbrothers.com,1,David Brothers Norristown
548,ZIAP,"https://arcadianatives.com/, https://www.diako...",4,"Diakon Boiling Springs, Edge of the Woods Oref..."


In [125]:
# local_agg.reset_index(inplace=True)

local_agg = pd.merge(local_agg,nursery_matches[["USDA_Symbol","Scientific Name","Common Name"]].drop_duplicates(),how="left",on="USDA_Symbol")

In [126]:
local_agg.head()

Unnamed: 0,USDA_Symbol,SOURCE_URLS,COUNT,SOURCE,Scientific Name,Common Name
0,ABBA,"http://www.perennialgardens.name/contact, http...",2,"Musser Indiana, Perennial New Bloomfield",Abies Balsamea,Balsam Fir
1,ACAM,"https://arcadianatives.com/, http://foxhillgar...",2,"Fox Hill Gardens, Arcadia Natives Washington",Acorus Americanus,Sweetflag
2,ACMI2,"https://www.ginosnursery.com/, https://www.tai...",3,"Fox Hill Gardens, Gino's Newtown, Tait Farm",Achillea Millefolium,Common Yarrow
3,ACNE2,"https://www.ginosnursery.com/, https://archewi...",3,"Gino's Newtown, Archewild Quakertown, Heartwoo...",Acer Negundo,Boxelder
4,ACPA,"https://www.ginosnursery.com/, https://www.key...",3,"Hungry Hook Bainbridge, Gino's Newtown, Keysto...",Actaea Pachypoda,White Baneberry
...,...,...,...,...,...,...
545,WIFR,"https://arcadianatives.com/, https://redbudnat...",3,"Gino's Newtown, Redbud Media, Arcadia Natives ...",Wisteria Frutescens,American Wisteria
546,YUFI,http://foxhillgardens.com/native-perennials/,1,Fox Hill Gardens,Yucca Filamentosa,Adam'S Needle
547,ZAAM,https://www.davidbrothers.com,1,David Brothers Norristown,Zanthoxylum Americanum,Common Pricklyash
548,ZIAP,"https://arcadianatives.com/, https://www.diako...",4,"Diakon Boiling Springs, Edge of the Woods Oref...",Zizia Aptera,Meadow Zizia


In [127]:
local_agg["String"] = local_agg["Scientific Name"] + " (" + local_agg["Common Name"] + "): " + local_agg["SOURCE"]
local_agg.to_csv("./Data/Local_Agg.csv")