# spaCy over AWS products (Colab-ready)

Inspiration:  
https://github.com/explosion/spacy-notebooks  
https://spacy.io/models/en  
https://spacy.io/usage/visualizers#jupyter  
https://spacy.io/usage/models  
https://spacy.io/usage/facts-figures#spacy-models  
https://spacy.io/usage/spacy-101  

In [None]:
# Show the path of the python executable that the shell resolves.
!which python

In [None]:
!pip install spacy pandas cloud-products>=1.1.0

In [None]:
# NOTE: If the models are installed from this cell, you may need to restart
# the kernel before the downloaded models can be used.
# Only en_core_web_md is downloaded; the larger en_core_web_lg line below is
# left commented out as an alternative.
# !python -m spacy download en_core_web_lg  # v2.3.1 782.7 MB
!python -m spacy download en_core_web_md  # v2.3.1 50.8 MB

In [None]:
# List the installed packages, keeping only the lines that mention spacy.
!pip list | grep spacy

In [None]:
import pandas as pd
import spacy
from spacy import displacy
from cloud_products.aws import AwsCrawler
from IPython.core.display import HTML

In [None]:
def get_cloud_products(use_cache=True):
    """Return a DataFrame of AWS products with their text attached.

    An ``AwsCrawler`` supplies the product list and, for each product, its
    text lines. The lines of a product are joined with single spaces and
    stored in a ``product_text`` column, matched to rows by product ``code``.
    The remaining columns are the attributes exposed by ``vars()`` on each
    product object.

    Args:
        use_cache: Forwarded unchanged to the crawler's ``get_products`` and
            ``get_product_text`` calls.

    Returns:
        A ``pd.DataFrame`` with one row per product.
    """
    aws = AwsCrawler()
    catalog = aws.get_products(use_cache=use_cache)

    # Map each product code to its text lines collapsed into one string.
    text_by_code = {
        item.code: " ".join(aws.get_product_text(item, use_cache=use_cache))
        for item in catalog
    }

    records = [vars(item) for item in catalog]
    frame = pd.DataFrame.from_records(records)
    frame["product_text"] = frame["code"].apply(lambda key: text_by_code[key])
    return frame

In [None]:
# Build the products DataFrame (use_cache keeps its default of True) and
# preview the first two rows.
df_cloud_products = get_cloud_products()
df_cloud_products.head(2)

In [None]:
# Collect the distinct product codes, print how many there are, and show the
# first ten.
codes = df_cloud_products["code"].unique()
print(f"len(codes) = {len(codes)}")
codes[:10]

In [None]:
# Pick one product by its code, pull out its product_text, and preview the
# first 120 characters.
code = "sagemaker"
text = df_cloud_products.set_index("code").loc[code]["product_text"]
text[:120]

In [None]:
# Loaded spaCy pipelines keyed by model name; apply_nlp looks a pipeline up
# in this dict via its nlp_type argument.
nlps = {}
nlps["en_core_web_md"] = spacy.load("en_core_web_md")

In [None]:
def apply_nlp(text, nlps, nlp_type, options_ent=None, render=True) -> pd.DataFrame:
    """Run a pipeline over ``text`` and tabulate the entities it finds.

    Args:
        text: The text to analyse.
        nlps: Mapping from pipeline name to a callable pipeline; called as
            ``nlps[nlp_type](text)``.
        nlp_type: Key selecting the pipeline in ``nlps``; also used as the
            title of the rendered visualisation.
        options_ent: Options passed to ``displacy.render`` for the entity
            view. ``None`` (the default) is treated as an empty dict.
        render: When true, render the entity visualisation inline via
            ``displacy.render(..., jupyter=True)`` before building the table.

    Returns:
        A DataFrame with the columns ``label``, ``text``, ``root``,
        ``conjuncts``, ``start_char``, ``end_char``, ``vector_norm`` and
        ``counts``, where ``counts`` is the number of entities sharing the
        same values in all of the other columns. The frame is empty (but has
        all eight columns) when the document has no entities.

    Raises:
        KeyError: If ``nlp_type`` is not a key of ``nlps``.
    """
    # Avoid a shared mutable default: build a fresh dict per call.
    if options_ent is None:
        options_ent = {}

    nlp = nlps[nlp_type]
    doc = nlp(text)

    if render:
        doc.user_data["title"] = f"{nlp_type}"
        displacy.render(doc, style="ent", jupyter=True, options=options_ent)

    columns = ["label", "text", "root", "conjuncts", "start_char", "end_char", "vector_norm"]
    rows = [
        (e.label_, e.text, e.root, e.conjuncts, e.start_char, e.end_char, e.vector_norm)
        for e in doc.ents
    ]

    # With no entities, from_records([]) yields a frame with zero columns and
    # assigning the seven column names would raise; return an empty table
    # that still exposes the expected columns.
    if not rows:
        return pd.DataFrame(columns=columns + ["counts"])

    df = pd.DataFrame.from_records(rows, columns=columns)
    df = df.sort_values(by="label")
    # NOTE(review): start_char/end_char are part of the grouping key, so
    # counts exceeds 1 only for rows identical in every column -- confirm
    # this is the intended aggregation.
    df = df.groupby(columns).size().reset_index(name="counts")
    return df

In [None]:
# Run the en_core_web_md pipeline over the selected product text; with the
# default render=True this also renders the entity view inline.
df = apply_nlp(text, nlps, "en_core_web_md")

In [None]:
# Display the entity table as an HTML table.
HTML(df.to_html())