# Integration of the [Linux Foundation Data & AI landscape](https://landscape.lfai.foundation/) data source

This notebook loads `landscape.yml` from the [lfai-landscape](https://github.com/lfai/lfai-landscape/blob/main/landscape.yml) repository and extracts all listed tools together with their tool information. The extracted information includes each tool's homepage URL, repository URL, category and subcategory.

## Imports

In [None]:
import yaml
import requests
import pandas as pd
import re

## Raw Stage - Load raw data source from the GitHub repository

The download URL is pinned to a specific git commit hash to ensure reproducibility.

In [None]:
# Commit of lfai/lfai-landscape that the download is pinned to, so that
# re-running the notebook always fetches the same landscape.yml.
LFAI_LANDSCAPE_COMMIT = "57d5eb8ada675d545fc8096114eacce65d7cece4"
RAW_DATA_SOURCE_URL = (
    f"https://github.com/lfai/lfai-landscape/raw/{LFAI_LANDSCAPE_COMMIT}/landscape.yml"
)

In [None]:
# Download the pinned landscape.yml.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error into an exception right here instead
# of silently leaving `lfai_lanscape_yml_text` undefined (or stale from an
# earlier run) for the cells below.
response = requests.get(RAW_DATA_SOURCE_URL, timeout=30)
response.raise_for_status()
lfai_lanscape_yml_text = response.text

In [None]:
# Parse the downloaded YAML text with the safe loader.
lfai_landscape = yaml.safe_load(lfai_lanscape_yml_text)

# Process data

### Extract relevant tool data from raw source and convert into a pandas DataFrame

In [None]:
# Flatten the nested landscape (category -> subcategory -> items) into one
# DataFrame per subcategory. Every item row is tagged with the name of the
# category and subcategory it was listed under.
pd_list = [
    pd.json_normalize(subcategory, record_path="items").assign(
        category=category["name"],
        subcategory=subcategory["name"],
    )
    for category in lfai_landscape["landscape"]
    for subcategory in category["subcategories"]
]

In [None]:
# Stack the per-subcategory frames into a single table with a fresh 0..n-1 index.
df_normalized = pd.concat(pd_list, ignore_index=True)

In [None]:
# Display (rows, columns) of the flattened table.
df_normalized.shape

### Save result of raw stage

In [None]:
# Write the flattened table to the raw-stage CSV, leaving out the index column.
df_normalized.to_csv("data/01_raw/lfai_landscape.csv", index=False)

## Intermediate Stage - e.g. URL mapping, column mapping, create ID

### Create IDs 

In [None]:
# Derive an ID from the tool name: lowercase with all whitespace removed.
# The pattern is a raw string so that `\s` reaches the regex engine as written
# rather than being an invalid string escape sequence, which newer Python
# versions warn about.
df_normalized["id"] = df_normalized["name"].apply(
    lambda name: re.sub(r"\s+", "", name.lower())
)

### Select required columns

In [None]:
# Keep only the columns needed downstream, in this order.
required_columns = [
    "id",
    "name",
    "homepage_url",
    "repo_url",
    "category",
    "subcategory",
]
df_normalized = df_normalized[required_columns]

### Save result of intermediate stage

In [None]:
# Write the reduced table to the intermediate-stage CSV, leaving out the index column.
df_normalized.to_csv("data/02_intermediate/lfai_landscape.csv", index=False)

In [None]:
# Count the non-null values of every column within each `id` group, then sum
# those counts over all groups (one total per column).
df_normalized.groupby(by="id").count().sum()

## Processed Stage - only keep relevant tools

Select the relevant categories and subcategories.

In [None]:

# Category -> subcategories used to filter the tools in the cells below.
relevant_categories_subcategories = {
    "Machine Learning": ["Platform"],  # TODO: include "Framework"?
    "Data": [
        "Lineage",
        "Versioning",
        "Operations",
        "Pipeline Management",
        "Governance",
    ],
    "Model": ["Workflow"],
    # Currently not included:
    # "Notebook Environment": ["Notebook Environment"],
}

In [None]:
# One parenthesised DataFrame.query() clause per category: the row's category
# must match and its subcategory must be in that category's list.
query_expressions = [
    f"(category == '{category_name}' & subcategory in {allowed_subcategories})"
    for category_name, allowed_subcategories in relevant_categories_subcategories.items()
]

In [None]:
# A row is kept when it satisfies any of the per-category clauses (joined with OR).
combined_query = "|".join(query_expressions)
filtered_result = df_normalized.query(combined_query).reset_index(drop=True)

### Save result of processed stage

In [None]:
# Write the filtered table to the processed-stage CSV, leaving out the index column.
filtered_result.to_csv("data/03_processed/lfai_landscape.csv", index=False)

In [None]:
# Display (rows, columns) of the filtered table.
filtered_result.shape