# Machine Learning Glossary 

> **Note:** Portions of this page are reproduced from work created and shared by Google and used according to terms described in the Creative Commons 4.0 Attribution License.

[Google for Developers](https://developers.google.com/terms/site-policies)

#### Libraries

In [9]:
# processing:
import os
import requests
import pandas as pd
from dotenv import load_dotenv
from bs4 import BeautifulSoup

# neo4j:
from neo4j import GraphDatabase

## Parsing

#### Text

In [11]:
# webpage
url = 'https://developers.google.com/machine-learning/glossary'
# a timeout keeps the cell from hanging indefinitely if the server never answers
response = requests.get(url, timeout=30)
# fail loudly on a 4xx/5xx status instead of parsing an error page as the glossary
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# extracting a specific section by class name:
section = soup.find('div', class_='devsite-article-body clearfix')
if section is None:
    # soup.find returns None when nothing matches; stop here with a clear message
    # rather than letting a later attribute access fail on None
    raise RuntimeError("glossary body <div> not found; the page layout may have changed")

# preview of the section's plain text (first character skipped)
text = section.get_text()
print(text[1:90])


This glossary defines general machine learning terms, plus
terms specific to TensorFlow.


#### Terms

In [20]:
# every glossary entry is introduced by an <h2 class="hide-from-toc"> heading
headers = section.find_all('h2', class_='hide-from-toc')

# the term name is read from each heading's 'data-text' attribute, with surrounding
# whitespace removed; headings lacking the attribute are skipped instead of
# crashing on None.strip()
terms = [
    header.get('data-text').strip()
    for header in headers
    if header.get('data-text') is not None
]

terms[:5]

['ablation', 'A/B testing', 'accelerator chip', 'accuracy', 'action']

In [21]:
# number of term names collected from the glossary headings
len(terms)

597

In [37]:
# write one term per line; the explicit encoding keeps the file identical across
# platforms even when a term contains non-ASCII characters
with open('terms.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{term}\n" for term in terms)

In [66]:
# substring to look for in the term names (the comparison is case-sensitive)
token = 'normal'

# list every term that contains the token anywhere in its name;
# for a prefix-only search, test name.startswith(token) instead
[name for name in terms if token in name]

['batch normalization', 'normalization', 'Z-score normalization']

#### Glossary

In [89]:
def _collect_description(header):
    """Return the description text that follows `header`, up to the next <h2>.

    Text is taken from non-empty <p> tags that do not contain a
    'glossary-anchor' link and from the items of <ul> lists. The pieces are
    joined with single spaces and the result is stripped.

    Args:
        header: the BeautifulSoup heading tag that introduces a glossary term.

    Returns:
        The collected description as one string ('' when nothing was found).
    """
    parts = []
    for tag in header.find_all_next():
        # the next 'h2' starts another entry, so this description is complete
        if tag.name == 'h2':
            break

        # find_all_next also visits tags nested inside a list; their text is
        # already included through the enclosing top-level <ul>, so taking it
        # again here would duplicate it in the description
        if tag.find_parent('ul') is not None:
            continue

        if tag.name == 'p':
            text = tag.get_text()
            if text.strip() and not tag.find('a', class_='glossary-anchor'):
                parts.append(text)
        elif tag.name == 'ul':
            # direct items only: get_text() on an item already covers nested lists
            parts.extend(li.get_text() for li in tag.find_all('li', recursive=False))

    return " ".join(parts).strip()


# term name -> description text
glossary = {}

for header in headers:
    data_text = header.get('data-text')
    if data_text is None:
        # no term name to file the description under
        continue
    # strip the name so the keys match the entries of the `terms` list
    glossary[data_text.strip()] = _collect_description(header)

# convert the glossary into a DataFrame
data = pd.DataFrame(list(glossary.items()), columns=['Term', 'Description'])

# save the DataFrame as a CSV file
data.to_csv('machine_learning_glossary.csv', index=False)

## Knowledge Graph

#### .env

In [69]:
# load the variables defined in a local .env file into the process environment
load_dotenv()

True

In [70]:
# Neo4j connection settings read from the environment; each is None when unset
uri = os.environ.get('NEO4J_URI')
username = os.environ.get('NEO4J_USERNAME')
password = os.environ.get('NEO4J_PASSWORD')
database = os.environ.get('NEO4J_DATABASE')

#### neo4j 

In [76]:
# create the Neo4j driver for the configured URI, authenticating with (username, password)
driver = GraphDatabase.driver(uri, auth = ( username, password ))

In [None]:
# function to create nodes and relationships
def create_graph(tx, term, description):
    """Upsert the node for `term` and link it to every term its description mentions.

    A term counts as mentioned when its lower-cased name occurs anywhere in the
    lower-cased description (plain substring match). Candidate names are read
    from the module-level `data['Term']` column.

    Args:
        tx: transaction object; only its run(query, **parameters) method is used.
        term: name of the glossary term being written.
        description: description text stored on the term's node.
    """
    # key the node on its name only and set the description separately: merging
    # on both properties would create a second node for the same term whenever
    # its description text changes between runs
    tx.run(
        "MERGE (t:Term {name: $term}) "
        "SET t.description = $description",
        term=term, description=description,
    )

    # lower-case the description once instead of once per candidate term
    haystack = description.lower()
    mentioned = [
        other_term
        for other_term in data['Term']
        # an empty name would match every description, so it is ignored
        if other_term and other_term != term and other_term.lower() in haystack
    ]
    if not mentioned:
        return

    # MERGE (not MATCH) the mentioned nodes: terms later in the table do not
    # exist yet when this term is processed, and MATCH would silently drop those
    # links. Their description is filled in when their own row is written.
    # UNWIND creates all links in a single query.
    tx.run(
        "MATCH (t1:Term {name: $term}) "
        "UNWIND $mentioned AS other_term "
        "MERGE (t2:Term {name: other_term}) "
        "MERGE (t1)-[:MENTIONS]->(t2)",
        term=term, mentioned=mentioned,
    )

# adding nodes and relationships to the graph
with driver.session(database=database) as session:
    # one write transaction per glossary row
    for term, description in zip(data['Term'], data['Description']):
        session.execute_write(create_graph, term, description)

# the driver is intentionally left open here: the next cell passes the same
# `driver` object to create_linear_regression_graph, which needs a live driver

In [None]:
def _rebuild_linear_regression_graph(tx):
    """Delete every node, then create the example concept graph, via `tx`."""
    # clear the graph
    tx.run("MATCH (n) DETACH DELETE n")

    # create nodes and relationships
    tx.run("""
        CREATE
        (ml:Concept {name: 'Machine Learning'}),
        (coreConcepts:Concept {name: 'Core Concepts'}),
        (modelTypes:Concept {name: 'Model Types'}),
        (dataPrep:Concept {name: 'Data Preparation'}),
        (featureEng:Concept {name: 'Feature Engineering'}),
        (performanceEval:Concept {name: 'Performance Evaluation'}),
        (optimization:Concept {name: 'Optimization Techniques'}),
        (lr:Concept {name: 'Linear Regression'}),
        (mse:Concept {name: 'Mean Squared Error'}),
        (rmse:Concept {name: 'Root Mean Squared Error'}),
        (sgd:Concept {name: 'Stochastic Gradient Descent'}),
        (norm:Concept {name: 'Normalization'}),
        (featSel:Concept {name: 'Feature Selection'}),
        
        (lr)-[:IS_A]->(modelTypes),
        (modelTypes)-[:PART_OF]->(ml),
        (sgd)-[:USED_FOR]->(lr),
        (mse)-[:USED_IN]->(lr),
        (rmse)-[:SPECIFIC_FORM]->(mse),
        (dataPrep)-[:USED_IN]->(lr),
        (norm)-[:IS_A]->(dataPrep),
        (featSel)-[:IS_A]->(featureEng),
        (featureEng)-[:PART_OF]->(dataPrep),
        (performanceEval)-[:EVALUATES]->(lr),
        (sgd)-[:DEPENDS_ON]->(mse),
        (optimization)-[:ENCOMPASSES]->(sgd),
        (optimization)-[:PART_OF]->(coreConcepts),
        (coreConcepts)-[:PART_OF]->(ml)
        """)


def create_linear_regression_graph(driver, database=None):
    """Replace the database contents with a small Linear Regression concept graph.

    WARNING: this deletes every node and relationship in the target database
    before creating the example graph.

    Args:
        driver: Neo4j driver used to open the session.
        database: name of the database to write to; None (the default) lets the
            driver use its default database, as the original call did.
    """
    with driver.session(database=database) as session:
        # run the clear and the create in one write transaction, so a failure
        # while creating does not leave the database emptied
        session.execute_write(_rebuild_linear_regression_graph)


# create the graph based on Linear Regression and related concepts as an example;
# target the database configured in the environment, like the glossary session above
create_linear_regression_graph(driver, database=database)
print("Linear Regression graph has been created.")

# last use of the driver in this notebook: release its connections
driver.close()
