In [None]:
from selenium.webdriver.common.by import By
import undetected_chromedriver.v2 as uc
from pyvirtualdisplay import Display
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModel
from transformers import pipeline
from neo4j import GraphDatabase
import torch

def extract_text_by_class(class_name):
    """
    Extract text from the first element with the specified class name.

    The lookup runs against the module-level Selenium driver ``wd``.

    Args:
        class_name: CSS class name passed to ``By.CLASS_NAME``.

    Returns:
        The ``.text`` of the first matching element, or ``""`` when the lookup
        fails for any ordinary reason (best-effort behaviour).
    """
    try:
        content = wd.find_element(By.CLASS_NAME, class_name)
        return content.text
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop a running crawl.
        return ""


def extract_links_by_xpath(xpath):
    """
    Collect normalised, de-duplicated ``href`` targets of elements matched by `xpath`.

    The lookup runs against the module-level Selenium driver ``wd``.

    Each link is normalised by dropping the ``#fragment``, the ``?query`` and
    any trailing ``/``. Links are skipped when they have no ``href``, are
    ``javascript:void(0)``, contain ``mailto``, are longer than 300 characters,
    or point at an image / data / media file.

    Args:
        xpath: XPath expression selecting the anchor elements.

    Returns:
        A list of unique link strings (order not guaranteed), or ``[]`` when
        the lookup fails.
    """
    # File types that are not crawlable pages.
    skipped_suffixes = (
        ".png",
        ".json",
        ".txt",
        ".svg",
        ".ipynb",
        ".jpg",
        ".pdf",
        ".mp4",
    )
    links = set()
    try:
        a_elems = wd.find_elements(By.XPATH, xpath)
        for elem in a_elems:
            link = elem.get_attribute("href")
            # get_attribute may return None; previously that raised inside the
            # loop and the blanket except discarded every link on the page.
            if not link or link == "javascript:void(0)":
                continue
            if "mailto" in link or len(link) > 300:
                continue
            # Remove anchors, then parameters, then the trailing forward slash
            link = link.split("#")[0].split("?")[0].rstrip("/")
            # Filter on the normalised URL so "logo.png?v=2" is rejected too;
            # the comparison is case-insensitive.
            if not link or link.lower().endswith(skipped_suffixes):
                continue
            links.add(link)
        return list(links)
    except Exception:
        # Best-effort: report no links, but let KeyboardInterrupt propagate.
        return []

In [None]:


# Hugging Face model id shared by the tokenizer and the token classifier.
KEYWORD_MODEL_NAME = "yanekyuk/bert-uncased-keyword-extractor"

tokenizer = AutoTokenizer.from_pretrained(KEYWORD_MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(KEYWORD_MODEL_NAME)

# Token-classification ("ner") pipeline that extract_keywords() consumes.
nlp = pipeline("ner", tokenizer=tokenizer, model=model)


def extract_keywords(text):
    """
    Extract keywords and construct them back from tokens.

    Runs the module-level ``nlp`` pipeline over `text` and stitches the
    returned word pieces back together: a token tagged ``I-KEY`` continues the
    current keyword (``##`` pieces are glued on, other words are joined with a
    space); any other tag starts a new keyword.

    Args:
        text: Input text to analyse.

    Returns:
        A list of unique keyword strings (order not guaranteed). The list is
        empty when the pipeline yields no tokens.
    """
    keywords = set()
    current = ""
    for token in nlp(text):
        word = token["word"]
        if token["entity"] == "I-KEY":
            current += word[2:] if word.startswith("##") else f" {word}"
        else:
            if current:
                keywords.add(current.strip())
            current = word
    # Add the last keyword, but only if one was actually collected; the
    # unconditional append used to produce [""] for keyword-free text.
    # strip() removes the leading space left when an I-KEY token comes first.
    if current.strip():
        keywords.add(current.strip())
    return list(keywords)

In [None]:
# Smoke test: run the keyword extractor on one sample sentence and display the result.
extract_keywords(
    """
Broadcom agreed to acquire cloud computing company VMware in a $61 billion (€57bn) cash-and stock deal.
"""
)

In [None]:
# Sentence-embedding model read by generate_embeddings(). NOTE: this rebinds the
# name `model`, which the keyword-extractor cell also assigns before building `nlp`.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


def generate_embeddings(text):
    """
    Encode `text` with the module-level ``model`` and return plain Python floats.

    Args:
        text: The text to embed.

    Returns:
        A list of ``float`` values built from ``model.encode(text).tolist()``.
    """
    vector = model.encode(text)
    return list(map(float, vector.tolist()))

In [None]:
# Smoke test: embed a multi-paragraph sample text and display the resulting vector.
generate_embeddings(
    """
Web APIs are a huge opportunity to access and integrate data from any sources with your graph. Most of them provide the data in JSON format.

The Load JSON procedures retrieve data from URLs or maps and turn it into map value(s) for Cypher to consume. Cypher has support for deconstructing nested documents with dot syntax, slices, UNWIND etc. so it is easy to turn nested data into graphs.

Sources with multiple JSON objects (JSONL,JSON Lines) in a stream, like the streaming Twitter format or the Yelp Kaggle dataset, are also supported,
"""
)

In [None]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from sentence_transformers import SentenceTransformer

def extract_text_by_class(class_name):
    """
    Extract text from the first element with the specified class name.

    Uses the module-level Selenium driver ``wd``. This redefinition shadows
    the identically named function from the first cell.

    Args:
        class_name: CSS class name passed to ``By.CLASS_NAME``.

    Returns:
        The element's ``.text``, or ``""`` if the lookup fails (best-effort).
    """
    try:
        return wd.find_element(By.CLASS_NAME, class_name).text
    except Exception:
        # Not a bare ``except``: Ctrl-C (KeyboardInterrupt) must be able to
        # stop the crawl loop that calls this helper.
        return ""

def extract_links_by_xpath(xpath):
    """
    Return the unique, normalised ``href`` values of the elements matched by `xpath`.

    Uses the module-level Selenium driver ``wd``. This redefinition shadows
    the identically named function from the first cell.

    Normalisation removes the ``#fragment``, the ``?query`` and a trailing
    ``/``. Links with no ``href``, ``javascript:void(0)`` links, links
    containing ``mailto``, links longer than 300 characters and links to
    image / data / media files are left out.

    Args:
        xpath: XPath expression selecting the anchor elements.

    Returns:
        A list of unique link strings (order not guaranteed), or ``[]`` when
        the lookup fails.
    """
    # Extensions of resources that are not crawlable pages.
    skipped_suffixes = (
        ".png",
        ".json",
        ".txt",
        ".svg",
        ".ipynb",
        ".jpg",
        ".pdf",
        ".mp4",
    )
    links = set()
    try:
        for elem in wd.find_elements(By.XPATH, xpath):
            raw_link = elem.get_attribute("href")
            # A missing href comes back as None; skip it instead of raising
            # and losing every other link on the page.
            if not raw_link or raw_link == "javascript:void(0)":
                continue
            if "mailto" in raw_link or len(raw_link) > 300:
                continue
            # Strip the anchor, then the parameters, then the trailing slash.
            link = raw_link.split("#")[0].split("?")[0].rstrip("/")
            # Check the extension after normalising (and case-insensitively),
            # otherwise "file.pdf?download=1" would pass the filter.
            if not link or link.lower().endswith(skipped_suffixes):
                continue
            links.add(link)
        return list(links)
    except Exception:
        # Best-effort: report no links, but let KeyboardInterrupt propagate.
        return []

def generate_embeddings(text):
    """
    Embed `text` using the module-level ``model``.

    Returns:
        The encoded vector converted via ``.tolist()`` with every component
        cast to a built-in ``float``.
    """
    components = model.encode(text).tolist()
    as_floats = []
    for component in components:
        as_floats.append(float(component))
    return as_floats


from selenium.webdriver.chrome.service import Service

# Upper bound on the number of pages stored per crawl run.
MAX_PAGES = 20

# Load the embedding model before launching Chrome so a failed model load
# does not leave a browser process behind.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
# Selenium 4 takes the chromedriver path through a Service object; passing it
# as the first positional argument is deprecated and rejected by newer releases.
wd = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()), options=options
)

entry_url = "https://neo4j.com/docs"
data = dict()
visit_list = [entry_url]
already_visited = []
visited_links_count = 0  # Counter of pages stored in `data` (redirect stubs excluded)

# NOTE: this loop calls extract_keywords(), which is defined in an earlier
# cell; that cell must have been run first.
try:
    # Continue until the frontier is empty or MAX_PAGES pages have been stored.
    while visit_list and visited_links_count < MAX_PAGES:
        # pop() takes the most recently discovered link (depth-first order).
        current_url = visit_list.pop()
        if current_url in already_visited:
            continue
        print(current_url)
        try:
            wd.get(current_url)
        except Exception:
            print(f"Couldn't open {current_url}")
            already_visited.append(current_url)
            continue

        # Normalise the landed URL exactly like extract_links_by_xpath does
        # (fragment, then query, then trailing slash). The previous order
        # (rstrip first) left a trailing "/" on URLs such as ".../docs/#top"
        # and reported them as redirects.
        try:
            actual_url = wd.current_url.split("#")[0].split("?")[0].rstrip("/")
        except Exception:
            actual_url = current_url
        if actual_url != current_url:
            # Record the requested URL as a redirect stub ...
            data[current_url] = {
                "links": [],
                "text": None,
                "embeddings": [],
                "keywords": [],
                "redirects": [actual_url],
            }
            already_visited.append(current_url)
            # ... and continue processing under the URL we actually landed on,
            # unless that page has already been crawled.
            current_url = actual_url
            if current_url in already_visited:
                continue

        # Try the known content containers in order; the first non-empty wins.
        text = ""
        for class_name in ("content", "article", "page", "single-user-story"):
            text = extract_text_by_class(class_name)
            if text:
                break

        # Pages served by the site's "not found" template are marked as "404".
        try:
            if "Sorry, page not found" in wd.find_element(By.TAG_NAME, "body").text:
                text = "404"
        except Exception:
            pass

        if text:
            embeddings = generate_embeddings(text)
            keywords = extract_keywords(text)
        else:
            embeddings = []
            keywords = []

        # Same fallback idea for links: first XPath that yields anything wins.
        links = []
        for xpath in (
            "//div[@class='content']//a[@href]",
            "//article[@class='article']//a[@href]",
            "//article//a[@href]",
        ):
            links = extract_links_by_xpath(xpath)
            if links:
                break

        data[current_url] = {
            "links": [link for link in links if link != current_url],
            "text": text,
            "embeddings": embeddings,
            "keywords": keywords,
            "redirects": [],
        }

        already_visited.append(current_url)
        visited_links_count += 1
        # Only queue unvisited neo4j.com links, excluding community and sandbox.
        visit_list.extend(
            [
                link
                for link in links
                if ("neo4j.com" in link)
                and (link not in already_visited)
                and ("community.neo4j.com" not in link)
                and ("sandbox.neo4j.com" not in link)
            ]
        )
finally:
    # Always shut the browser down, even when the crawl is interrupted or fails.
    wd.quit()


In [None]:
# Check the Neo4j connection

import os

# Connection settings are read from the environment so the password is not
# stored in the notebook. NOTE(review): the password that used to be
# hard-coded here should be rotated.
uri = os.environ.get("NEO4J_URI", "neo4j+s://76cc1e9c.databases.neo4j.io")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ["NEO4J_PASSWORD"]

try:
    # Using the driver as a context manager closes it after the check; the
    # previous version left the driver open.
    with GraphDatabase.driver(uri, auth=(user, password)) as driver:
        with driver.session() as session:
            result = session.run("RETURN 1")
            for record in result:
                print(record)
except Exception as e:
    print("Error:", e)

In [None]:
# Run only when using the GDS client

import json
import os

from graphdatascience import GraphDataScience

# Persist the crawl results; the context manager closes the file even if the
# write fails.
j = json.dumps(data)
with open("neo4j_docs.json", "w") as f:
    f.write(j)

# Connection settings are read from the environment so the password is not
# stored in the notebook. NOTE(review): the password that used to be
# hard-coded here should be rotated.
host = os.environ.get("NEO4J_URI", "neo4j+s://76cc1e9c.databases.neo4j.io")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ["NEO4J_PASSWORD"]

gds = GraphDataScience(host, auth=(user, password))

# Uniqueness constraints backing the MERGE lookups used by the import query.
gds.run_cypher(
    """
CREATE CONSTRAINT IF NOT EXISTS FOR (p:Page) REQUIRE p.url IS UNIQUE;
"""
)

gds.run_cypher(
    """
CREATE CONSTRAINT IF NOT EXISTS FOR (k:Keyword) REQUIRE k.name IS UNIQUE;
"""
)

In [None]:
# Run only when using the plain neo4j driver (instead of the GDS client)

import json
import os

from neo4j import GraphDatabase

# Persist the crawl results once (the file used to be written twice with the
# same content).
j = json.dumps(data)
with open("neo4j_docs.json", "w") as f:
    f.write(j)

# Connection settings are read from the environment so the password is not
# stored in the notebook. NOTE(review): the password that used to be
# hard-coded here should be rotated.
uri = os.environ.get("NEO4J_URI", "neo4j+s://76cc1e9c.databases.neo4j.io")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ["NEO4J_PASSWORD"]

# The driver context manager closes the connection even if a statement fails.
with GraphDatabase.driver(uri, auth=(user, password)) as driver:
    with driver.session() as session:
        # Uniqueness constraints backing the MERGE lookups used by the import.
        session.run("""
        CREATE CONSTRAINT IF NOT EXISTS FOR (p:Page) REQUIRE p.url IS UNIQUE;
    """)

        session.run("""
        CREATE CONSTRAINT IF NOT EXISTS FOR (k:Keyword) REQUIRE k.name IS UNIQUE;
    """)

In [None]:
# Run only when using the GDS client (requires `gds` and `data` from earlier cells)

import_query = """

UNWIND $data AS row
MERGE (p:Page {url:row.url})
SET p.embedding = row.embedding,
    p.has_text = row.has_text,
    p.is_404 = row.is_404
FOREACH (l in row.links    | MERGE (p1:Page {url:l}) MERGE (p)-[:LINKS_TO]->(p1))
FOREACH (k in row.keywords | MERGE (k1:Keyword {name:k}) MERGE (p)-[:HAS_KEYWORD]->(k1))
FOREACH (r in row.redirects| MERGE (r1:Page {url:r}) MERGE (p)-[:REDIRECTS]->(r1))

"""

# Number of rows sent to Neo4j per UNWIND call.
BATCH_SIZE = 500

x = 1  # batch counter, used only for logging
params = []
for key in data:
    page = data[key]
    # (A stray trailing comma after this call used to turn the statement into
    # a throwaway one-element tuple.)
    params.append(
        {
            "url": key,
            "embedding": page["embeddings"],
            "keywords": page["keywords"],
            "links": page["links"],
            "has_text": bool(page["text"]),
            "is_404": page["text"] == "404",
            "redirects": page["redirects"],
        }
    )

    if len(params) == BATCH_SIZE:
        gds.run_cypher(import_query, {"data": params})
        params = []
        # Logging
        print(f"Importing {x} batch")
        x += 1

# Flush the last, partially filled batch; skipped when nothing is left.
if params:
    gds.run_cypher(import_query, {"data": params})

In [None]:
# Run only when using the plain neo4j driver (requires `data` from the crawl cell)

import json
import os

from neo4j import GraphDatabase

# Connection settings are read from the environment so the password is not
# stored in the notebook. NOTE(review): the password that used to be
# hard-coded here should be rotated.
uri = os.environ.get("NEO4J_URI", "neo4j+s://76cc1e9c.databases.neo4j.io")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ["NEO4J_PASSWORD"]

import_query = """
UNWIND $data AS row
MERGE (p:Page {url:row.url})
SET p.embedding = row.embedding,
    p.has_text = row.has_text,
    p.is_404 = row.is_404
FOREACH (l in row.links    | MERGE (p1:Page {url:l}) MERGE (p)-[:LINKS_TO]->(p1))
FOREACH (k in row.keywords | MERGE (k1:Keyword {name:k}) MERGE (p)-[:HAS_KEYWORD]->(k1))
FOREACH (r in row.redirects| MERGE (r1:Page {url:r}) MERGE (p)-[:REDIRECTS]->(r1))
"""

# Number of rows sent to Neo4j per UNWIND call.
BATCH_SIZE = 500

x = 1  # batch counter, used only for logging
params = []

# Driver and session are context-managed so both are closed even if an import
# batch raises.
with GraphDatabase.driver(uri, auth=(user, password)) as driver:
    with driver.session() as session:
        for key in data:
            page = data[key]
            params.append(
                {
                    "url": key,
                    "embedding": page["embeddings"],
                    "keywords": page["keywords"],
                    "links": page["links"],
                    "has_text": bool(page["text"]),
                    "is_404": page["text"] == "404",
                    "redirects": page["redirects"],
                }
            )

            if len(params) == BATCH_SIZE:
                # consume() waits for the batch to finish and surfaces errors
                # before the batch is logged as imported.
                session.run(import_query, {"data": params}).consume()
                params = []
                # Logging
                print(f"Importing {x} batch")
                x += 1

        # Flush the last, partially filled batch; skipped when nothing is left.
        if params:
            session.run(import_query, {"data": params}).consume()


In [None]:
import os

from neo4j import GraphDatabase

# Connection settings are read from the environment so the password is not
# stored in the notebook. NOTE(review): the password that used to be
# hard-coded here should be rotated.
uri = os.environ.get("NEO4J_URI", "neo4j+s://76cc1e9c.databases.neo4j.io")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ["NEO4J_PASSWORD"]

# Label and relationship-type statistics as yielded by apoc.meta.stats().
query = """
CALL apoc.meta.stats()
YIELD labels, relTypesCount
"""

# The driver context manager closes the connection even if the query raises.
with GraphDatabase.driver(uri, auth=(user, password)) as driver:
    with driver.session() as session:
        result = session.run(query)
        for record in result:
            print(record["labels"], record["relTypesCount"])

In [None]:
import os

from neo4j import GraphDatabase

# Connection settings are read from the environment so the password is not
# stored in the notebook. NOTE(review): the password that used to be
# hard-coded here should be rotated.
uri = os.environ.get("NEO4J_URI", "neo4j+s://76cc1e9c.databases.neo4j.io")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ["NEO4J_PASSWORD"]

# Count Page nodes grouped by their has_text property.
query = """
MATCH (p:Page)
RETURN p.has_text AS has_text,
       count(*) AS count
"""

# The driver context manager closes the connection even if the query raises.
with GraphDatabase.driver(uri, auth=(user, password)) as driver:
    with driver.session() as session:
        result = session.run(query)
        for record in result:
            print(record["has_text"], record["count"])


In [None]:
import os

from neo4j import GraphDatabase

# Connection settings are read from the environment so the password is not
# stored in the notebook. NOTE(review): the password that used to be
# hard-coded here should be rotated.
uri = os.environ.get("NEO4J_URI", "neo4j+s://76cc1e9c.databases.neo4j.io")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ["NEO4J_PASSWORD"]

# Pages whose has_text property is unset, ranked by the number of incoming
# LINKS_TO / REDIRECTS relationships (top 5).
query = """
MATCH (p:Page)
WHERE p.has_text IS NULL
WITH p, [(p)<-[:LINKS_TO|REDIRECTS]-() | 1] AS links
RETURN p.url AS page, size(links) AS links_count
ORDER BY links_count DESC
LIMIT 5
"""

# The driver context manager closes the connection even if the query raises.
with GraphDatabase.driver(uri, auth=(user, password)) as driver:
    with driver.session() as session:
        result = session.run(query)
        for record in result:
            print(record["page"], record["links_count"])


In [None]:
# GDS version
# 
#  gds.run_cypher("""
#  MATCH (:Page)-[:LINKS_TO|REDIRECTS]->(:Page{is_404:true})
#  RETURN count(*) AS brokenLinkCount
#  """)

In [None]:
import os

from neo4j import GraphDatabase

# Connection settings are read from the environment so the password is not
# stored in the notebook. NOTE(review): the password that used to be
# hard-coded here should be rotated.
uri = os.environ.get("NEO4J_URI", "neo4j+s://76cc1e9c.databases.neo4j.io")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ["NEO4J_PASSWORD"]

# Count LINKS_TO / REDIRECTS relationships that point at a page flagged is_404.
query = """
MATCH (:Page)-[:LINKS_TO|REDIRECTS]->(:Page{is_404:true})
RETURN count(*) AS brokenLinkCount
"""

# The driver context manager closes the connection even if the query raises.
with GraphDatabase.driver(uri, auth=(user, password)) as driver:
    with driver.session() as session:
        result = session.run(query)
        for record in result:
            print(record["brokenLinkCount"])


In [None]:
# from graphdatascience import GraphProjector

# Create the GDS graph projection
# graph_projector = GraphProjector(host, auth=(user, password))
# G = graph_projector.project_graph(
#    graph="structure",
#    node_label="Page",
#    relationship_types=["LINKS_TO", "REDIRECTS"]
#)

In [None]:
# ****** PAGERANK WITH GDS (IMPORTANT) ****

#from graphdatascience import PageRank

# Compute the PageRank scores
#pagerank = PageRank()
#pr_df = pagerank.fit_transform(G)

# Rename the score column to "pagerank"
#pr_df.rename(columns={"score": "pagerank"}, inplace=True)

# Merge the DataFrames
#combined_df = df.merge(pr_df, on="nodeId")

# Sort by PageRank
#combined_df.sort_values("pagerank", ascending=False, inplace=True)

In [None]:
import os

from neo4j import GraphDatabase

# Connection settings are read from the environment so the password is not
# stored in the notebook. NOTE(review): the password that used to be
# hard-coded here should be rotated.
uri = os.environ.get("NEO4J_URI", "neo4j+s://76cc1e9c.databases.neo4j.io")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ["NEO4J_PASSWORD"]

# List every page together with the names of its attached keywords.
query = """
MATCH (p:Page)-[:HAS_KEYWORD]->(k:Keyword)
RETURN p.url AS page,
       COLLECT(k.name) AS keywords
"""

# The driver context manager closes the connection even if the query raises.
with GraphDatabase.driver(uri, auth=(user, password)) as driver:
    with driver.session() as session:
        result = session.run(query)
        for record in result:
            print(record["page"], record["keywords"])