# Modularization options with Neo4j (executed)

Short demo notebook that integrates various data sources into one graph via the Neo4j graph database.

## Show current package structure

In [None]:
%%bash
# Print the directory tree of the Java sources to show the current package layout.
tree ./spring-framework-petclinic/src/main/java/

## Set up connection to Neo4j
Needs a running Neo4j instance in the background

### Establish connection to Neo4j graph database

In [None]:
from neo4j import GraphDatabase

import os

# Connection settings can be overridden through environment variables, so real
# credentials never have to be stored in the notebook. The defaults are the
# previously hard-coded values for a local Neo4j instance.
URI = os.environ.get("NEO4J_URI", "bolt://localhost")
AUTH = (
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "neo4j"),
)

driver = GraphDatabase.driver(URI, auth=AUTH)
# Fail fast in this cell if the database cannot be reached with these settings.
driver.verify_connectivity()
# This single session is reused by all following cells.
session = driver.session()

Clean data from previous run

In [None]:
# Wipe the graph left over from a previous run. DETACH DELETE removes every
# node together with all of its relationships (incoming and outgoing), so no
# separate relationship handling is required.
query="""
MATCH (a) DETACH DELETE a
"""
session.run(query);

In [None]:
# Delete any nodes that are still present in the database.
delete_nodes = """
   MATCH (a) DELETE a
"""
session.run(delete_nodes);

# Data import

## Import dependencies (from jdeps)

### Generating dataset

You can generate this kind of data with any tool that can show you dependencies between your classes. E.g. in Java, using `jdeps`:
    
    
`jdeps -e 'org.springframework.samples.petclinic.*' -v target/classes/ > spring_petclinic_deps.txt`

*Caution: `jdeps` only approximates the dependencies. For example, types that are used in generics are not reported.*

### Show dataset

In [None]:
# Show the first lines of the raw jdeps output file.
!head data/spring_petclinic_deps.txt

### Import dataset to pandas

In [None]:
import pandas as pd

# Read the jdeps output into a one-column frame named "raw".
# NOTE(review): "\r" is presumably used as separator because it never occurs
# inside a line, so each line ends up in a single field - confirm.
deps = pd.read_csv(
    "data/spring_petclinic_deps.txt",
    sep="\r",
    names=["raw"],
)
deps.head()

### Normalize data
*(always a messy thing...)*

In [None]:
# Class entries begin with three whitespaces; keep only those rows.
# The explicit copy yields an independent frame, so adding columns below does
# not trigger pandas' SettingWithCopyWarning.
deps = deps[deps['raw'].str.startswith("   ")].copy()
# Split each row at the first "->" into source (column 0) and the rest (column 1).
splitted = deps['raw'].str.split("->", n=1, expand=True)
# Source type: strip whitespace and cut off inner-class suffixes ("Outer$Inner").
# The raw string keeps the regex \$ without the invalid "\$" escape sequence.
deps['from'] = splitted[0].str.strip().str.split(r"\$").str[0]
# Split the right-hand side on blanks into at most three parts; index 1 is
# taken as the target type and index 2 as the artifact/type description.
splitted_2 = splitted[1].str.split(" ", n=2)
# Get rid of inner classes on the target side as well.
deps['to'] = splitted_2.str[1].str.split(r"\$").str[0]
deps['type'] = splitted_2.str[2].str.strip()
# Simple class name: last segment of the fully qualified source name.
deps['name'] = deps['from'].str.split(".").str[-1]
deps.head()

### Transform data for source code file names into dictionary
To load data into Neo4j, we need a dict-like data structure. We also drop duplicated entries to avoid nodes with the same data.

In [None]:
# One record per distinct (fully qualified name, simple name) pair.
unique_types = deps[['from', 'name']].drop_duplicates()
names_data = unique_types.to_dict(orient='records')
names_data[:5]

### Import data into Neo4j

In [None]:
# Create one :Type node per record and store its fully qualified and simple name.
create_types = """
    UNWIND $data as dep_name
    CREATE (t:Type)
    SET
        t.fqn = dep_name.from,
        t.name = dep_name.name
    RETURN t.fqn, t.name
"""
created_types = session.run(create_types, data=names_data).to_df()
created_types.head()

### Create index for `fqn` for faster queries
This cell supports both older and newer versions of Neo4j when creating the index.

In [None]:
from neo4j.exceptions import ClientError

# Servers speaking Bolt protocol version 3 or lower get the legacy index
# syntax; newer servers get the FOR ... ON form.
if driver.get_server_info().protocol_version[0] <= 3:
    query = "CREATE INDEX ON :Type(fqn)"
else:
    query = "CREATE INDEX FOR (t:Type) ON (t.fqn)"

# The cleanup cells delete nodes and relationships but not the index, so on a
# repeated run the index may already exist. Consume the result to surface
# errors in this cell and tolerate only the "already exists" case.
INDEX_EXISTS_CODES = {
    "Neo.ClientError.Schema.EquivalentSchemaRuleAlreadyExists",
    "Neo.ClientError.Schema.IndexAlreadyExists",
}
try:
    session.run(query).consume()
except ClientError as error:
    if error.code not in INDEX_EXISTS_CODES:
        raise

### Transform data for dependencies into a dictionary

In [None]:
# Dependency edges as {"from": ..., "to": ...} records for the Cypher import.
edge_columns = ['from', 'to']
deps_data = deps[edge_columns].to_dict(orient='records')
deps_data[:3]

### Connect nodes that depend on each other

In [None]:
# Link every pair of existing :Type nodes named in a record; MERGE keeps a
# single DEPENDS_ON relationship per pair even if the pair occurs repeatedly.
link_types = """
    UNWIND $data as dep
    MATCH (from:Type {fqn : dep.from})
    MATCH (to:Type {fqn: dep.to})
    MERGE (from)-[:DEPENDS_ON]->(to)
    RETURN from.fqn, to.fqn
"""
linked_types = session.run(link_types, data=deps_data).to_df()
linked_types.head()

### Prepare results for dependency analysis

In [None]:
import json
# For every type collect itself (0 hops) plus its direct dependencies (1 hop).
# The pattern starts at (type:Type) directly: the former `MATCH (t:Type)` bound
# a variable that was never used and formed a cartesian product with the
# pattern below, multiplying intermediate rows without changing the result.
query="""
    MATCH (type:Type)-[:DEPENDS_ON*0..1]->(directDependency:Type)
    RETURN type.fqn as name, COLLECT(DISTINCT directDependency.fqn) as imports
"""

json_data = session.run(query).to_df().to_json(orient="records")
# Pretty-print only the first 500 characters to keep the cell output short.
print(json.dumps(json.loads(json_data), indent=4)[:500] + "\n...")

### Visualize dependencies

In [None]:
from IPython.core.display import HTML

from pathlib import Path

output_file = Path("output/source_code_file_dependencies.html")
# Create the target directory if needed so the cell also works on a fresh
# checkout where "output/" does not exist yet.
output_file.parent.mkdir(parents=True, exist_ok=True)

# Fill the template's ###JSON### placeholder with the query result.
with open("vis/template_hierarchical_edge_bundling_d3_inline.html") as html_template:
    html = html_template.read().replace("###JSON###", str(json_data))

with open(output_file, mode='w') as html_out:
    html_out.write(html)

# Render a link to the generated file as the cell output.
HTML('<a href="output/source_code_file_dependencies.html" target="_blank">Source Code Files Dependencies</a>')

## Import lines of code information

### Generate dataset

You can generate this data for various source code projects e.g. via `cloc`:

`cloc . --by-file --quiet --csv --out spring_petclinic_cloc.csv` (run from within `src/main/java/`)


### Show dataset

In [None]:
# Show the first lines of the cloc CSV report.
!head data/spring_petclinic_cloc.csv

### Import data

In [None]:
# Read the cloc report and drop its last row, working on an independent copy.
# NOTE(review): the dropped row is presumably cloc's summary line - confirm.
cloc_report = pd.read_csv("data/spring_petclinic_cloc.csv")
cloc = cloc_report.iloc[:-1].copy()
cloc.tail()

### Normalize data
`cloc` delivers file paths, but we need a fully qualified name ("fqn") that matches the existing data.

In [None]:
# Turn cloc's relative file paths into fully qualified class names, e.g.
#   "./org/example/Foo.java" -> "org.example.Foo"
# The "./" prefix and the ".java" suffix are anchored to the start and end of
# the path. Replacing them anywhere in the string also mangled package names
# beginning with "java" ("org/javax/Foo.java" used to become "orgx.Foo").
cloc['fqn'] = (
    cloc['filename']
    .str.replace(r"^\./", "", regex=True)
    .str.replace(r"\.java$", "", regex=True)
    .str.replace("/", ".", regex=False)
)
cloc.head()

### Generate dictionary

In [None]:
# Row-wise dictionaries, used as the $data parameter of the Cypher import.
cloc_records = cloc.to_dict(orient='records')
cloc_data = cloc_records
cloc_data[:2]

### Import into Neo4j

In [None]:
# Copy the code/comment/blank line counts onto the :Type node with the same fqn.
# Records without a matching node are skipped by the MATCH.
set_line_counts = """
    UNWIND $data as loc
    MATCH (t:Type {fqn : loc.fqn})
    SET
        t.lines = loc.code,
        t.comments = loc.comment,
        t.blanks = loc.blank
    RETURN t.fqn, t.name, t.lines, t.comments, t.blanks
"""
updated_types = session.run(set_line_counts, data=cloc_data).to_df()
updated_types.head()

## Import usage data

### Generate dataset

Coverage tools such as JaCoCo can give you a glimpse of what is happening while your application is being used.

See here for more details: https://www.feststelltaste.de/visualizing-production-coverage-with-jacoco-pandas-and-d3/

### Show dataset

In [None]:
# Show the first lines of the production coverage CSV file.
!head data/spring_petclinic_production_coverage_data.csv

### Import dataset

In [None]:
# Load the coverage report into a DataFrame.
coverage_csv = "data/spring_petclinic_production_coverage_data.csv"
coverage = pd.read_csv(coverage_csv)
coverage.head()

### Enrich data
Calculate the percentage of executed lines of code per class

In [None]:
# Total lines per class and the share of them that was executed.
covered_lines = coverage["LINE_COVERED"]
missed_lines = coverage["LINE_MISSED"]
coverage['lines'] = covered_lines + missed_lines
coverage['ratio'] = covered_lines / coverage['lines']
coverage.head()

### Normalize data

In [None]:
# Build the fully qualified name as "<PACKAGE>.<CLASS>".
package_names = coverage["PACKAGE"]
class_names = coverage["CLASS"]
coverage['fqn'] = package_names + "." + class_names
coverage.head()

### Import data into Neo4j

In [None]:
# Attach a :Measure:Coverage node with ratio and line count to each matching type.
coverage_records = coverage.to_dict(orient='records')
attach_coverage = """
    UNWIND $data as coverage
    MATCH (t:Type {fqn : coverage.fqn})
    MERGE (t)-[:HAS_MEASURE]->(m)
    SET 
        m:Measure:Coverage,
        m.ratio = coverage.ratio,
        m.lines = coverage.lines
    RETURN t.fqn as fqn, m.ratio as ratio, m.lines as lines
"""
attached_measures = session.run(attach_coverage, data=coverage_records).to_df()
attached_measures.head()

# Check data

## Query Nodes

### List measures

In [None]:
# Fetch every type that has a measure, with its line count and coverage ratio.
# The resulting frame is the working table for the modularization sections.
list_measures = """
   MATCH (n:Type)-[:HAS_MEASURE]->(m:Measure)
   RETURN n.fqn as fqn, n.lines as lines, m.ratio as ratio
"""
measures = session.run(list_measures)
module_options = measures.to_df()
module_options.head()

# Explore modularization options

## Explore existing modularization

### Extract existing main module structure

In [None]:
# The base module is the fifth dot-separated segment (index 4) of the fqn.
# NOTE(review): presumably the segment right after the application's root
# package - verify for projects with a different package depth.
fqn_segments = module_options['fqn'].str.split(".")
module_options['base_module'] = fqn_segments.str[4]
module_options.head()

### Add base module information to graph

In [None]:
# Create (or reuse) one :Base:Module node per module name and link each type to it.
module_records = module_options.to_dict(orient='records')
assign_base_modules = """
    UNWIND $data as module
    MATCH (t:Type {fqn : module.fqn})
    MERGE (m:Base:Module{name:module.base_module})
    MERGE (t)-[:BELONGS_TO]->(m)
    RETURN t.fqn as fqn, m.name as base_module
"""
assigned_modules = session.run(assign_base_modules, data=module_records).to_df()
assigned_modules.head()

### Add base module dependencies to graph

In [None]:
# Derive module-level USES relationships from type-level dependencies that
# cross module boundaries, and count the underlying type dependencies.
derive_base_uses = """
    MATCH (m1:Base:Module)<-[:BELONGS_TO]-(t1:Type)<-[:DEPENDS_ON]-(t2:Type)-[:BELONGS_TO]->(m2:Base:Module)
    WHERE m1 <> m2
    MERGE (m2)-[:USES]->(m1)
    RETURN DISTINCT(m2.name) as module, m1.name as dependent_module, COUNT(t2) as dependencies
"""
base_uses = session.run(derive_base_uses)
base_module_dependencies = base_uses.to_df()
base_module_dependencies.head()

### Query for basic module statistics

In [None]:
# Count the types that belong to each base module.
count_types_per_module = """
    MATCH (t:Type)-[:BELONGS_TO]->(m:Base:Module)
    RETURN m.name as module_name, count(t) as classes
"""
module_sizes = session.run(count_types_per_module).to_df()
module_sizes.head()

### Generate JSON output for d3 visualization

In [None]:
# Keep only the row values of the frame (no index or column labels).
split_view = base_module_dependencies.to_dict(orient='split')
json_data = split_view['data']
# Print just the first 200 characters of the pretty-printed JSON as a preview.
preview = json.dumps(json_data, indent=4)[:200]
print(preview + "\n...")

### Export data for visualization

In [None]:
from pathlib import Path

output_file = Path("output/chord_diagram_base_module.html")
# Create the target directory if needed so the cell also works on a fresh
# checkout where "output/" does not exist yet.
output_file.parent.mkdir(parents=True, exist_ok=True)

# Fill the template's ###JSON### placeholder. json.dumps emits real JSON
# (e.g. null instead of Python's None), unlike the list's Python repr.
with open("vis/template_chord_diagram_d3_inline.html") as html_template:
    html = html_template.read().replace("###JSON###", json.dumps(json_data))

with open(output_file, mode='w') as html_out:
    html_out.write(html)

# Render a link to the generated file as the cell output.
HTML('<a href="output/chord_diagram_base_module.html" target="_blank">Open Chord Diagram for Base Modules</a>')

## Explore alternative modularization options

In [None]:
# Show the current state of the per-type table before adding domain columns.
module_options.head()

### Extract domain based modules
*(here we use a very simple heuristic based on domain-related names that are part of the class names)*

In [None]:
domain_parts = ["Owner", "Pet", "Visit", "Vet", "Specialty", "Clinic"]

# Tag each row whose fqn contains a domain term. Terms are applied in list
# order, so a row matching several terms keeps the last one.
fqn_column = module_options['fqn']
for domain_part in domain_parts:
    matches = fqn_column.str.contains(domain_part)
    module_options.loc[matches, 'domain_part'] = domain_part

module_options.head()

### Come up with an alternative structure

In [None]:
# Proposed target module for each domain term.
domain_part_mapping = dict(
    Visit="Checkup",
    Pet="Patient",
    Owner="Patient",
    Vet="Doctor",
    Specialty="Doctor",
)

# Rows without a mapped domain term fall into the "Framework" module.
mapped_domains = module_options['domain_part'].map(domain_part_mapping)
module_options['domain'] = mapped_domains.fillna("Framework")
module_options.head()

### Add alternative modules to graph

In [None]:
# Create (or reuse) one :Domain:Module node per domain and link each type to it.
domain_records = module_options.to_dict(orient='records')
assign_domain_modules = """
    UNWIND $data as module
    MATCH (t:Type {fqn : module.fqn})
    MERGE (m:Domain:Module{name:module.domain})
    MERGE (t)-[:BELONGS_TO]->(m)
    RETURN t.fqn as fqn, m.name
"""
assigned_domains = session.run(assign_domain_modules, data=domain_records).to_df()
assigned_domains.head()

### Add domain module dependencies to graph

In [None]:
# Derive USES relationships between different domain modules and report the
# number of underlying type dependencies plus the summed line counts.
derive_domain_uses = """
    MATCH (m1:Domain:Module)<-[:BELONGS_TO]-(t1:Type)<-[:DEPENDS_ON]-(t2:Type)-[:BELONGS_TO]->(m2:Domain:Module)
    WHERE m1 <> m2
    MERGE (m2)-[:USES]->(m1)
    RETURN DISTINCT(m2.name) as module, m1.name as dependent_module, COUNT(t2) as dependencies, SUM(t2.lines) as lines
"""
domain_uses = session.run(derive_domain_uses)
domain_module_dependencies = domain_uses.to_df()
domain_module_dependencies.head()

### Visualize alternative modularization

In [None]:
from pathlib import Path

# Keep only the row values of the frame (no index or column labels).
json_data = domain_module_dependencies.to_dict(orient='split')['data']

output_file = Path("output/chord_diagram_domain_module.html")
# Create the target directory if needed so the cell also works on a fresh
# checkout where "output/" does not exist yet.
output_file.parent.mkdir(parents=True, exist_ok=True)

# Fill the template's ###JSON### placeholder. json.dumps emits real JSON
# (e.g. null instead of Python's None), unlike the list's Python repr.
with open("vis/template_chord_diagram_d3_inline.html") as html_template:
    html = html_template.read().replace("###JSON###", json.dumps(json_data))

with open(output_file, mode='w') as html_out:
    html_out.write(html)

# Render a link to the generated file as the cell output.
HTML('<a href="output/chord_diagram_domain_module.html" target="_blank">Open Chord Diagram for Domain Modules</a>')

### Export domain module dependencies (including module-internal ones) as JSON

In [None]:
# Same traversal as for the USES relationships, but without the m1 <> m2
# filter, so dependencies inside a module are counted as well.
count_domain_dependencies = """
    MATCH (m1:Domain:Module)<-[:BELONGS_TO]-(t1:Type)<-[:DEPENDS_ON]-(t2:Type)-[:BELONGS_TO]->(m2:Domain:Module)
    RETURN DISTINCT(m2.name) as module, m1.name as dependent_module, COUNT(t2) as dependencies
"""
domain_module_dependencies = session.run(count_domain_dependencies).to_df()
split_view = domain_module_dependencies.to_dict(orient='split')
json_data = split_view['data']
# Store the rows as an indented JSON file.
with open("output/chord-diagram.json", mode='w') as json_file:
    serialized = json.dumps(json_data, indent=3)
    json_file.write(serialized)
json_data

### Prepare results for dependency analysis

In [None]:
# For each domain module list the distinct modules it uses.
collect_module_imports = """
MATCH (m:Domain:Module)-[:USES]->(m_dep:Domain:Module)
RETURN m.name as name, COLLECT(DISTINCT m_dep.name) as imports
"""

module_imports = session.run(collect_module_imports).to_df()
json_data = module_imports.to_json(orient="records")
# Show only the first 200 characters of the JSON string.
print(json_data[:200])

### Create visualization based on data

In [None]:
from pathlib import Path

output_file = Path("output/domain_modules_dependencies.html")
# Create the target directory if needed so the cell also works on a fresh
# checkout where "output/" does not exist yet.
output_file.parent.mkdir(parents=True, exist_ok=True)

# Fill the template's ###JSON### placeholder with the query result.
with open("vis/template_hierarchical_edge_bundling_d3_inline.html") as html_template:
    html = html_template.read().replace("###JSON###", str(json_data))

with open(output_file, mode='w') as html_out:
    html_out.write(html)

# Render a link to the generated file as the cell output.
HTML('<a href="output/domain_modules_dependencies.html" target="_blank">Domain Modules Dependencies</a>')

## Analyze weird dependencies from Framework to other modules

### List all classes in the Framework module

In [None]:
# List the simple names of all types assigned to the "Framework" domain module.
framework_types_query = """
    MATCH (m1:Domain:Module {name:"Framework"})<-[:BELONGS_TO]-(t1:Type)
    RETURN t1.name as FrameworkType
"""
framework_types = session.run(framework_types_query).to_df()
framework_types

### List dependencies from Framework to domain modules

In [None]:
# For each Framework type list the types it depends on and their domain module.
framework_dependencies_query = """
    MATCH (m1:Domain:Module {name:"Framework"})<-[:BELONGS_TO]-(t1:Type)-[:DEPENDS_ON]->(t2:Type)-[:BELONGS_TO]->(m2:Domain:Module)
    RETURN t1.name as FrameworkType, t2.name as DomainType, m2.name as DomainModule
"""
framework_dependencies = session.run(framework_dependencies_query).to_df()
framework_dependencies