# Data import

## Import dependencies (from jdeps)

Import dataset

In [1]:
import pandas as pd

deps = pd.read_csv("data/spring_petclinic_deps.txt", names=["raw"], sep="\n")
deps.head()

Unnamed: 0,raw
0,spring-petclinic-2.4.5.jar -> not found
1,spring-petclinic-2.4.5.jar -> /usr/lib/jvm/jav...
2,org.springframework.boot.loader.ClassPathIn...
3,org.springframework.boot.loader.ClassPathIn...
4,org.springframework.boot.loader.ClassPathIn...


Normalize data

In [2]:
# class entries begin with three whitespaces
deps = deps[deps['raw'].str.startswith("   ")]
# separates the source from the target
splitted = deps['raw'].str.split("->", n=1, expand=True)
# remove whitespaces from source
deps['from'] = splitted[0].str.strip()
# get the target and the artifact names
splitted_2 = splitted[1].str.split(" ", n=2)
deps['to'] = splitted_2.str[1]
deps['type'] = splitted_2.str[2].str.strip()
deps['name'] = deps['from'].str.split(".").str[-1]
deps.head()

Unnamed: 0,raw,from,to,type,name
2,org.springframework.boot.loader.ClassPathIn...,org.springframework.boot.loader.ClassPathIndex...,java.io.BufferedReader,,ClassPathIndexFile
3,org.springframework.boot.loader.ClassPathIn...,org.springframework.boot.loader.ClassPathIndex...,java.io.File,,ClassPathIndexFile
4,org.springframework.boot.loader.ClassPathIn...,org.springframework.boot.loader.ClassPathIndex...,java.io.FileInputStream,,ClassPathIndexFile
5,org.springframework.boot.loader.ClassPathIn...,org.springframework.boot.loader.ClassPathIndex...,java.io.IOException,,ClassPathIndexFile
6,org.springframework.boot.loader.ClassPathIn...,org.springframework.boot.loader.ClassPathIndex...,java.io.InputStream,,ClassPathIndexFile


Focus on core application

In [3]:
petclinic_deps = deps[
    (
        deps['from'].str.startswith("org.springframework.samples.petclinic") &
        deps['to'].str.startswith("org.springframework.samples.petclinic")
    )
    ].copy()
petclinic_deps.head()

Unnamed: 0,raw,from,to,type,name
882,org.springframework.samples.petclinic.model...,org.springframework.samples.petclinic.model.Na...,org.springframework.samples.petclinic.model.Ba...,spring-petclinic-2.4.5.jar,NamedEntity
887,org.springframework.samples.petclinic.model...,org.springframework.samples.petclinic.model.Pe...,org.springframework.samples.petclinic.model.Ba...,spring-petclinic-2.4.5.jar,Person
908,org.springframework.samples.petclinic.owner...,org.springframework.samples.petclinic.owner.Owner,org.springframework.samples.petclinic.model.Pe...,spring-petclinic-2.4.5.jar,Owner
909,org.springframework.samples.petclinic.owner...,org.springframework.samples.petclinic.owner.Owner,org.springframework.samples.petclinic.owner.Pet,spring-petclinic-2.4.5.jar,Owner
919,org.springframework.samples.petclinic.owner...,org.springframework.samples.petclinic.owner.Ow...,org.springframework.samples.petclinic.owner.Owner,spring-petclinic-2.4.5.jar,OwnerController


## Import coverage data (from JaCoCo)

Import dataset

In [4]:
coverage = pd.read_csv("data/spring_petclinic_production_coverage_data.csv")
coverage.head()

Unnamed: 0,PACKAGE,CLASS,LINE_MISSED,LINE_COVERED
0,org.springframework.samples.petclinic,PetclinicInitializer,0,24
1,org.springframework.samples.petclinic.model,NamedEntity,1,4
2,org.springframework.samples.petclinic.model,Specialty,0,1
3,org.springframework.samples.petclinic.model,PetType,0,1
4,org.springframework.samples.petclinic.model,Vets,4,0


Enrich data

In [5]:
coverage['lines'] = coverage.LINE_COVERED + coverage.LINE_MISSED
coverage['ratio'] = coverage.LINE_COVERED / coverage.lines
coverage.head()

Unnamed: 0,PACKAGE,CLASS,LINE_MISSED,LINE_COVERED,lines,ratio
0,org.springframework.samples.petclinic,PetclinicInitializer,0,24,24,1.0
1,org.springframework.samples.petclinic.model,NamedEntity,1,4,5,0.8
2,org.springframework.samples.petclinic.model,Specialty,0,1,1,1.0
3,org.springframework.samples.petclinic.model,PetType,0,1,1,1.0
4,org.springframework.samples.petclinic.model,Vets,4,0,4,0.0


Normalize data

In [6]:
coverage['fqn'] = coverage["PACKAGE"] + "." + coverage["CLASS"]
coverage.head()

Unnamed: 0,PACKAGE,CLASS,LINE_MISSED,LINE_COVERED,lines,ratio,fqn
0,org.springframework.samples.petclinic,PetclinicInitializer,0,24,24,1.0,org.springframework.samples.petclinic.Petclini...
1,org.springframework.samples.petclinic.model,NamedEntity,1,4,5,0.8,org.springframework.samples.petclinic.model.Na...
2,org.springframework.samples.petclinic.model,Specialty,0,1,1,1.0,org.springframework.samples.petclinic.model.Sp...
3,org.springframework.samples.petclinic.model,PetType,0,1,1,1.0,org.springframework.samples.petclinic.model.Pe...
4,org.springframework.samples.petclinic.model,Vets,4,0,4,0.0,org.springframework.samples.petclinic.model.Vets


## Import source code data

Import data

In [7]:
cloc = pd.read_csv("data/spring_petclinic_cloc.csv")[:-1].copy()
cloc.tail()

Unnamed: 0,language,filename,blank,comment,code,"github.com/AlDanial/cloc v 1.82 T=0.33 s (75.4 files/s, 4843.0 lines/s)"
20,Java,./org/springframework/samples/petclinic/vet/Sp...,5,20,9,
21,Java,./org/springframework/samples/petclinic/visit/...,6,31,9,
22,Java,./org/springframework/samples/petclinic/PetCli...,5,21,9,
23,Java,./org/springframework/samples/petclinic/owner/...,4,18,8,
24,Java,./org/springframework/samples/petclinic/model/...,1,18,1,


Normalize data

In [8]:
cloc['fqn'] = cloc['filename'].str.replace("./", "", regex=False)\
                              .str.replace("/",".", regex=False)\
                              .str.replace(".java","", regex=False)
cloc.head()

Unnamed: 0,language,filename,blank,comment,code,"github.com/AlDanial/cloc v 1.82 T=0.33 s (75.4 files/s, 4843.0 lines/s)",fqn
0,Java,./org/springframework/samples/petclinic/owner/...,18,31,96,,org.springframework.samples.petclinic.owner.Ow...
1,Java,./org/springframework/samples/petclinic/owner/...,23,33,94,,org.springframework.samples.petclinic.owner.Owner
2,Java,./org/springframework/samples/petclinic/owner/...,16,20,77,,org.springframework.samples.petclinic.owner.Pe...
3,Java,./org/springframework/samples/petclinic/owner/...,19,22,71,,org.springframework.samples.petclinic.owner.Pet
4,Java,./org/springframework/samples/petclinic/owner/...,12,31,49,,org.springframework.samples.petclinic.owner.Vi...


In [9]:
loc = cloc[['fqn', 'code', 'comment', 'blank']].dropna().copy()
loc.head()

Unnamed: 0,fqn,code,comment,blank
0,org.springframework.samples.petclinic.owner.Ow...,96,31,18
1,org.springframework.samples.petclinic.owner.Owner,94,33,23
2,org.springframework.samples.petclinic.owner.Pe...,77,20,16
3,org.springframework.samples.petclinic.owner.Pet,71,22,19
4,org.springframework.samples.petclinic.owner.Vi...,49,31,12


# Load data into Neo4J

Esablish connection to Neo4j graph database

In [10]:
from py2neo import Graph
graph = Graph("http://localhost:7474/db/data")
graph

Graph('http://localhost:7474')

## jdeps

In [11]:
petclinic_deps.head()

Unnamed: 0,raw,from,to,type,name
882,org.springframework.samples.petclinic.model...,org.springframework.samples.petclinic.model.Na...,org.springframework.samples.petclinic.model.Ba...,spring-petclinic-2.4.5.jar,NamedEntity
887,org.springframework.samples.petclinic.model...,org.springframework.samples.petclinic.model.Pe...,org.springframework.samples.petclinic.model.Ba...,spring-petclinic-2.4.5.jar,Person
908,org.springframework.samples.petclinic.owner...,org.springframework.samples.petclinic.owner.Owner,org.springframework.samples.petclinic.model.Pe...,spring-petclinic-2.4.5.jar,Owner
909,org.springframework.samples.petclinic.owner...,org.springframework.samples.petclinic.owner.Owner,org.springframework.samples.petclinic.owner.Pet,spring-petclinic-2.4.5.jar,Owner
919,org.springframework.samples.petclinic.owner...,org.springframework.samples.petclinic.owner.Ow...,org.springframework.samples.petclinic.owner.Owner,spring-petclinic-2.4.5.jar,OwnerController


In [12]:
query="""
    UNWIND {deps_data} as dep
    CREATE
        (t:Type)
    SET
        t.fqn = dep.from,
        t.name = dep.name
    RETURN t.fqn, t.name
"""

result = graph.run(query, deps_data=petclinic_deps[['from', 'name']].drop_duplicates().to_dict(orient='records')).data()
pd.DataFrame(result).head()

Unnamed: 0,t.fqn,t.name
0,org.springframework.samples.petclinic.model.Na...,NamedEntity
1,org.springframework.samples.petclinic.model.Pe...,Person
2,org.springframework.samples.petclinic.owner.Owner,Owner
3,org.springframework.samples.petclinic.owner.Ow...,OwnerController
4,org.springframework.samples.petclinic.owner.Ow...,OwnerRepository


Create index for `fqn" for faster queries

In [13]:
query="""
  CREATE INDEX ON :J(fqn)
"""
graph.run(query)

In [14]:
query="""
    UNWIND {deps_data} as dep
    MATCH (from:Type {fqn : dep.from})
    MATCH (to:Type {fqn: dep.to})
    MERGE (from)-[:DEPENDS_ON]->(to)
    RETURN from.fqn, to.fqn
"""

result = graph.run(query, deps_data=petclinic_deps.to_dict(orient='records')).data()
pd.DataFrame(result).head()

Unnamed: 0,from.fqn,to.fqn
0,org.springframework.samples.petclinic.owner.Owner,org.springframework.samples.petclinic.model.Pe...
1,org.springframework.samples.petclinic.owner.Owner,org.springframework.samples.petclinic.model.Pe...
2,org.springframework.samples.petclinic.owner.Owner,org.springframework.samples.petclinic.model.Pe...
3,org.springframework.samples.petclinic.owner.Owner,org.springframework.samples.petclinic.model.Pe...
4,org.springframework.samples.petclinic.owner.Owner,org.springframework.samples.petclinic.owner.Pet


## JaCoCo coverage data

In [15]:
coverage.head()

Unnamed: 0,PACKAGE,CLASS,LINE_MISSED,LINE_COVERED,lines,ratio,fqn
0,org.springframework.samples.petclinic,PetclinicInitializer,0,24,24,1.0,org.springframework.samples.petclinic.Petclini...
1,org.springframework.samples.petclinic.model,NamedEntity,1,4,5,0.8,org.springframework.samples.petclinic.model.Na...
2,org.springframework.samples.petclinic.model,Specialty,0,1,1,1.0,org.springframework.samples.petclinic.model.Sp...
3,org.springframework.samples.petclinic.model,PetType,0,1,1,1.0,org.springframework.samples.petclinic.model.Pe...
4,org.springframework.samples.petclinic.model,Vets,4,0,4,0.0,org.springframework.samples.petclinic.model.Vets


In [16]:
query="""
    UNWIND {coverage_data} as coverage
    MATCH (t:Type {fqn : coverage.fqn})
    MERGE (t)-[:HAS_MEASURE]->(m)
    SET 
        m:Measure:Coverage, 
        m.ratio = coverage.ratio
    RETURN t.fqn as fqn, m.ratio as ratio
"""

result = graph.run(query, coverage_data=coverage.to_dict(orient='records')).data()
pd.DataFrame(result).head()

Unnamed: 0,fqn,ratio
0,org.springframework.samples.petclinic.model.Na...,0.8
1,org.springframework.samples.petclinic.model.Na...,0.8
2,org.springframework.samples.petclinic.model.Pe...,1.0
3,org.springframework.samples.petclinic.model.Pe...,1.0


## cloc data

In [17]:
loc.head()

Unnamed: 0,fqn,code,comment,blank
0,org.springframework.samples.petclinic.owner.Ow...,96,31,18
1,org.springframework.samples.petclinic.owner.Owner,94,33,23
2,org.springframework.samples.petclinic.owner.Pe...,77,20,16
3,org.springframework.samples.petclinic.owner.Pet,71,22,19
4,org.springframework.samples.petclinic.owner.Vi...,49,31,12


In [18]:
query="""
    UNWIND {cloc_data} as loc
    MATCH (t:Type {fqn : loc.fqn})
    SET
        t.lines = loc.code,
        t.comments = loc.comment,
        t.blanks = loc.blank
    RETURN t.fqn, t.name, t.lines, t.comments, t.blanks
"""

result = graph.run(query, cloc_data=loc.to_dict(orient='records')).data()
pd.DataFrame(result).head()

Unnamed: 0,t.fqn,t.name,t.lines,t.comments,t.blanks
0,org.springframework.samples.petclinic.owner.Ow...,OwnerController,96,31,18
1,org.springframework.samples.petclinic.owner.Ow...,OwnerController,96,31,18
2,org.springframework.samples.petclinic.owner.Owner,Owner,94,33,23
3,org.springframework.samples.petclinic.owner.Owner,Owner,94,33,23
4,org.springframework.samples.petclinic.owner.Pe...,PetController,77,20,16
