#### SDM Course Project
# Preprocessing
*Darryl Abraham, Riccardo Paciello*

#### Import Libraries

In [1]:
import pandas as pd
from rdflib import Graph
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import scipy
from graphviz import Digraph

#### Import Data

In [2]:
def parse_turtle_file(turtle_file):
    """Read a Turtle file and return its triples as tuples of strings.

    Parameters
    ----------
    turtle_file : source handed to ``Graph.parse`` with ``format="turtle"``
        (the notebook passes a file path).

    Returns
    -------
    list[tuple[str, str, str]]
        One ``(subject, predicate, object)`` tuple per triple, in the order
        the graph yields them. Each term is converted with ``str()``, so only
        its string form is kept.
    """
    graph = Graph()
    graph.parse(turtle_file, format="turtle")
    return [(str(s), str(p), str(o)) for s, p, o in graph]

In [3]:
# Load the Turtle export and arrange its triples as one row per triple.
turtle_file = "data/data.ttl"  # relative path: resolved against the current working directory
triples = parse_turtle_file(turtle_file)
# Column order matches the (subject, predicate, object) tuples returned above.
df = pd.DataFrame(triples, columns=["head", "relation", "tail"])

#### Preprocessing

In [4]:
# Preview the first rows of the freshly loaded triples.
df.head()

Unnamed: 0,head,relation,tail
0,http://sdmlab2.org/Paper/16212,http://sdmlab2.org/hasKeyword,http://sdmlab2.org/Keyword/36850
1,http://sdmlab2.org/Paper/10811,http://sdmlab2.org/citedBy,http://sdmlab2.org/Paper/15323
2,http://sdmlab2.org/Paper/14094,http://sdmlab2.org/citedBy,http://sdmlab2.org/Paper/14575
3,http://sdmlab2.org/Paper/13182,http://sdmlab2.org/citedBy,http://sdmlab2.org/Paper/14136
4,http://sdmlab2.org/Author/27105,http://sdmlab2.org/hasName,Reuben D. Budiardja


In [5]:
# Index of the rows in which any column holds a W3C vocabulary URI.
# This is computed before the URIs are shortened, because shortening can
# remove the 'http://www.w3.org/' prefix the check relies on.
# The prefix test runs once per column (vectorised) and is then reduced across
# each row, instead of calling a Python lambda for every row.
xid = df[df.apply(lambda col: col.str.startswith('http://www.w3.org/')).any(axis=1)].index

In [6]:
def shorten_uri(uri):
    """Return a compact label for an RDF term given as a string.

    Values that start with ``http://`` or ``https://`` are treated as URIs:
    the project namespace ``http://sdmlab2.org/`` is stripped and, if the URI
    contains ``#``, only the part after the last ``#`` is kept.
    Every other value is treated as a literal and returned unchanged, so text
    that happens to contain ``#`` (e.g. ``"C# in Depth"``) is not truncated.

    Parameters
    ----------
    uri : str
        String form of a subject, predicate or object.

    Returns
    -------
    str
        The shortened URI, or the input itself for non-URI values.
    """
    # Literals (names, titles, ...) must pass through untouched.
    if not uri.startswith(("http://", "https://")):
        return uri
    uri = uri.removeprefix("http://sdmlab2.org/")
    return uri.split('#')[-1]

# Replace every triple column with its shortened form.
for column in ('head', 'relation', 'tail'):
    df[column] = df[column].apply(shorten_uri)

In [7]:
# Preview the same rows again after the URIs have been shortened.
df.head()

Unnamed: 0,head,relation,tail
0,Paper/16212,hasKeyword,Keyword/36850
1,Paper/10811,citedBy,Paper/15323
2,Paper/14094,citedBy,Paper/14575
3,Paper/13182,citedBy,Paper/14136
4,Author/27105,hasName,Reuben D. Budiardja


In [8]:
# Inspect the rows recorded in `xid`, i.e. those that held a W3C vocabulary URI before shortening.
df.loc[xid].head()

Unnamed: 0,head,relation,tail
4883,publishedInVolume,range,Volume
17161,hasText,domain,Review
20906,wroteReview,subPropertyOf,Authored
30835,relevantAuthorOf,type,Property
32681,heldIn,type,Property


In [9]:
# Persist the shortened triples; index=False keeps the row index out of the file.
df.to_csv("data/preprocessed.csv", index=False)

#### Visualise the graph

In [10]:
def create_graph(df):
    """Build a directed graph from a frame of (head, relation, tail) triples.

    Each row adds an edge ``head -> tail`` whose ``label`` attribute is the
    row's ``relation``.

    Note: ``nx.DiGraph`` stores a single edge per ordered node pair, so when
    several rows share the same head and tail the label of the last such row
    is the one that remains.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``head``, ``relation`` and ``tail``.

    Returns
    -------
    networkx.DiGraph
    """
    graph = nx.DiGraph()
    graph.add_edges_from(
        (source, target, {"label": relation})
        for source, relation, target in zip(df["head"], df["relation"], df["tail"])
    )
    return graph

G = create_graph(df)

In [11]:
def plot_graph(G, seed=42):
    """Draw the graph with node labels and edge labels, then show the figure.

    Parameters
    ----------
    G : networkx graph
        Graph whose edges carry a ``label`` attribute (used as edge text).
    seed : int or None, default 42
        Passed to ``nx.spring_layout`` so that the node positions, and hence
        the figure, are the same on every run. Pass ``None`` to get a fresh
        random layout each time.
    """
    # The spring layout starts from random positions; fixing the seed keeps
    # the rendered figure reproducible.
    pos = nx.spring_layout(G, seed=seed)
    nx.draw(G, pos, with_labels=True, node_size=1500, node_color="skyblue", font_size=10, font_weight="bold", arrows=True)

    # Add edge labels
    edge_labels = nx.get_edge_attributes(G, 'label')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)

    plt.title("Knowledge Graph Visualization")
    plt.show()
    
#plot_graph(G)