In [2]:
import pandas as pd
from tqdm import tqdm
from neo4j import GraphDatabase
import json
import re
import os
import glob

In [4]:

# Merge every API comic-list CSV matching the pattern into ./Datasets/merged.csv.
path = "./"
# Sort the matches: glob returns files in arbitrary order, which made the row
# order of merged.csv differ between runs.
all_files = sorted(glob.glob(os.path.join(path, "./Datasets/api_comic_list*.csv")))
if not all_files:
    # pd.concat on an empty sequence fails with an unhelpful message.
    raise FileNotFoundError("No ./Datasets/api_comic_list*.csv files found to merge")
df_from_each_file = (pd.read_csv(f, sep=',') for f in all_files)
df_merged = pd.concat(df_from_each_file, ignore_index=True)
# index=False: the positional index is meaningless and would otherwise come
# back as an extra "Unnamed: 0" column when merged.csv is re-read.
df_merged.to_csv("./Datasets/merged.csv", index=False)

In [3]:
# Neo4j connection settings. Both values can be overridden from the environment
# so the notebook does not have to be edited per machine.
db_host = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
# NOTE(review): the fallback keeps the notebook's original local password so
# existing setups still run; set NEO4J_PASSWORD and remove the fallback before
# sharing or committing this notebook.
password = os.environ.get("NEO4J_PASSWORD", "1234qwer")


In [3]:
# Load the three input tables in one pass: API characters, scraped characters
# and the merged API comics list (read in that order).
api_characters, scraping_characters, api_comics = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./Datasets/api_character_list.csv",
        "./Datasets/scraping_personaggi_completo_filtered_relations.csv",
        "./Datasets/merged.csv",
    )
)


In [None]:
# Preview the first 20 rows of the scraped character table.
scraping_characters.head(20)

In [None]:
# Preview the first 20 rows of the API character table.
api_characters.head(20)

Add character nodes and their variants from the API

In [10]:
def add_comic_node(tx, comic_title,comic_id, comic_description):
  """Merge a `comic` node identified by title, id and description.

  Values are sent as Cypher parameters instead of being formatted into the
  query text, so a title containing a double quote or backslash can no longer
  break (or alter) the statement.

  Args:
    tx: Neo4j transaction (anything exposing `run(query, parameters)`).
    comic_title: Comic title; stored as `str(comic_title)`.
    comic_id: Numeric comic id; stored as an integer.
    comic_description: Description; single quotes and hyphens become spaces
      and double quotes are dropped, as before, so stored values are unchanged.

  Raises:
    ValueError/TypeError: if `comic_id` cannot be converted to an integer.
  """
  comic_description = str(comic_description).replace(
      "'", " ").replace('"', "").replace("-", " ").strip()
  tx.run(
      "MERGE (c:comic {title: $title, comic_id: $comic_id, "
      "comic_description: $comic_description})",
      {
          "title": str(comic_title),
          # int() also turns numpy integers from pandas into plain ints.
          "comic_id": int(comic_id),
          "comic_description": comic_description,
      },
  )


In [5]:
# Open the Neo4j driver used by the cells below, authenticating as user "neo4j".
driver = GraphDatabase.driver(db_host, auth=("neo4j",password))

# Aggiunta personaggi ottenuti dalla web api

In [8]:
def add_character_node(tx, name, character_id, character_description):
  """Merge an API character: base node, variant node and the link between them.

  The base `character` is keyed by the lower-cased name with any
  parenthesised/bracketed suffix removed; the `character_variant` keeps the
  full name plus the API id and description, and gets a `variante_di`
  relationship to the base node.

  The previous version merged the variant with unstripped values but looked it
  up with stripped ones, so the relationship was silently skipped whenever the
  name or description had leading/trailing whitespace. Values are now
  normalised once and sent as Cypher parameters.

  Args:
    tx: Neo4j transaction (anything exposing `run(query, parameters)`).
    name: Character name as returned by the API (must be a string).
    character_id: Numeric API id; stored as an integer.
    character_description: Description; stringified, with quotes and hyphens
      replaced by spaces as before.
  """
  # Base name: drop double quotes, cut "(...)" / "[...]" groups, lower-case.
  without_brackets = re.sub(
      r"[\(\[].*?[\)\]]", "", name.replace('"', " ").strip())
  base_name = without_brackets.replace("'", " ").lower().strip()

  variant_name = name.replace("'", " ").replace(
      '"', " ").replace("-", " ").strip()
  description = str(character_description).replace("'", " ").replace(
      '"', " ").replace("-", " ").strip()

  tx.run(
      "MERGE (a:character {name: $base_name}) "
      "MERGE (b:character_variant {name: $variant_name, "
      "character_id: $character_id, character_description: $description}) "
      "MERGE (b)-[r:variante_di]->(a)",
      {
          "base_name": base_name,
          "variant_name": variant_name,
          "character_id": int(character_id),
          "description": description,
      },
  )


In [65]:

# Insert every API character. A single session is reused for the whole loop
# (the previous version opened and closed one per row); each row still runs in
# its own transaction and a failing row is reported without stopping the load.
with driver.session() as session:
    for idx, row in tqdm(api_characters.iterrows(), total=api_characters.shape[0]):
        try:
            session.write_transaction(
                add_character_node, row['name'], row["id"], row["description"])
        except Exception as e:
            print(e)

100%|██████████| 1559/1559 [01:02<00:00, 24.76it/s]


# Inserimento fumetti ottenuti dalla web api e collegamento con gli eventuali personaggi

In [9]:
def add_comic_node_and_link(tx, comic_title, comic_id, character_id, comic_description,  format=None, page_count=None, isbn=None, upc=None, diamond_code=None, ean=None, issn=None):
  """Merge a `comic` node and link it to the variant with `character_id`.

  The comic is merged on `comic_id` and its other properties are then set, so
  repeated calls for the same comic (one per character) always hit the same
  node. The publication fields are optional: the existing insertion cell
  passes only title, id, character id and description, which previously
  raised TypeError on every call. All values travel as Cypher parameters.

  Args:
    tx: Neo4j transaction (anything exposing `run(query, parameters)`).
    comic_title: Comic title; stored as `str(comic_title)`.
    comic_id: Numeric comic id; stored as an integer.
    character_id: Numeric id of the `character_variant` to link (int or
      numeric string).
    comic_description: Description; quotes/hyphens are normalised as before.
    format, isbn, upc, diamond_code, ean, issn: Optional text properties;
      left unset when None.
    page_count: Optional page count; stored as an integer when it parses as a
      number, otherwise left unset.
  """
  props = {
      "title": str(comic_title),
      "comic_description": str(comic_description).replace(
          "'", " ").replace('"', "").replace("-", " ").strip(),
  }
  optional_text = {"format": format, "isbn": isbn, "upc": upc,
                   "diamond_code": diamond_code, "ean": ean, "issn": issn}
  for key, value in optional_text.items():
    if value is not None:
      props[key] = str(value)
  if page_count is not None:
    try:
      props["page_count"] = int(float(page_count))
    except (TypeError, ValueError, OverflowError):
      pass  # NaN / non-numeric page counts are left unset

  comic_id = int(comic_id)
  tx.run(
      "MERGE (c:comic {comic_id: $comic_id}) SET c += $props",
      {"comic_id": comic_id, "props": props},
  )
  tx.run(
      "MATCH (a:character_variant {character_id: $character_id}), "
      "(b:comic {comic_id: $comic_id}) MERGE (a)-[r:Presente]->(b)",
      {"character_id": int(character_id), "comic_id": comic_id},
  )


In [8]:
def update_comic(tx, comic_id, format, page_count, isbn, upc, diamond_code,ean, issn):
  """Set the publication properties on the `comic` node(s) with `comic_id`.

  Values are passed as Cypher parameters rather than formatted into the query,
  so quotes in any field no longer break the statement.

  Args:
    tx: Neo4j transaction (anything exposing `run(query, parameters)`).
    comic_id: Numeric comic id used to find the node.
    format, isbn, upc, diamond_code, ean, issn: Stored as strings.
    page_count: Stored as an integer when it parses as a number. A value such
      as "nan" used to be formatted into the query as a bare token; it now
      clears the property instead (setting a property to null removes it).
  """
  try:
    pages = int(float(page_count))
  except (TypeError, ValueError, OverflowError):
    pages = None
  tx.run(
      "MATCH (c:comic {comic_id: $comic_id}) "
      "SET c.format = $format, c.page_count = $page_count, c.isbn = $isbn, "
      "c.upc = $upc, c.diamond_code = $diamond_code, c.ean = $ean, "
      "c.issn = $issn",
      {
          "comic_id": int(comic_id),
          "format": str(format),
          "page_count": pages,
          "isbn": str(isbn),
          "upc": str(upc),
          "diamond_code": str(diamond_code),
          "ean": str(ean),
          "issn": str(issn),
      },
  )


In [9]:
# Push the publication metadata of every comic row onto its existing node.
# One session serves the whole loop; a failing row is printed and skipped.
with driver.session() as session:
  for idx, row in tqdm(api_comics.iterrows(), total=len(api_comics)):
    try:
        # Same column order as update_comic's positional parameters.
        text_fields = [
            str(row[column])
            for column in ("format", "pageCount", "isbn", "upc",
                           "diamondCode", "ean", "issn")
        ]
        session.write_transaction(update_comic, row["id"], *text_fields)
    except Exception as e:
        print(e)



100%|██████████| 50645/50645 [1:11:46<00:00, 11.76it/s]  


In [None]:
# Insert every comic and link it to the characters listed in its "characters"
# JSON column. Changes from the previous version:
#   * add_comic_node_and_link now receives the publication fields its
#     signature requires (the old 4-argument call raised TypeError every time);
#   * comics without characters get the same fields through update_comic;
#   * a malformed "characters" cell is reported and treated as "no characters"
#     instead of crashing the cell;
#   * one session is reused instead of opening one per character.
with driver.session() as session:
    for idx, row in tqdm(api_comics.iterrows(), total=api_comics.shape[0]):
        try:
            character_items = json.loads(row["characters"]).get("items") or []
        except Exception as e:
            print(e)
            character_items = []

        try:
            # Same column order as the trailing parameters of
            # add_comic_node_and_link and update_comic.
            metadata = [
                str(row[column])
                for column in ("format", "pageCount", "isbn", "upc",
                               "diamondCode", "ean", "issn")
            ]
            if not character_items:
                session.write_transaction(
                    add_comic_node, row['title'], row["id"], row["description"])
                session.write_transaction(update_comic, row["id"], *metadata)
                continue
        except Exception as e:
            print(e)
            continue

        for character in character_items:
            try:
                # The character id is the last path segment of the resource URI.
                character_id = character["resourceURI"].rsplit("/", 1)[-1]
                session.write_transaction(
                    add_comic_node_and_link, row['title'], row["id"],
                    character_id, row["description"], *metadata)
            except Exception as e:
                print(e)

# Inserimento personaggi da web scraping

Se un personaggio base è già presente (cioè era nella lista dei personaggi ottenuta dalla web API), viene creata o aggiornata la variante del personaggio; se invece non è presente, vengono creati un personaggio base e la corrispondente variante.

In [5]:
def character_present(tx, character):
  """Return True when a `character` node named `character` exists.

  The name is passed as a Cypher parameter (it used to be formatted into the
  query, so an apostrophe in the name broke it). LIMIT 1 keeps `single()`
  from seeing more than one record when several nodes share the name.

  Args:
    tx: Neo4j transaction (anything exposing `run(query, parameters)`).
    character: Exact value of the `name` property to look for.
  """
  res = tx.run(
      "MATCH (n:character) WHERE n.name = $name RETURN n LIMIT 1",
      {"name": character},
  )
  return res.single() is not None


In [9]:
def add_character_node(tx, name ):
    """Merge a base `character` node flagged `wiki: true`.

    NOTE(review): this redefines the three-argument `add_character_node`
    declared earlier in the notebook for API characters. Once this cell has
    run, the API insertion cell can no longer be re-run without re-executing
    the earlier definition.

    Args:
        tx: Neo4j transaction (anything exposing `run(query, parameters)`).
        name: Base character name; passed as a Cypher parameter.
    """
    tx.run("MERGE (c:character {name: $name, wiki: true})", {"name": name})

def add_variant_node(tx, name, variant_name, biography):
    """Merge a wiki `character_variant` and link it to its base character.

    The variant is merged on its name, then its biography is set and it is
    flagged `wiki: true`; finally a `variante_di` relationship to every
    `character` named `name` is merged. The biography is sent as a Cypher
    parameter, replacing the hand-written backslash escaping of quotes and
    hyphens.

    Args:
        tx: Neo4j transaction (anything exposing `run(query, parameters)`).
        name: Name of the base `character` node.
        variant_name: Name of the `character_variant` node.
        biography: Biography; stored as `str(biography).strip()`.
    """
    tx.run(
        "MERGE (b:character_variant {name: $variant_name}) "
        "SET b.biography = $biography, b.wiki = true",
        {"variant_name": variant_name, "biography": str(biography).strip()},
    )
    tx.run(
        "MATCH (a:character {name: $name}), "
        "(b:character_variant {name: $variant_name, wiki: true}) "
        "MERGE (b)-[r:variante_di]->(a)",
        {"name": name, "variant_name": variant_name},
    )


In [None]:
# Insert the scraped (wiki) characters. For each row the base name is the
# lower-cased "Nome" without quotes and without any "(...)" / "[...]" group.
# If a base character with that name already exists, only the variant is
# merged; otherwise the base character is created first.
driver = GraphDatabase.driver(db_host, auth=("neo4j",password))
num_present = 0
character_data = pd.read_csv(
    "./Datasets/scraping_personaggi_completo_filtered_relations.csv")
with driver.session() as session:
    for idx, row in tqdm(character_data.iterrows(), total=character_data.shape[0]):
        # Per-row error handling: the previous loop-wide try/except stopped the
        # whole import at the first bad row.
        try:
            name = row["Nome"].replace('"', " ").replace("'", " ")
            nome_senza_parentesi = re.sub(
                r"[\(\[].*?[\)\]]", "", name.strip()).strip().lower()
            present = session.write_transaction(
                character_present, nome_senza_parentesi)
            if present:
                num_present += 1
            else:
                session.write_transaction(add_character_node, nome_senza_parentesi)
            # The variant is merged in both cases.
            session.write_transaction(
                add_variant_node, nome_senza_parentesi, name, row["Biografia"])
        except Exception as e:
            print(e)
# Number of scraped characters whose base character already existed.
print(num_present)

In [5]:
# Overwrite the biography of each wiki variant with the "Processed_Biography"
# column. Name and biography are sent as query parameters (an apostrophe in
# the biography used to break the string-built query), rows are handled
# individually so one failure no longer stops the loop, and rows without a
# processed biography are skipped instead of storing the text "nan".
driver = GraphDatabase.driver(db_host, auth=("neo4j", password))
num_present = 0
character_data = pd.read_csv(
    "./Datasets/scraping_personaggi_completo_filtered_relations.csv")
with driver.session() as session:
    for idx, row in tqdm(character_data.iterrows(), total=character_data.shape[0]):
        try:
            biography = row["Processed_Biography"]
            if pd.isna(biography):
                continue
            name = row["Nome"].replace('"', " ").replace("'", " ")
            session.run(
                "MATCH (n:character_variant) WHERE n.name = $name "
                "SET n.biography = $biography",
                name=name, biography=str(biography))
        except Exception as e:
            print(e)


100%|██████████| 3746/3746 [02:44<00:00, 22.82it/s]


# Inserimento dati film
Inserimento dei dati nel database, i personaggi che hanno film nei dati dei personaggi sono collegati ai rispettivi nodi film

In [21]:
# character_name and movie_title must match the stored node names: the loading
# cells store them with " and ' replaced by whitespace (and, for titles, also
# "-" replaced and the result trimmed).
def add_movie_node_and_link_character(tx, movie_title, character_name):
  """Merge an `in_film` relationship from a character variant to a movie.

  Nothing is created when either node is missing. Both names are passed as
  Cypher parameters rather than formatted into the query text.

  Args:
    tx: Neo4j transaction (anything exposing `run(query, parameters)`).
    movie_title: `title` of the `movie` node.
    character_name: `name` of the `character_variant` node.
  """
  tx.run(
      "MATCH (m:movie {title: $movie_title}), "
      "(c:character_variant {name: $character_name}) "
      "MERGE (c)-[r:in_film]->(m)",
      {"movie_title": movie_title, "character_name": character_name},
  )

# movie_title must have ", ' and - replaced with whitespace and be trimmed.
def add_movie(tx, movie_title, release_date, runtime, box_office, synopsis):
  """Merge a `movie` node on its title and set its descriptive properties.

  Values are passed as Cypher parameters. Missing (None) fields are left
  unset; they used to be stored as the literal text 'None'. A call without a
  title is ignored for the same reason.

  Args:
    tx: Neo4j transaction (anything exposing `run(query, parameters)`).
    movie_title: Title used as the merge key.
    release_date: Release date text, or None.
    runtime: Runtime text (stored as `screen_time`), or None.
    box_office: Box-office text, or None.
    synopsis: Synopsis text, or None. The loading cell in this notebook
      backslash-escapes `-`, `'` and `"` for the old string-built query;
      those escapes are undone here so the stored text stays clean whether
      or not the caller still escapes.
  """
  if not movie_title:
    return
  if synopsis is not None:
    synopsis = str(synopsis).replace("\\-", "-").replace(
        "\\'", "'").replace('\\"', '"')
  candidate_props = {
      "release_date": release_date,
      "screen_time": runtime,
      "box_office": box_office,
      "synopsis": synopsis,
  }
  props = {key: str(value)
           for key, value in candidate_props.items() if value is not None}
  tx.run(
      "MERGE (n:movie {title: $title}) SET n += $props",
      {"title": str(movie_title), "props": props},
  )
  

In [19]:
import numpy as np
driver = GraphDatabase.driver(db_host, auth=("neo4j", password))
num_present = 0


def _stripped_or_none(value):
    """Return the stripped text of a CSV cell, or None when the cell is missing."""
    if pd.isna(value):
        return None
    return str(value).strip()


# Insert one `movie` node per row of the scraped film table. Rows are handled
# individually (the previous loop-wide try/except stopped at the first bad
# row) and rows without a title are skipped.
film_data = pd.read_csv(
    "./Datasets/film_data_scraping.csv")
with driver.session() as session:
    for idx, row in tqdm(film_data.iterrows(), total=film_data.shape[0]):
        try:
            film_title = _stripped_or_none(row["Title"])
            if not film_title:
                continue
            film_release_date = _stripped_or_none(row["releaseDate"])
            film_runtime = _stripped_or_none(row["Runtime"])
            film_box_office = _stripped_or_none(row["BoxOffice"])
            # "Synopsis" holds a JSON list; only its first element is stored.
            film_synopsis = None if pd.isna(row["Synopsis"]) else json.loads(
                row["Synopsis"])[0]

            # Titles are stored without quotes and hyphens.
            film_title = film_title.replace('"', " ").replace(
                "'", " ").replace("-", " ").strip()
            # Backslash-escape characters for the string-built query. "\\-" is
            # the same text the old "\-" literal produced, written as a valid
            # Python escape.
            if film_synopsis:
                film_synopsis = film_synopsis.replace("-", "\\-").replace(
                    "'", "\\'").replace('"', '\\"').strip()
            session.write_transaction(
                add_movie, film_title, film_release_date, film_runtime,
                film_box_office, film_synopsis)
        except Exception as e:
            print(e)


100%|██████████| 42/42 [00:04<00:00, 10.24it/s]

0





### Creazione relazioni fra personaggi e film

In [22]:
# Link each scraped character variant to the movies listed in its "Film" JSON
# column. Names are normalised the same way as when the nodes were stored.
# Rows are handled individually: the previous loop-wide try/except stopped all
# linking at the first malformed row.
driver = GraphDatabase.driver(db_host, auth=("neo4j", password))
num_present = 0
character_data = pd.read_csv(
    "./Datasets/scraping_personaggi_completo_filtered_relations.csv")
with driver.session() as session:
    for idx, row in tqdm(character_data.iterrows(), total=character_data.shape[0]):
        try:
            films = json.loads(row["Film"])
            name = row["Nome"].replace('"', " ").replace("'", " ")
            for film in films:
                # The first element of each entry is the film title.
                film_name = film[0].replace('"', " ").replace(
                    "'", " ").replace("-", " ").strip()
                session.write_transaction(
                    add_movie_node_and_link_character, film_name, name)
        except Exception as e:
            print(e)


100%|██████████| 3746/3746 [00:56<00:00, 66.25it/s] 


# Aggiunta delle serie nel database


Creazione, per ogni serie, di un nodo di tipo *tv_show* contenente le informazioni ottenute, e collegamento ai nodi *character_variant* corrispondenti.

---

### Definizione delle funzioni
Definizione delle funzioni utilizzate per l'inserimento nel database.

In [32]:
import re
def escape_for_neo4j(string):
  """Backslash-escape `"`, `'` and `-` in `string`, then trim whitespace."""
  escaped = string
  for special in ('"', "'", "-"):
    escaped = escaped.replace(special, "\\" + special)
  return escaped.strip()
def create_node_tv_serie(tx, title, creators, showrunners, producers, composers, release_date):
  """Merge a `tv_show` node carrying its title, crew lists and release date.

  Each crew argument is a sequence of entries whose first element is the
  person's name. Everything is passed as Cypher parameters. The previous
  version spliced the `repr` of Python lists of escaped strings into the
  query, which stored stray backslashes for names containing `.`, `-` or
  quotes.

  Args:
    tx: Neo4j transaction (anything exposing `run(query, parameters)`).
    title: Show title; stored trimmed.
    creators, showrunners, producers, composers: Sequences of entries; the
      trimmed first element of each entry is stored.
    release_date: Release date text; stored trimmed.
  """
  def first_names(entries):
    # Keep only the name (first element) of every entry.
    return [str(entry[0]).strip() for entry in entries]

  tx.run(
      "MERGE (n:tv_show {title: $title, creators: $creators, "
      "showrunners: $showrunners, producers: $producers, "
      "composers: $composers, release_date: $release_date})",
      {
          "title": title.strip(),
          "creators": first_names(creators),
          "showrunners": first_names(showrunners),
          "producers": first_names(producers),
          "composers": first_names(composers),
          "release_date": release_date.strip(),
      },
  )
#name = name.replace('"', " ").replace("'", " ")
def link_character_and_series(tx, character_name, serie_name):
  """Merge an `in_serie` relationship from a character variant to a tv show.

  Nothing is created when either node is missing. Names are passed as Cypher
  parameters.

  Args:
    tx: Neo4j transaction (anything exposing `run(query, parameters)`).
    character_name: Character name; `"` and `'` are replaced with spaces to
      match how variant names are stored.
    serie_name: Show title. The linking cell in this notebook passes titles
      already run through `escape_for_neo4j`; those backslash escapes are
      undone here (a no-op for plain titles) so the title compares equal to
      the stored one.
  """
  character_name = character_name.replace('"', " ").replace("'", " ")
  serie_name = serie_name.replace('\\"', '"').replace(
      "\\'", "'").replace("\\-", "-").strip()
  tx.run(
      "MATCH (c:character_variant {name: $character_name}), "
      "(s:tv_show {title: $serie_name}) MERGE (c)-[r:in_serie]->(s)",
      {"character_name": character_name, "serie_name": serie_name},
  )


### Add series nodes

In [33]:
tv_series_dataframe = pd.read_csv("./Datasets/tv_series.csv")

# Create one tv_show node per row; the four crew columns hold JSON lists.
driver = GraphDatabase.driver(db_host, auth=("neo4j", password))
with driver.session() as session:
  try:
    for idx, row in tqdm(tv_series_dataframe.iterrows(), total=len(tv_series_dataframe)):
      try:
        # Decoded in the positional order create_node_tv_serie expects.
        crew_lists = [
            json.loads(row[column])
            for column in ("Creators", "Showrunners", "Producers", "Composers")
        ]
        session.write_transaction(
            create_node_tv_serie, row["Title"], *crew_lists, str(row["releaseDate"]))
      except Exception as e:
        print(e)
  except Exception as e:
    print(e)


100%|██████████| 19/19 [00:02<00:00,  6.79it/s]


### Link characters with series

In [None]:
characters_dataframe = pd.read_csv(
    "./Datasets/scraping_personaggi_completo_filtered_relations.csv")

# Link every scraped character to the shows listed in its "Serie" JSON column.
# Titles are passed through unmodified: link_character_and_series prepares the
# title itself, and escaping it here as well (as the previous version did)
# meant titles containing ' or - never matched a stored show.
driver = GraphDatabase.driver(db_host, auth=("neo4j", password))
error_log = []
with driver.session() as session:
  for idx, row in tqdm(characters_dataframe.iterrows(), total=characters_dataframe.shape[0]):
    try:
      character_name = row["Nome"]
      for serie in json.loads(row["Serie"]):
        # The first element of each entry is the show title.
        session.write_transaction(
            link_character_and_series, character_name, serie[0])
    except Exception as e:
      error_log.append(e)
print(error_log)


## Link characters using relations found on wiki

In [16]:
def create_relation(tx, first_character, second_character, relation_type):
  """Merge a `knows` relationship (with a `type`) between two variants.

  Nothing is created when either variant is missing. All values are passed as
  Cypher parameters; `relation_type` used to be formatted into the query
  unescaped, so an apostrophe in it broke the statement.

  Args:
    tx: Neo4j transaction (anything exposing `run(query, parameters)`).
    first_character: Name of the source `character_variant`.
    second_character: Name of the target `character_variant`.
    relation_type: Relationship description; stored as text in `type`.
  """
  tx.run(
      "MATCH (c:character_variant {name: $first_character}), "
      "(o:character_variant {name: $second_character}) "
      "MERGE (c)-[r:knows {type: $relation_type}]->(o)",
      {
          "first_character": first_character,
          "second_character": second_character,
          "relation_type": str(relation_type),
      },
  )
  

In [18]:
characters_dataframe = pd.read_csv(
    "./Datasets/scraping_personaggi_completo_filtered_relations.csv")
driver = GraphDatabase.driver(db_host, auth=("neo4j", password))
import numpy as np

# Create `knows` relationships from the "Filtered_relazioni" JSON column. Each
# entry is [name] or [name, type]; a name may hold several characters
# separated by "/". The "†" marker and quotes are removed from names.
with driver.session() as session:
    for idx, row in tqdm(characters_dataframe.iterrows(), total=characters_dataframe.shape[0]):
        # Per-row error handling: one malformed row no longer stops the rest.
        try:
            raw_relations = row["Filtered_relazioni"]
            # A missing cell is read as NaN, which means "no relations".
            relazioni = json.loads(raw_relations) if isinstance(raw_relations, str) else []
            if not relazioni:
                continue
            first_name = row["Nome"].replace("†", "").replace(
                '"', " ").replace("'", " ").strip()
            for relation in relazioni:
                rel_type = relation[1] if len(relation) == 2 else ""
                cleaned = relation[0].replace("†", "").replace(
                    '"', " ").replace("'", " ")
                for other_name in cleaned.split("/"):
                    # Strip each part: "A / B" used to yield "A " and " B",
                    # which never matched a stored name.
                    other_name = other_name.strip()
                    if other_name:
                        session.write_transaction(
                            create_relation, first_name, other_name, rel_type)
        except Exception as e:
            print(e)


100%|██████████| 3746/3746 [00:11<00:00, 322.25it/s]


In [None]:
def test(tx):
  res =  tx.run("match (n:character_variant {wiki: true}) return n limit 1000")
  for result in res:
    print(result)
# Run the inspection helper `test` against the database.
# NOTE(review): `test` only reads, yet it is dispatched through write_transaction.
driver = GraphDatabase.driver(db_host, auth=("neo4j", password))
with driver.session() as session:
  session.write_transaction(test)