In [29]:
import pandas as pd
import re

# -------------------------------------------------------------------
# 1. PARSE amazon-meta.txt TO EXTRACT Id, ASIN, title, group
# -------------------------------------------------------------------

meta_file = "/Users/michelebaldo/Desktop/amazon-meta.txt"

# Line tags whose value is copied verbatim into the record being built.
# "Id:" is handled separately because it also marks the start of a new record.
FIELD_TAGS = ("ASIN:", "title:", "group:")

records = []
current = {}

with open(meta_file, "r", encoding="latin1") as f:
    for line in f:
        line = line.strip()

        # An "Id:" line opens a new record; store the one built so far first.
        if line.startswith("Id:"):
            if "Id" in current:
                records.append(current)
            current = {"Id": int(line.split()[1])}
            continue

        for tag in FIELD_TAGS:
            if line.startswith(tag):
                # Remove only the leading tag. str.replace(tag, "") would also
                # delete later occurrences inside the value, e.g. a title that
                # itself contains the text "title:" (as in "Subtitle:").
                current[tag[:-1]] = line[len(tag):].strip()
                break

# The last record is not followed by another "Id:" line, so store it here.
if "Id" in current:
    records.append(current)

# Fix the column set so the later steps find Id/ASIN/title/group even when
# some field never occurred in the file; absent values become NaN.
meta_df = pd.DataFrame(records, columns=["Id", "ASIN", "title", "group"])

print("Metadati caricati:", meta_df.shape)
print(meta_df.head())

# -------------------------------------------------------------------
# 2. LOAD THE EDGE LIST FROM Amazon0505.csv
# -------------------------------------------------------------------

# Tab-separated pairs; anything after a "#" is treated as a comment, and the
# two columns are named explicitly because names= is supplied.
edges = pd.read_csv(
    "/Users/michelebaldo/Desktop/Amazon0505.csv",
    sep="\t",
    comment="#",
    names=["source", "target"],
)

print("Archi caricati:", edges.shape)
print(edges.head())

# -------------------------------------------------------------------
# 3. KEEP ONLY THE NODES THAT APPEAR IN THE EDGE LIST
# -------------------------------------------------------------------

# Every id that occurs as either endpoint of at least one edge.
used_ids = set(edges["source"]) | set(edges["target"])

# Boolean mask over the metadata rows whose Id is part of the graph.
in_graph = meta_df["Id"].isin(used_ids)
meta_df = meta_df[in_graph]

print("Nodi usati nel grafo:", meta_df.shape)

# -------------------------------------------------------------------
# 4. BUILD NODES.CSV FOR GEPHI
# id    = Id
# label = title (falls back to ASIN when the title is missing)
# group = category
# -------------------------------------------------------------------

# Use the title as label and fall back to the ASIN where the title is NaN.
label = meta_df["title"].fillna(meta_df["ASIN"])
meta_df = meta_df.assign(label=label)

# Keep the three exported columns and lower-case the id header.
nodes = meta_df[["Id", "label", "group"]]
nodes.columns = ["id", "label", "group"]

# -------------------------------------------------------------------
# 5. BUILD EDGES.CSV FOR GEPHI
# -------------------------------------------------------------------

# An edge is kept only when both of its endpoints are in the node table.
source_known = edges["source"].isin(nodes["id"])
target_known = edges["target"].isin(nodes["id"])
edges = edges[source_known & target_known]

# State the exported column headers explicitly.
edges.columns = ["source", "target"]

# -------------------------------------------------------------------
# 6. EXPORT THE FINAL FILES
# -------------------------------------------------------------------

# Write to the absolute Desktop location that cell In [30] reads nodes2.csv
# and edges2.csv from. A bare relative name would be written to the kernel's
# working directory, which is only the same place if the notebook happens to
# be started from the Desktop.
nodes.to_csv("/Users/michelebaldo/Desktop/nodes2.csv", index=False)
edges.to_csv("/Users/michelebaldo/Desktop/edges2.csv", index=False)

# The message names the files that were actually written.
print("nodes2.csv e edges2.csv creati correttamente!")


Metadati caricati: (548552, 4)
   Id        ASIN                                              title group
0   0  0771044445                                                NaN   NaN
1   1  0827229534            Patterns of Preaching: A Sermon Sampler  Book
2   2  0738700797                         Candlemas: Feast of Flames  Book
3   3  0486287785   World War II Allied Fighter Planes Trading Cards  Book
4   4  0842328327  Life Application Bible Commentary: 1 and 2 Tim...  Book
Archi caricati: (3356824, 2)
   source  target
0       0       1
1       0       2
2       0       3
3       0       4
4       0       5
Nodi usati nel grafo: (410236, 4)
nodes.csv e edges.csv creati correttamente!


In [30]:
import pandas as pd

# Load the node and edge tables
nodes = pd.read_csv("/Users/michelebaldo/Desktop/nodes2.csv")
edges = pd.read_csv("/Users/michelebaldo/Desktop/edges2.csv")

# -------------------------------------------------------------------
# 1. Remove the node with id = 0 from the node table
# -------------------------------------------------------------------

nodes_clean = nodes[nodes["id"] != 0]

# -------------------------------------------------------------------
# 2. Remove every edge that has node 0 as source or as target
# -------------------------------------------------------------------

edges_clean = edges[(edges["source"] != 0) & (edges["target"] != 0)]

# -------------------------------------------------------------------
# 3. Save the cleaned files
# -------------------------------------------------------------------

# Written to the absolute Desktop location that cell In [31] reads
# nodes_clean.csv / edges_clean.csv from, instead of the kernel's working
# directory, so the next step always sees the files produced here.
nodes_clean.to_csv("/Users/michelebaldo/Desktop/nodes_clean.csv", index=False)
edges_clean.to_csv("/Users/michelebaldo/Desktop/edges_clean.csv", index=False)

print("Pulizia completata! Salvati nodes_clean.csv e edges_clean.csv")


Pulizia completata! Salvati nodes_clean.csv e edges_clean.csv


In [31]:
import pandas as pd

# -------------------------------------------------------------------
# 1. Load the cleaned node and edge tables
# -------------------------------------------------------------------
nodes = pd.read_csv("/Users/michelebaldo/Desktop/nodes_clean.csv")
edges = pd.read_csv("/Users/michelebaldo/Desktop/edges_clean.csv")

# -------------------------------------------------------------------
# 2. Compute the total degree (in-degree + out-degree)
# -------------------------------------------------------------------

# out_degree = number of edges in which the node is the source,
# in_degree  = number of edges in which the node is the target.
out_degree = edges.groupby("source").size().rename("out_degree")
in_degree = edges.groupby("target").size().rename("in_degree")

# Align both counts on the node id; a node that only ever appears on one side
# has no entry in the other count, hence the 0 fill.
degree = pd.concat([in_degree, out_degree], axis=1).fillna(0)
degree["total_degree"] = degree["in_degree"] + degree["out_degree"]

# Attach the degrees to the node table (left join keeps every node).
nodes_deg = nodes.merge(degree, left_on="id", right_index=True, how="left")

# Nodes without any edge get degree 0. The fill is limited to the degree
# columns: a frame-wide fillna(0) would also overwrite a missing label or
# group with the number 0.
nodes_deg = nodes_deg.fillna(
    {"in_degree": 0, "out_degree": 0, "total_degree": 0}
)

# -------------------------------------------------------------------
# 3. Select the 100 nodes with the highest total degree
# -------------------------------------------------------------------
ranked = nodes_deg.sort_values(by="total_degree", ascending=False)
top100 = ranked.head(100)

top100_ids = set(top100["id"])

# -------------------------------------------------------------------
# 4. Keep only the edges whose two endpoints are both among those nodes
# -------------------------------------------------------------------
source_in_top = edges["source"].isin(top100_ids)
target_in_top = edges["target"].isin(top100_ids)
edges_top100 = edges[source_in_top & target_in_top]

# -------------------------------------------------------------------
# 5. Export the files for Gephi
# -------------------------------------------------------------------
# Written to the absolute Desktop location that the following cells read
# nodes_top100.csv / edges_top100.csv from, rather than to the kernel's
# working directory, so those cells always pick up the files produced here.
top100.to_csv("/Users/michelebaldo/Desktop/nodes_top100.csv", index=False)
edges_top100.to_csv("/Users/michelebaldo/Desktop/edges_top100.csv", index=False)

print("Creati correttamente: nodes_top100.csv e edges_top100.csv")
print("Nodi selezionati:", len(top100_ids))
print("Archi filtrati:", len(edges_top100))


Creati correttamente: nodes_top100.csv e edges_top100.csv
Nodi selezionati: 100
Archi filtrati: 251


In [32]:
import pandas as pd

# Read the node table to sort (change the file name here to sort another table).
nodes = pd.read_csv("/Users/michelebaldo/Desktop/nodes_top100.csv")

# Order the rows by increasing id; ascending is sort_values' default.
nodes_sorted = nodes.sort_values("id")

# Write the sorted table without the index column.
nodes_sorted.to_csv("nodes_top100_sorted.csv", index=False)

print("File ordinato creato: nodes_top100_sorted.csv")


File ordinato creato: nodes_top100_sorted.csv


In [33]:
# Read the top-100 edge table.
edges = pd.read_csv("/Users/michelebaldo/Desktop/edges_top100.csv")

# Order by source first, then by target within each source (both ascending,
# which is sort_values' default).
sort_keys = ["source", "target"]
edges_sorted = edges.sort_values(sort_keys)

# Write the sorted table without the index column.
edges_sorted.to_csv("edges_top100_sorted.csv", index=False)

print("File edges ordinato creato: edges_top100_sorted.csv")


File edges ordinato creato: edges_top100_sorted.csv


In [34]:
import pandas as pd

# Load the top-100 node and edge tables
nodes = pd.read_csv("/Users/michelebaldo/Desktop/nodes_top100.csv")
edges = pd.read_csv("/Users/michelebaldo/Desktop/edges_top100.csv")

# Lookup table {id: total_degree}
degree_map = dict(zip(nodes["id"], nodes["total_degree"]))

# Series.map turns an unknown id into NaN instead of raising, so check the
# endpoints up front and keep failing with KeyError as the dict lookup would.
known_ids = nodes["id"]
unknown = ~(edges["source"].isin(known_ids) & edges["target"].isin(known_ids))
if unknown.any():
    raise KeyError(
        "edge endpoints missing from the node table: "
        + str(edges.loc[unknown, ["source", "target"]].head().values.tolist())
    )

# Artificial edge weight = total degree of the source + total degree of the
# target. Mapping whole columns replaces the per-row Python lambda and also
# works when the edge table has no rows.
edges["weight"] = edges["source"].map(degree_map) + edges["target"].map(degree_map)

# Save the weighted edge table
edges.to_csv("edges_top100_weighted.csv", index=False)

print("Creato edges_top100_weighted.csv con colonna 'weight'.")


Creato edges_top100_weighted.csv con colonna 'weight'.
