In [None]:
import csv
import os
import re
from collections import defaultdict
from itertools import combinations

import geopandas as gpd
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import spacy
from shapely.geometry import Point, LineString
from spacy.pipeline import EntityRuler
from tqdm import tqdm

In [None]:
# Load the spaCy pipeline once; the network cell below uses it for entities and sentences.
nlp = spacy.load("en_core_web_trf")

In [None]:
# Load the cleaned data and get an overview.
docs = []  # not used in this cell; kept in case other cells rely on it

directory = 'data4/clean/'

# Read every .txt file. Filenames are sorted so the concatenation order is
# reproducible (os.listdir returns entries in arbitrary order).
contents = []
for filename in sorted(os.listdir(directory)):
    if filename.endswith('.txt'):
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            contents.append(file.read())

# Each document is followed by a blank line. Joining once avoids the repeated
# string copies that `whole += ...` performs inside a loop.
whole = "".join(content + "\n\n" for content in contents)

# Overview: report the corpus size and show a short preview rather than
# echoing the entire corpus into the notebook output.
print(f"Loaded {len(contents)} files, {len(whole)} characters")
print(whole[:1000])

# Raise spaCy's text-length limit to the corpus length plus a small margin.
nlp.max_length = len(whole) + 100

In [None]:
# Load the street names in Manchester extracted from GB1900.
file_path = './manchester_data.csv'
df = pd.read_csv(file_path)


# Distinct, non-null values of the 'final_text' column.
street_names = df['final_text'].dropna().unique()


# Alias for the concatenated corpus text built in the loading cell.
whole_text = whole


def is_valid_street_name(name):
    """Return True if `name` looks like a usable street-name label.

    A label is accepted when it is a string of 6 to 49 characters made up only
    of ASCII letters, digits, spaces and the punctuation ``. , ' & -``.

    Args:
        name: Candidate label, normally a string from the gazetteer column.

    Returns:
        bool: True when the label passes both checks; False otherwise,
        including for non-string values.
    """
    # Non-string cells (e.g. numbers parsed from the CSV) cannot be measured
    # with len() or regex-matched; treat them as invalid instead of raising.
    if not isinstance(name, str):
        return False
    if not 5 < len(name) < 50:
        return False
    # fullmatch, unlike match() with a trailing "$", also rejects a name that
    # ends in a newline.
    return re.fullmatch(r"[A-Za-z0-9 .,'&-]+", name) is not None

# Keep only the gazetteer labels that pass the street-name sanity check.
valid_street_names = list(filter(is_valid_street_name, street_names))

In [None]:
# Build the George Thompson network: GB1900 street names become nodes, and two
# streets are linked when the same people were collected for both of them.
exclude_streets = {'Library', 'Temple'}
valid_street_set = {street for street in valid_street_names if street not in exclude_streets}

# NOTE(review): the overview cell reads 'data4/clean/' while this cell reads
# 'data9/clean/' -- confirm that two different folders are intended.
directory = 'data9/clean/'
G = nx.Graph()

# street name -> set of PERSON entity texts gathered for that street
street_person_map = {}

# Sorted so node insertion order is reproducible (os.listdir order is arbitrary).
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.txt'):
        continue
    file_path = os.path.join(directory, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Paragraphs are separated by blank lines; each one is analysed on its own.
    for paragraph_text in content.split('\n\n'):
        if not paragraph_text.strip():
            # Nothing to analyse; skip the pipeline call.
            continue
        paragraph_doc = nlp(paragraph_text)

        # Streets mentioned in this paragraph: any entity whose text is a
        # known street name. Every such street becomes a node, even if no
        # person ends up attached to it.
        local_streets = []
        for ent in paragraph_doc.ents:
            if ent.text in valid_street_set:
                G.add_node(ent.text, type='street')
                street_person_map.setdefault(ent.text, set())
                if ent.text not in local_streets:
                    local_streets.append(ent.text)
        if not local_streets:
            # People are only recorded against streets of the same paragraph.
            continue

        # People named within three sentences either side of any sentence
        # that mentions George Thompson (case-insensitive match).
        sents = list(paragraph_doc.sents)
        nearby_people = set()
        for i, sent in enumerate(sents):
            if "george thompson" in sent.text.lower():
                for neighbour in sents[max(0, i - 3):i + 4]:
                    for ent in neighbour.ents:
                        if ent.label_ == 'PERSON' and ent.text != "George Thompson":
                            nearby_people.add(ent.text)

        for street in local_streets:
            street_person_map[street].update(nearby_people)


# Visit each unordered pair of streets exactly once. Looping over all ordered
# pairs on an undirected graph reaches every edge twice and doubles its weight.
for (street1, people1), (street2, people2) in combinations(street_person_map.items(), 2):
    shared_people_count = len(people1 & people2)
    if shared_people_count > 0:
        G.add_edge(street1, street2, weight=shared_people_count)

print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

In [None]:
# Export the network to a GEXF file to get an overview of it.
output_path = 'georgelocation.gexf'
nx.write_gexf(G, output_path)
print(f"Graph has been saved to {output_path}.")

In [None]:
# Attach the GB1900 coordinates to the matching street nodes.
file_path = './manchester_data.csv'
data = pd.read_csv(file_path)
for street_name, latitude, longitude in zip(data['final_text'], data['latitude'], data['longitude']):
    if street_name not in G:
        continue
    # When a name occurs on several rows, the last row's coordinates win.
    node_attrs = G.nodes[street_name]
    node_attrs['latitude'] = latitude
    node_attrs['longitude'] = longitude


In [None]:
# Calculate the degree centrality of every node, then export the nodes and the
# weighted edges as shapefiles.
degree_centrality = nx.degree_centrality(G)
nx.set_node_attributes(G, degree_centrality, 'degree_centrality')


# Node layer: one point per node that has coordinates. Nodes without
# 'latitude'/'longitude' attributes are left out of the export.
node_data = {
    'geometry': [],
    'size': [],
    'degree_centrality': []
}
for node, attr in G.nodes(data=True):
    if 'latitude' in attr and 'longitude' in attr:
        # shapely points are (x, y), i.e. (longitude, latitude)
        point = Point(attr['longitude'], attr['latitude'])
        node_data['geometry'].append(point)
        # No visible cell sets a 'size' attribute, so this falls back to 1.
        node_data['size'].append(attr.get('size', 1))
        node_data['degree_centrality'].append(attr.get('degree_centrality', 0))

nodes_gdf = gpd.GeoDataFrame(node_data, crs="EPSG:4326")


# Edge layer: one straight line per edge whose two endpoints both have
# coordinates.
edge_data = {
    'geometry': [],
    'weight': []
}
# The loop variable is deliberately not called `data`, so it does not
# overwrite the `data` DataFrame created in the coordinate cell.
for u, v, edge_attrs in G.edges(data=True):
    u_attrs, v_attrs = G.nodes[u], G.nodes[v]
    if 'latitude' in u_attrs and 'longitude' in u_attrs and 'latitude' in v_attrs and 'longitude' in v_attrs:
        line = LineString([(u_attrs['longitude'], u_attrs['latitude']),
                           (v_attrs['longitude'], v_attrs['latitude'])])
        edge_data['geometry'].append(line)
        edge_data['weight'].append(edge_attrs.get('weight', 1))

edges_gdf = gpd.GeoDataFrame(edge_data, crs="EPSG:4326")


output_path = './ditu/'
# Create the output folder if needed so a fresh checkout can run this cell.
os.makedirs(output_path, exist_ok=True)

# NOTE(review): shapefile field names are limited to 10 characters, so the
# 'degree_centrality' column is expected to be truncated on write -- confirm
# the resulting field name in the output.
nodes_gdf.to_file(f"{output_path}network_nodes1.shp")
edges_gdf.to_file(f"{output_path}network_edges1.shp")

print("Shapefiles have been saved successfully.")