# Mesoscopic V2

In [1]:
### command to show execution time

%load_ext autotime

time: 0 ns


## Text processing

### Gutenberg project

In [2]:
### retrieve gutenberg book 

from gutenberg.query import get_metadata
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

def retrieve_text(index):
    """Return the header-stripped text of Gutenberg book `index`.

    The text is loaded with `load_etext`, passed through `strip_headers`
    and stripped of surrounding whitespace.

    Returns an empty string when loading or cleaning raises an ordinary
    exception (best-effort behaviour: callers treat "" as "could not
    retrieve"). Interrupts such as KeyboardInterrupt and SystemExit are
    deliberately NOT swallowed, so a long batch run can still be stopped.
    """
    try:
        text = strip_headers(load_etext(index)).strip()
    except Exception:
        # Best effort: any retrieval/cleanup failure yields an empty text.
        text = ""
    return text

def retrieve_author(index):
    """Return the first 'author' metadata value of Gutenberg book `index`.

    Raises IndexError when the metadata query yields no value.
    """
    authors = list(get_metadata('author', index))
    return authors[0]

def retrieve_title(index):
    """Return all 'title' metadata values of Gutenberg book `index` as a list."""
    titles = get_metadata('title', index)
    return list(titles)

def retrieve_languages(index):
    """Return all 'language' metadata values of Gutenberg book `index` as a list."""
    languages = get_metadata('language', index)
    return list(languages)

def retrieve_subjects(index):
    """Return all 'subject' metadata values of Gutenberg book `index` as a list."""
    subjects = get_metadata('subject', index)
    return list(subjects)

time: 1.86 s


### Pre processing

In [3]:
### text standardization

def standardize_text(book, cutting_flag=False):
    """Normalise line breaks, paragraph marks and punctuation of a raw book text.

    Parameters
    ----------
    book : str
        Raw text of the book.
    cutting_flag : bool, optional
        When True, everything before the SECOND case-insensitive occurrence
        of 'chapter i.' is discarded. With fewer than two occurrences the
        text is left uncut.

    Returns
    -------
    str
        Text in which paragraphs are separated by a blank line, each
        paragraph break is preceded by a full stop, line breaks inside a
        paragraph are replaced by spaces and underscores are removed.
    """
    if cutting_flag:
        marker = 'chapter i.'
        width = len(marker)
        hits = 0
        for start in range(len(book)):
            if book[start:start + width].lower() != marker:
                continue
            hits += 1
            if hits == 2:
                # second occurrence found: keep the text from here on
                book = book[start:]
                break

    # The substitutions are order-dependent and applied one after another.
    substitutions = (
        ('\r', ''),        # drop every carriage return
        ('\n\n', '.\r'),   # mark paragraph breaks with ".\r" for later
        ('\n', ' '),       # remaining line breaks do not mark paragraphs
        ('\r', '\n\n'),    # turn the temporary marks back into paragraph breaks
        ('..', '.'),       # collapse duplicated full stops
        ('..', '...'),     # rebuild ellipses damaged by the previous step
        ('_', ''),         # remove underscores
    )
    for old, new in substitutions:
        book = book.replace(old, new)

    return book

time: 0 ns


In [4]:
### remove chapter markers

import re

# Single-word paragraphs treated as front-matter headings (compared
# case-insensitively, ignoring surrounding whitespace and trailing full stops).
_HEADING_WORDS = frozenset(['introduction', 'book', 'preface'])

# "<word> <number>" at the start of an upper-cased paragraph, e.g. "CHAPTER IV."
# or "PART 12". <number> must be a complete token: either digits or a
# well-formed Roman numeral (runs of up to four I/X/C are tolerated). Requiring
# a complete, well-formed numeral keeps ordinary short sentences such as
# "HE DID NOT." or "HE CAME." from being mistaken for chapter markers.
_CHAPTER_MARKER_RE = re.compile(
    r'^\w+ (?=[IVXLCDM\d])'
    r'(?:\d+|M{0,4}(?:CM|CD|D?C{0,4})(?:XC|XL|L?X{0,4})(?:IX|IV|V?I{0,4}))'
    r'(?!\w)'
)


def remove_chapter_markers(book):
    """Drop heading-like paragraphs from a book whose paragraphs are split by blank lines.

    A paragraph is removed when any of the following holds:

    * it consists only of one of the words 'introduction', 'book' or
      'preface' (any letter case, optional trailing full stops);
    * it has at most 10 space-separated words and starts with
      "<word> <number>", where <number> is a digit sequence or a Roman numeral;
    * it contains no word character at all.

    Parameters
    ----------
    book : str
        Text with paragraphs separated by '\\n\\n'.

    Returns
    -------
    str
        The remaining paragraphs, joined again with '\\n\\n'.
    """
    kept = []
    for paragraph in book.split('\n\n'):
        stripped = paragraph.strip()
        if stripped.rstrip('.').lower() in _HEADING_WORDS:
            continue
        is_short = len(paragraph.split(' ')) <= 10
        if is_short and _CHAPTER_MARKER_RE.match(stripped.upper()):
            continue
        if not re.search(r'\w', paragraph):
            continue
        kept.append(paragraph)
    return '\n\n'.join(kept)

time: 0 ns


In [5]:
### resolving anaphora (coreferences) in the given text
### resolving over the whole text would also catch references across paragraphs; note that transform_text_to_syntatic_pairs currently calls this once per paragraph

import neuralcoref 
import spacy

def solve_anaphors(book, nlp):
    """Run `nlp` on `book` and return the document's `_.coref_resolved` value.

    `nlp` is expected to be a pipeline whose documents expose the
    `coref_resolved` extension (presumably added by neuralcoref -- the
    pipeline is built by the caller).
    """
    return nlp(book)._.coref_resolved

time: 1.08 s


In [6]:
### stopwords removal

import nltk 

from nltk.corpus import stopwords

def remove_stopwords(text):
    """Remove English stopwords from a space-separated text.

    The text is split on single spaces and every word that appears in
    `stopwords.words('english')` is dropped; the comparison is exact, so
    letter case and attached punctuation matter. The remaining words are
    joined with single spaces.
    """
    # A set gives constant-time membership tests instead of scanning the
    # whole stopword list for every word of the text.
    stop_words = set(stopwords.words('english'))
    return ' '.join(word for word in text.split(' ') if word not in stop_words)

time: 1.53 s


### Syntactic parsing

In [7]:
### apply syntactic parsing to the text to get:
###  whole text, only having removed the stopwords
###  (actor, action): root.dep_ = nsubj (actor), root.head.text = action
###  (action, object): root.dep_ = dobj (object), root.head.text = action
### in the 2 latter cases, we apply lemmatization after getting the pairs

import spacy
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

def get_syntatic_pairs(node_text, nlp, lemmatizer):
    """Extract lemmatised noun/verb tokens from the subject and object noun chunks.

    For every noun chunk of `nlp(node_text)` whose root has dependency
    'nsubj' or 'dobj', three tokens are collected: the noun lemma of the
    chunk root, the verb lemma of the root's head, and the combined token
    '<noun>_<verb>'.

    Parameters
    ----------
    node_text : str
        Text to parse.
    nlp : callable
        Pipeline returning a document that exposes `noun_chunks`.
    lemmatizer : object
        Object with a `lemmatize(word, pos=...)` method.

    Returns
    -------
    str
        The distinct tokens joined by single spaces, in sorted order so the
        result is reproducible from run to run (joining a set directly
        yields an arbitrary order). Empty string when no chunk qualifies.
    """
    doc = nlp(node_text)
    tokens = set()
    for chunk in doc.noun_chunks:
        root = chunk.root
        if root.dep_ not in ('dobj', 'nsubj'):
            continue
        noun = lemmatizer.lemmatize(root.text, pos=wordnet.NOUN)
        verb = lemmatizer.lemmatize(root.head.text, pos=wordnet.VERB)
        tokens.update((noun + '_' + verb, noun, verb))

    return ' '.join(sorted(tokens))

def transform_text_to_syntatic_pairs(book):
    """Turn a book into one syntactic-pair string per paragraph.

    The book is lower-cased and split on blank lines. A spaCy
    'en_core_web_lg' pipeline with neuralcoref added is loaded on every
    call, and each paragraph goes through `solve_anaphors` followed by
    `get_syntatic_pairs`.

    Returns
    -------
    list of str
        One entry per paragraph, in paragraph order. Entries may be empty
        strings; they are kept on purpose (see the comment below).
    """
    lowered = book.lower()
    nlp = spacy.load('en_core_web_lg')
    lemmatizer = WordNetLemmatizer()
    neuralcoref.add_to_pipe(nlp)

    # Empty results must NOT be filtered out here: dropping them would make
    # the later window join paragraphs that are further apart than intended.
    return [
        get_syntatic_pairs(solve_anaphors(paragraph, nlp), nlp, lemmatizer)
        for paragraph in lowered.split('\n\n')
    ]

time: 0 ns


## Network modelling

### Getting network node text

In [8]:
### joining paragraphs to reach min size of the text for a single node

def get_node_texts(pairs_per_paragraph, window_size):
    """Build the text of every node by joining each paragraph with its neighbours.

    Node `k` is the '\\n\\n'-joined content of paragraphs
    `k - window_size` .. `k + window_size`, clipped to the bounds of the list.

    Returns
    -------
    dict
        Maps the paragraph position to its node text.
    """
    return {
        position: '\n\n'.join(
            # slicing already clips an end index beyond the list length
            pairs_per_paragraph[max(0, position - window_size):position + window_size + 1]
        )
        for position in range(len(pairs_per_paragraph))
    }

time: 0 ns


### TF IDF

In [9]:
#### we already have the nodes, for the edges we'll use TF_IDF and cosine similarity to define which nodes to connect

import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from numpy import dot
from numpy.linalg import norm

def cos_sim(a, b):
    """Return the cosine similarity of vectors `a` and `b`.

    Returns 0 when the product of the two norms is zero (at least one
    zero vector), which avoids a division by zero.
    """
    norm_product = norm(a) * norm(b)
    if norm_product == 0:
        return 0
    return dot(a, b) / norm_product

def get_sim_scores(dic):
    """Compute pairwise TF-IDF cosine similarities between node texts.

    Parameters
    ----------
    dic : dict
        Maps node ids to node texts; nodes are processed in sorted key order.

    Returns
    -------
    (scores, n_docs)
        `scores` is an `n_docs` x `n_docs` array in which only the entries
        above the diagonal (i < j) are filled; everything else stays 0.
        `n_docs` is the number of documents that were vectorised.
    """
    # Filtering by length is safe at this point because the windows are
    # already built: only nodes whose text is completely empty are dropped.
    # A node made solely of empty paragraphs still holds the join separators,
    # so it survives as an empty document after stripping; this keeps the
    # node count comparable across window sizes.
    docs = []
    for key in sorted(dic.keys()):
        node_text = dic[key]
        if len(node_text) > 0:
            docs.append(node_text.strip())

    vectorizer = TfidfVectorizer(use_idf=True)
    tfidf_matrix = vectorizer.fit_transform(docs)

    # one dense 1-D vector per document
    doc_vectors = [np.squeeze(np.asarray(row.todense())) for row in tfidf_matrix]

    n_docs = len(docs)
    scores = np.zeros((n_docs, n_docs))
    for i, vector_i in enumerate(doc_vectors):
        for j in range(i + 1, n_docs):
            scores[i][j] = cos_sim(vector_i, doc_vectors[j])

    return scores, n_docs

time: 0 ns


### Building graph

In [10]:
### creating graph using igraph

from igraph import *
import numpy as np 

def create_graph(scores, N, window_size):
    """Build the undirected paragraph graph from the similarity scores.

    For every pair i < j:

    * when the nodes are more than `2 * window_size` positions apart (their
      windows do not overlap) and `scores[i][j]` is positive, a
      'similarity' edge weighted by that score is added;
    * otherwise, when j directly follows i, a 'sequence' edge with weight 2
      is added.

    Parameters
    ----------
    scores : indexable as scores[i][j]
        Pairwise similarity scores.
    N : int
        Number of nodes.
    window_size : int
        Window size that was used to build the node texts.

    Returns
    -------
    Graph
        Graph whose edges carry the attributes 'weight' and 'type'.
    """
    edges, types, weights = [], [], []
    overlap_span = 2 * window_size

    for i in range(N):
        for j in range(i + 1, N):
            if j - i > overlap_span:
                # windows do not overlap: connect only on positive similarity
                similarity = scores[i][j]
                if similarity > 0:
                    edges.append((i, j))
                    types.append('similarity')
                    weights.append(similarity)
            elif j == i + 1:
                edges.append((i, j))
                types.append('sequence')
                weights.append(2)

    G = Graph(directed=False)
    G.add_vertices(list(range(N)))
    G.add_edges(edges)
    G.es['weight'] = weights
    G.es['type'] = types

    return G

time: 63 ms


## Algorithm execution

In [11]:
import pickle

def run(book_id, windows=(1, 4, 9)):
    """Run the whole pipeline for one Gutenberg book and store its networks.

    Steps: retrieve and standardise the text, remove chapter markers,
    extract the syntactic pairs of every paragraph, write them to
    'network_<book_id>_processed_text.txt', then build, filter and pickle
    one graph per window size.

    Nothing is written when the text could not be retrieved (empty) or has
    1,000,000 characters or more; a message is printed instead. A graph is
    pickled only when it has at least 100 vertices.

    Parameters
    ----------
    book_id : int
        Gutenberg id of the book.
    windows : iterable of int, optional
        Window sizes to build networks for; one file
        'network_<book_id>_w<window>.p' is written per size. The default
        reproduces the original three runs (1, 4 and 9).

    Returns
    -------
    None
    """
    book = standardize_text(retrieve_text(book_id))
    if len(book) == 0:
        print('could not retrieve', book_id)
        return
    if len(book) >= 1000000:
        print('book is too big')
        return

    book = remove_chapter_markers(book)

    pairs_per_paragraph = transform_text_to_syntatic_pairs(book)

    output_prefix = './pca 300 networks new right/network_' + str(book_id)

    # `with` guarantees the files are flushed and closed, even on error.
    with open(output_prefix + '_processed_text.txt', 'w', encoding='utf-8') as text_file:
        text_file.write('\n\n'.join(pairs_per_paragraph))

    for window in windows:
        node_dic = get_node_texts(pairs_per_paragraph, window)
        scores, N = get_sim_scores(node_dic)
        G = create_graph(scores, N, window)
        # NOTE(review): filterNetworkEdges is not defined in the cells shown
        # here; it is assumed to come from another cell. The meaning of the
        # argument 20 should be confirmed against its definition.
        G = filterNetworkEdges([G], 20)[0]
        if G.vcount() >= 100:
            with open(output_prefix + '_w' + str(window) + '.p', 'wb') as graph_file:
                pickle.dump(G, graph_file)
    

time: 0 ns


### Automating the run of the algorithm for all 300 selected books

In [16]:
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt

# Re-run the pipeline for every book that already has a network pickle in the
# old results folder, skipping books whose window-9 network already exists
# in the new folder.
method_old = '9_nsubj'
path_old = './pca 300 networks/' + method_old + '/'
path_new = './pca 300 networks new right/'
directory = os.fsencode(path_old)

# Old files are named 'network_<gutenberg_id>_<method_old>.p'; these two
# offsets delimit the id inside the full path.
start = len(path_old) + len('network_')
end = len('_' + method_old + '.p')

for file in os.listdir(directory):
    filename = path_old + os.fsdecode(file)
    if not filename.endswith(".p"):
        continue

    gutenberg_id = int(filename[start:-end])

    already_done = os.path.exists(path_new + 'network_' + str(gutenberg_id) + '_w9.p')
    if not already_done:
        print(gutenberg_id)
        run(gutenberg_id)


28284
28675
29291
29406
29642
29764
29860
3047
3056
30576
30851
30865
30873
31029
31387
31672
31891
32090
32120
32129
32185
32250
322
32708
32931
32997
33004
33113
33195
33221
33233
33416
33437
33976
34020
34395
34488
3472
34832
35975
36531
36540
36585
36717
36728
36804
36828
36858
3695
37263
37431
37544
37681
37909
38679
38952
38960
39211
3926
39300
39372
39443
39595
39629
39747
39933
39957
40255
40340
40372
40525
40793
40966
41218
41267
41408
41524
41632
41655
41703
41777
41893
41896
42155
42320
42417
42426
4262
42630
42772
42860
42990
43218
43455
43731
43937
44428
44780
44862
45053
45214
45395
45452
45728
45926
45929
45989
46006
46276
46315
46320
46338
46403
46619
46794
46889
47242
47585
47731
47762
47923
4798
48184
48382
48702
48865
48953
48996
49324
49391
49490
49615
4965
49718
49794
4987
51022
51285
5145
51734
51913
51922
52378
52609
53945
54496
54743
54990
55002
55212
55376
55557
55658
55894
56006
56040
56321
56347
56675
57010
57308
57512
57680
57798
57989
58013
5805
58677
58718