In [1]:
"""
This notebook reads the corpus and
runs several BERT helper functions to compute
network measures for the predicted words

For more information on BERT - go to https://huggingface.co/transformers/model_doc/bert.html
"""

In [4]:
from transformers import BertTokenizer, BertForMaskedLM, pipeline
import torch
import string
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
import numpy as np
import datetime
import time 
import spacy
import pandas as pd
import re

# Load spaCy's small English pipeline ("en_core_web_sm") into `nlp`, and build
# a set of English stop words from NLTK's stopwords corpus (not from spaCy).
# NOTE(review): neither `nlp` nor `stop_words` is referenced in the cells visible
# here — presumably used elsewhere or left over; confirm before removing.
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words('english')) 

import bert_functions as b_funcs

# configure the BERT network 
"""
Two models are available: one is uncased, the other is cased.
Choose according to how important letter case is in the sentences:
- bert-base-cased: Model is case-sensitive and there is a difference between 'english' and 'English'
- bert-base-uncased: Model is case-insensitive and there is no difference between 'english' and 'English'

"""

# bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# bert_model = BertForMaskedLM.from_pretrained('bert-base-cased').eval()

"""
Input network which is exported from Gephi with network information
"""

# network = pd.read_csv("MLDA Synonym.csv")
# Relative path to the CSV exported from Gephi; it is loaded with pd.read_csv
# in the "Link network" cell below, which indexes the frame by its 'Label' column.
Gephi_output_file_path = "../source_data/gephi_output_cleaned.csv" 

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Link network

In [9]:
"""
Loads the Gephi export into a dataframe whose row index
is taken from the 'Label' column (the column itself is kept).
"""
network = (
    pd.read_csv(Gephi_output_file_path)
    .pipe(lambda gephi_frame: gephi_frame.set_index(gephi_frame['Label']))
)

In [47]:
"""
Loads the full corpus; the input CSV is expected
to hold one document per row.

The "snippet" column is renamed to "Text", and the row
position is exposed as an 'index' column identifying each document.
"""
# Location of the CSV holding the original corpus text
corpus_input_csv_path = '../source_data/gme_corpus_inputs_10.csv'

# Read the corpus, normalise the text column name, then materialise the index column
df = (
    pd.read_csv(corpus_input_csv_path)
    .rename(columns={"snippet": "Text"})
    .reset_index()
)


In [54]:
def output_predict_metrics_from_masked_words(corpus=None, top_k=5, output_path='res.csv', predict_fn=None):
    """
    For every document in the corpus, runs the BERT masked-word
    prediction helper and stacks the per-document tables of network
    metrics for the predicted words into one dataframe.

    Parameters
    ----------
    corpus : DataFrame, optional
        One document per row, with a 'Text' column (document text) and an
        'index' column (document id). Defaults to the notebook-level `df`.
    top_k : int, optional
        Forwarded to the prediction helper as `top_k`. Defaults to 5.
    output_path : str or None, optional
        CSV file the combined table is written to (row index omitted).
        Pass None to skip writing. Defaults to 'res.csv'.
    predict_fn : callable, optional
        Called as ``predict_fn(text, top_k=top_k)`` and expected to return a
        DataFrame or None. Defaults to
        `b_funcs.key_word_predict_with_network_from_sent`.

    Returns
    -------
    DataFrame,
        All per-document results concatenated. Each row carries the id of
        its source document in 'from_textid'; the helper's own row index is
        moved into an 'index' column. Empty (only an 'index' column) when no
        document produced a result.
    """
    # Fall back to the notebook globals so a bare call behaves as before.
    if corpus is None:
        corpus = df
    if predict_fn is None:
        predict_fn = b_funcs.key_word_predict_with_network_from_sent

    n_docs = len(corpus)
    per_doc_results = []
    # Iterate positionally so the loop does not depend on the frame's index labels.
    for i, (text_id, input_text) in enumerate(zip(corpus['index'], corpus['Text'])):
        if i % 100 == 0:
            print(f'{i} / {n_docs} done')
        # Creates a dataframe with network measures (or None when nothing is found)
        res = predict_fn(input_text, top_k=top_k)
        if res is not None:
            # assign() returns a new frame, leaving the helper's object untouched.
            per_doc_results.append(res.assign(from_textid=text_id))

    # DataFrame.append was removed in pandas 2.0 and grew the table
    # quadratically; concatenate once instead. pd.concat rejects an empty
    # list, so fall back to an empty frame in that case.
    combined = pd.concat(per_doc_results) if per_doc_results else pd.DataFrame()
    combined = combined.reset_index()

    if output_path is not None:
        combined.to_csv(output_path, index=False)
    return combined
          
# Run the prediction over the whole corpus and keep the combined table;
# as a side effect the function also writes it to 'res.csv'.
result_df = output_predict_metrics_from_masked_words()

0 / 48782 done


In [55]:
# Preview the first rows of the combined prediction table.
result_df.head()

Unnamed: 0,index,prediction,cleaned_index,Label,self_auth,self_class,self_deg,self_betcent,pred_betcent,pred_auth,pred_deg,pred_class,string,from_textid
0,consistently_2,"[also, being, actually, currently, still]",consistently,consistently,0.000209,25.0,10.0,0.0,"[0.0, 0.0, 0.0, -1, 155965.9877]","[0.000391, 0.000651, 0.000391, -1, 0.000269]","[33, 125, 7, -1, 136]","[9, 3, 21, -1, 16]",Saxobank is consistently rated very poorly fro...,0
1,rated_3,"[performing, doing, functioning, working, mana...",rated,,,,,,"[-1, 174890.4158, -1, 481.203593, 47866.10398]","[-1, 0.001086, -1, 0.001485, 0.000102]","[-1, 169, -1, 19, 9]","[-1, 25, -1, 8, 9]",Saxobank is consistently rated very poorly fro...,0
2,poorly_5,"[highly, high, low, well, poorly]",poorly,poorly,0.001552,11.0,10.0,194.873669,"[1816.335264, 0.0, 39966.12492, 0.0, 194.873669]","[0.015114, 0.00691, 0.005939, 0.0034, 0.001552]","[176, 198, 173, 135, 10]","[25, 23, 11, 5, 11]",Saxobank is consistently rated very poorly fro...,0
3,seen_10,"[written, done, read, said]",seen,,,,,,"[-1, 0.0, 35.951423, -1]","[-1, 0.002634, 1.1e-05, -1]","[-1, 18, 8, -1]","[-1, 17, 0, -1]",Saxobank is consistently rated very poorly fro...,0
4,contradicting_17,"[but, and]",contradicting,,,,,,"[6263.708295, 10269.73499]","[8.6e-05, 0.000116]","[26, 10]","[18, 8]",Saxobank is consistently rated very poorly fro...,0
