In [19]:
import pandas as pd
import pickle
import pysbd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [36]:
# pysbd sentence segmenter (English, no text cleaning). It is not referenced by
# the cells visible in this notebook; kept in case other code relies on it.
seg = pysbd.Segmenter(language="en", clean=False)

# One row per speech; the ID column is used to look up that speech's embeddings.
df = pd.read_csv('./ParlaMint-GB-lords.csv')

# NOTE(security): pickle.load can execute arbitrary code while deserialising.
# Only open this file if it comes from a trusted source.
#
# The file is read until EOF and the LAST pickled object wins, so a file that
# contains several objects back to back behaves as before.
embeddings = None  # reset so a stale value from an earlier run is never reused
with open("ParlaMint_GB_lords_embeddings.pkl", "rb") as openfile:
    while True:
        try:
            embeddings = pickle.load(openfile)
        except EOFError:
            break

# Fail here, at the load step, instead of with a confusing error further down.
if embeddings is None:
    raise ValueError(
        "ParlaMint_GB_lords_embeddings.pkl did not contain any pickled object"
    )

In [37]:
# Preview the first rows of the loaded CSV to check the columns parsed as expected.
df.head()

Unnamed: 0,ID,Title,Date,Body,Term,Session,Meeting,Sitting,Agenda,Subcorpus,Speaker_role,Speaker_MP,Speaker_Minister,Speaker_party,Speaker_party_name,Party_status,Speaker_name,Speaker_gender,Speaker_birth,speech
0,ParlaMint-GB_2017-01-18-lords.u1,"Minutes of the House of Lords, Daily Session 2...",2017-01-18,Upper house,56,,,2017-01-18,,Reference,Regular,MP,-,LAB,Labour,Opposition,"Harris, Jonathan Toby",M,-,To ask Her Majesty’s Government what represent...
1,ParlaMint-GB_2017-01-18-lords.u2,"Minutes of the House of Lords, Daily Session 2...",2017-01-18,Upper house,56,,,2017-01-18,,Reference,Regular,MP,-,CON,Conservative,,"Williams, Susan",F,-,"My Lords, Her Majesty’s Passport Office has be..."
2,ParlaMint-GB_2017-01-18-lords.u3,"Minutes of the House of Lords, Daily Session 2...",2017-01-18,Upper house,56,,,2017-01-18,,Reference,Regular,MP,-,LAB,Labour,Opposition,"Harris, Jonathan Toby",M,-,"My Lords, I am grateful to the Minister for th..."
3,ParlaMint-GB_2017-01-18-lords.u4,"Minutes of the House of Lords, Daily Session 2...",2017-01-18,Upper house,56,,,2017-01-18,,Reference,Regular,MP,-,CON,Conservative,,"Williams, Susan",F,-,The noble Lord is absolutely right that securi...
4,ParlaMint-GB_2017-01-18-lords.u5,"Minutes of the House of Lords, Daily Session 2...",2017-01-18,Upper house,56,,,2017-01-18,,Reference,Regular,MP,-,LAB,Labour,Opposition,"Campbell-Savours, Dale Norman",M,-,Does this Question not take us straight back t...


In [38]:
# Number of entries in the loaded embeddings object.
len(embeddings)

198130

In [39]:
# Row count of df. This should match len(embeddings), because the aggregation
# step looks up embeddings[ID] for every row of df.
len(df)

198130

In [40]:
def average_cosine_similarity(embeddings):
    """Weight each vector by how similar it is, on average, to the others.

    For every row the mean cosine similarity to all *other* rows is computed;
    those means are then normalised so that they sum to 1.

    Parameters
    ----------
    embeddings : sequence of equal-length vectors (list of lists or 2-D array)
        One vector per row.

    Returns
    -------
    numpy.ndarray or None
        1-D array with one weight per row, summing to 1. ``None`` when there
        are fewer than two rows, because there is nothing to compare against.
        If the mean similarities sum to zero (or are not finite) they cannot be
        normalised, and uniform weights ``1 / n`` are returned instead.
    """
    num_vectors = len(embeddings)
    if num_vectors < 2:
        return None

    matrix = np.asarray(embeddings)

    # A single pairwise call replaces one concatenate + one similarity call per
    # row; the row means over the off-diagonal entries are the same quantity.
    pairwise = cosine_similarity(matrix)
    np.fill_diagonal(pairwise, 0.0)  # drop each row's similarity with itself
    mean_to_others = pairwise.sum(axis=1) / (num_vectors - 1)

    total = mean_to_others.sum()
    if total == 0 or not np.isfinite(total):
        # Dividing would yield NaN/inf weights (e.g. mutually orthogonal rows).
        return np.full(num_vectors, 1.0 / num_vectors)

    return mean_to_others / total

In [41]:
def aggregate_embedding(speech_id):
    """Collapse the sentence embeddings of one speech into a single vector.

    Looks up ``embeddings[speech_id]`` in the module-level ``embeddings``
    mapping and returns the weighted sum of its sentence vectors, using the
    weights produced by ``average_cosine_similarity``.

    Parameters
    ----------
    speech_id : hashable
        Key into the module-level ``embeddings`` mapping.

    Returns
    -------
    array-like
        The weighted sum over the sentence axis. When
        ``average_cosine_similarity`` returns ``None`` (e.g. a speech with a
        single sentence), the first sentence vector is returned as is.

    Raises
    ------
    KeyError
        If ``speech_id`` is not present in ``embeddings``.
    """
    speech_embedding = embeddings[speech_id]
    sentence_weights = average_cosine_similarity(speech_embedding)

    # The weights arrive as a NumPy array, never a list, so the "no weights"
    # case must be detected via None. A type(...) != list test is always true
    # and would return the first sentence for every speech.
    if sentence_weights is None:
        return speech_embedding[0]

    sentence_matrix = np.asarray(speech_embedding)
    weight_column = np.asarray(sentence_weights).reshape(-1, 1)
    # Broadcast one weight per sentence row, then add the rows together.
    return np.sum(sentence_matrix * weight_column, axis=0)

In [42]:
# Build one aggregated vector per speech, keyed by the speech ID from df,
# with a tqdm progress bar over the IDs.
aggregated_embeddings = {
    speech_id: aggregate_embedding(speech_id)
    for speech_id in tqdm(list(df.ID))
}

100%|██████████████████████████████████████████████████████████████████████████| 198130/198130 [07:29<00:00, 440.70it/s]


In [43]:
# Serialise the ID -> aggregated vector mapping to disk with the newest pickle protocol.
with open('ParlaMint_GB_lords_aggregated_embeddings.pkl', 'wb') as out_file:
    pickler = pickle.Pickler(out_file, protocol=pickle.HIGHEST_PROTOCOL)
    pickler.dump(aggregated_embeddings)