In [1]:
import pandas as pd
import pandas as pd
import torch
import pickle
import os

import nlpsig
from nlpsig.ffn import FeedforwardNeuralNetModel
from nlpsig.focal_loss import FocalLoss

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def concatenate_data(data_folder_path):
    """
    Read every manifesto CSV in a folder and stack them into one dataframe.

    File names are expected to look like ``<party_id>_<YYYYMM>.csv``
    (e.g. ``51902_201706.csv``); the party id, year and month are parsed
    from the file name. Entries that do not end in ``.csv`` are skipped.

    Parameters
    ----------
    data_folder_path : str or path-like
        Folder containing the CSV files. Each file must provide ``text``
        and ``cmp_code`` columns.

    Returns
    -------
    pd.DataFrame
        One row per retained CSV row, with columns ``text``, ``cmp_code``,
        ``topic`` (first character of ``cmp_code`` as an int),
        ``switched_topic`` (True on the first row of a file and wherever
        ``topic`` differs from the previous row), ``party_id``, ``doc_id``
        (``"<party_id>_<year>"``) and ``datetime``.

    Raises
    ------
    ValueError
        If a CSV file name does not start with ``<int>_<YYYYMM>``, or if
        the folder contains no CSV files (nothing to concatenate).
    IndexError
        If a CSV file name contains no underscore.
    """
    print(f"looking in {data_folder_path} directory...")
    manifesto_dfs = []
    for filename in os.listdir(data_folder_path):
        # ignore checkpoints folders, hidden files and other non-CSV entries
        if not filename.lower().endswith(".csv"):
            continue
        print(f"- reading {filename}...")
        # parse filename for metadata
        filename_split = filename.split("_")
        party_id = int(filename_split[0])
        year = int(filename_split[1][0:4])
        month = int(filename_split[1][4:6])
        doc_id = f"{party_id}_{year}"
        # read from the folder that was listed (not a hard-coded "data/" prefix)
        df = pd.read_csv(os.path.join(data_folder_path, filename))[["text", "cmp_code"]]
        # drop rows coded "H" and rows with missing values
        df = df[df["cmp_code"] != "H"].dropna().reset_index(drop=True)
        df["topic"] = [int(str(code)[0]) for code in df["cmp_code"]]
        # the first row compares against NaN and is therefore always True;
        # this also works when the file has no rows left after filtering
        df["switched_topic"] = df["topic"].ne(df["topic"].shift())
        df["party_id"] = party_id
        df["doc_id"] = doc_id
        df["datetime"] = pd.Timestamp(f"{year}-{month}")
        manifesto_dfs.append(df)
    return pd.concat(manifesto_dfs).reset_index(drop=True)

In [3]:
# Load every manifesto file found under data/ into a single dataframe.
manifesto_df = concatenate_data("data/")

looking in data/ directory...
- reading 51902_201706.csv...
- reading 51902_201505.csv...
- reading 51320_201912.csv...
- reading 51620_201706.csv...
- reading 51620_201505.csv...
- reading 51421_201706.csv...
- reading 51421_201505.csv...
- reading 51421_201912.csv...
- reading 51902_201912.csv...
- reading 51320_201706.csv...
- reading 51620_201912.csv...
- reading 51320_201505.csv...


In [4]:
# Preview the first ten rows to check the parsed columns.
manifesto_df.head(10)

Unnamed: 0,text,cmp_code,topic,switched_topic,party_id,doc_id,datetime
0,SNP MPs have used their influence to deliver p...,305.1,3,True,51902,51902_2017,2017-06-01
1,Here’s just some of what a strong team of SNP ...,305.1,3,False,51902,51902_2017,2017-06-01
2,When the Scotland Bill was going through Westm...,301.0,3,False,51902,51902_2017,2017-06-01
3,"And it was SNP MPs, working with the Scottish ...",301.0,3,False,51902,51902_2017,2017-06-01
4,The SNP secured a deal that ensures Scotland w...,301.0,3,False,51902,51902_2017,2017-06-01
5,SNP MPs have consistently opposed Tory austerity.,504.0,5,True,51902,51902_2017,2017-06-01
6,Our MPs have been instrumental in forcing UK g...,504.0,5,False,51902,51902_2017,2017-06-01
7,Alison Thewliss has been at the forefront of t...,504.0,5,False,51902,51902_2017,2017-06-01
8,and force women to prove they have been raped ...,503.0,5,False,51902,51902_2017,2017-06-01
9,SNP MPs have worked with Women Against State P...,503.0,5,False,51902,51902_2017,2017-06-01


In [5]:
# Number of rows per topic (topic = first digit of cmp_code).
manifesto_df["topic"].value_counts()

5    5300
4    3872
6    1547
1    1504
2    1235
3    1101
7    1024
0      27
Name: topic, dtype: int64

In [6]:
# Number of rows per party.
manifesto_df["party_id"].value_counts()

51421    4515
51620    4307
51320    4039
51902    2749
Name: party_id, dtype: int64

In [7]:
# Number of rows per document (doc_id = "<party_id>_<year>").
manifesto_df["doc_id"].value_counts()

51421_2015    1917
51320_2019    1702
51620_2015    1588
51620_2017    1496
51421_2019    1467
51320_2017    1328
51620_2019    1223
51421_2017    1131
51902_2019    1071
51320_2015    1009
51902_2015     892
51902_2017     786
Name: doc_id, dtype: int64

## Model specifics

A nested dictionary of model specifications.

It covers the settings for text encoding, dimensionality reduction, embedding combination, time injection and path signatures.

In [8]:
model_specifics = {
    # settings unpacked into nlpsig.TextEncoder
    "encoder_args": {
        # column corresponding to the sentences
        "col_name_text": "text",
        # options: all-mpnet-base-v2, all-distilroberta-v1, all-MiniLM-L12-v2
        "model_name": "all-MiniLM-L6-v2",
        "model_args": {
            "batch_size": 64,
            "show_progress_bar": True,
            "output_value": "sentence_embedding",
            "convert_to_numpy": True,
            "convert_to_tensor": False,
            "device": None,
            "normalize_embeddings": False,
        },
    },
    # settings unpacked into nlpsig.DimReduce
    "dim_reduction": {
        # options: ppapca, ppapcappa, umap
        "method": "umap",
        # options: any int number between 1 and embedding dimensions
        "n_components": 10,
    },
    "embedding": {
        # options: SBERT, BERT_cls, BERT_mean, BERT_max
        "global_embedding_tp": "SBERT",
        # options: sentence, reduced
        "post_embedding_tp": "sentence",
        # options: concatenation, attention
        "feature_combination_method": "attention",
    },
    "time_injection": {
        # options: timestamp, None
        "history_tp": "timestamp",
        # options: timestamp, timediff, None
        "post_tp": "timestamp",
    },
    "signature": {
        # options: any int number larger than 1
        "dimensions": 3,
        # options: log, sig
        "method": "log",
        "interval": 1 / 12,
    },
}

## Obtaining SBERT Embeddings

We can use the `TextEncoder` class within `nlpsig` to obtain sentence embeddings from a model. Here, we have defined the encoder arguments in `model_specifics`.

In [9]:
# Show the encoder settings that are passed to TextEncoder below.
model_specifics["encoder_args"]

{'col_name_text': 'text',
 'model_name': 'all-MiniLM-L6-v2',
 'model_args': {'batch_size': 64,
  'show_progress_bar': True,
  'output_value': 'sentence_embedding',
  'convert_to_numpy': True,
  'convert_to_tensor': False,
  'device': None,
  'normalize_embeddings': False}}

We can pass these into the constructor of the class to initialise our text encoder as follows:

In [10]:
# Initialise the text encoder on the manifesto dataframe; the encoder
# arguments (text column, model name, model args) are unpacked from the config.
text_encoder = nlpsig.TextEncoder(manifesto_df, **model_specifics["encoder_args"])

The class has a `.encode_sentence_transformer()` method which first loads in the model (using the `model_name` and `model_args` attributes) and then obtains an embedding for each sentence. These sentence embeddings are then stored in the `embeddings_sentence` attribute of the object.

In [11]:
# Load the sentence-transformer model and embed every sentence; the result is
# stored on the encoder, so keep a direct handle on it for the later steps.
text_encoder.encode_sentence_transformer()
embeddings_sentence = text_encoder.embeddings_sentence

[INFO] number of sentences to encode: 15610


Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 244/244 [00:35<00:00,  6.80it/s]


## Dimensionality Reduction with UMAP

Here we specified our choices in `model_specifics` above:

In [12]:
# Show the dimensionality-reduction settings that are passed to DimReduce below.
model_specifics["dim_reduction"]

{'method': 'umap', 'n_components': 10}

In [13]:
# Fit the configured reducer on the sentence embeddings and project them down
# to `n_components` dimensions.
# NOTE(review): UMAP is generally stochastic and no seed is set here, so the
# reduced embeddings may differ between runs — confirm how DimReduce exposes a seed.
reduction = nlpsig.DimReduce(**model_specifics["dim_reduction"])
embeddings_reduced = reduction.fit_transform(embeddings_sentence)

In [14]:
# Compare the embedding shapes before and after dimensionality reduction.
print(embeddings_sentence.shape)
print(embeddings_reduced.shape)

(15610, 384)
(15610, 10)


## Data preparation: Time injection and Padding

In [15]:
# Bundle the dataframe with both sets of embeddings. Rows are grouped by
# doc_id and the label to predict is switched_topic.
# Per the log message below, this also adds time feature columns to .df.
manifesto_data = nlpsig.PrepareData(manifesto_df,
                                    id_column="doc_id",
                                    labels_column="switched_topic",
                                    embeddings=embeddings_sentence,
                                    embeddings_reduced=embeddings_reduced)

[INFO] Adding time feature columns into dataframe in .df


In [16]:
# Sanity check: per-document row counts in the prepared dataframe.
manifesto_data.df["doc_id"].value_counts()

51421_2015    1917
51320_2019    1702
51620_2015    1588
51620_2017    1496
51421_2019    1467
51320_2017    1328
51620_2019    1223
51421_2017    1131
51902_2019    1071
51320_2015    1009
51902_2015     892
51902_2017     786
Name: doc_id, dtype: int64

In [17]:
# Inspect one manifesto in the prepared dataframe, including the columns
# added alongside the original ones (embeddings and time features).
manifesto_data.df[manifesto_data.df["doc_id"]=="51902_2017"]

Unnamed: 0,text,cmp_code,topic,switched_topic,party_id,doc_id,datetime,d1,d2,d3,...,e378,e379,e380,e381,e382,e383,e384,time_encoding,time_diff,timeline_index
13753,SNP MPs have used their influence to deliver p...,305.1,3,True,51902,51902_2017,2017-06-01,7.076446,4.272100,7.765581,...,0.005326,0.037123,0.016147,0.055606,0.008570,0.026209,-0.021132,2017.413699,0,1
13754,Here’s just some of what a strong team of SNP ...,305.1,3,False,51902,51902_2017,2017-06-01,6.945519,4.231819,7.687037,...,-0.022241,-0.003946,0.018906,0.033558,-0.001041,0.002696,0.039106,2017.413699,0,2
13755,When the Scotland Bill was going through Westm...,301,3,False,51902,51902_2017,2017-06-01,7.170236,4.472499,8.550349,...,-0.037495,-0.051820,0.005708,0.042825,0.048136,0.034429,-0.040953,2017.413699,0,3
13756,"And it was SNP MPs, working with the Scottish ...",301,3,False,51902,51902_2017,2017-06-01,7.223161,4.530838,8.441557,...,-0.020897,-0.088878,-0.045942,0.015616,-0.015972,0.036915,0.012497,2017.413699,0,4
13757,The SNP secured a deal that ensures Scotland w...,301,3,False,51902,51902_2017,2017-06-01,7.302962,4.455559,8.645349,...,0.020309,-0.034161,-0.037054,-0.051635,-0.082328,-0.005767,-0.041583,2017.413699,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14534,The disgraceful condition of the housing provi...,201.2,2,False,51902,51902_2017,2017-06-01,5.455401,3.690897,8.006092,...,-0.058051,-0.012283,0.056241,-0.059958,-0.062718,0.068832,0.033046,2017.413699,0,782
14535,The Scottish Government’s work to resettle Syr...,201.2,2,False,51902,51902_2017,2017-06-01,5.445345,4.447828,7.652509,...,0.040820,0.043064,0.039066,0.044445,0.076865,0.004159,0.005174,2017.413699,0,783
14536,We will urge the UK government to work with th...,301,3,True,51902,51902_2017,2017-06-01,5.239152,4.309934,7.645834,...,-0.008096,0.020132,0.048608,0.039107,0.038743,-0.023217,-0.010533,2017.413699,0,784
14537,rather than use private contractors who have p...,413,4,True,51902,51902_2017,2017-06-01,5.336844,2.650033,8.115968,...,0.004935,-0.016151,0.000850,-0.079934,-0.067935,0.046949,0.015082,2017.413699,0,785


In [18]:
# Build, for every row, a path made of its k=5 most recent history rows,
# zero-padded where fewer are available, using the reduced embeddings and
# timeline_index as the time feature.
# NOTE(review): judging by the outputs below, each step holds the id, the
# label (-1 on padded rows), the time feature and the 10 reduced dimensions
# — confirm against the nlpsig documentation.
history_path = manifesto_data.pad(pad_by="history",
                                  method="k_last",
                                  zero_padding=True,
                                  k=5,
                                  time_feature="timeline_index",
                                  embeddings="dim_reduced")

[INFO] Padding ids and storing in .df_padded and .array_padded attributes


In [19]:
# Expect one path per row, k steps per path, and a fixed number of values per step.
history_path.shape

(15610, 5, 13)

In [20]:
# First path: there is no earlier history, so every step is padding.
history_path[0]

array([['51320_2015', -1, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0],
       ['51320_2015', -1, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0],
       ['51320_2015', -1, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0],
       ['51320_2015', -1, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0],
       ['51320_2015', -1, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0]], dtype=object)

In [21]:
# Fourth path: padding first, followed by the steps of actual history.
history_path[3]

array([['51320_2015', -1, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0],
       ['51320_2015', -1, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0],
       ['51320_2015', 1, 1, 5.452658653259277, 3.6029112339019775,
        8.050925254821777, 10.075458526611328, 3.9113845825195312,
        9.213820457458496, 5.951642990112305, 4.27547025680542,
        1.1885640621185303, 2.751408576965332],
       ['51320_2015', 0, 2, 5.111911773681641, 3.5011675357818604,
        7.9254584312438965, 10.427611351013184, 3.838214874267578,
        9.157601356506348, 6.385989189147949, 4.594970703125,
        1.2000325918197632, 3.1077473163604736],
       ['51320_2015', 0, 3, 5.043741703033447, 3.5229599475860596,
        7.4684038162231445, 9.6045560836792, 3.8596458435058594,
        9.537117004394531, 6.133909225463867, 4.508759498596191,
        0.9306520223617554, 2.9496004581451416]], dtype=object)