# Batch Legal – BERT-Model-Pipe: Data Retrieval – Preprocessing – Modelling- Dictionaries – Plotly-Function for Streamlit




## Imports for the entire Notebook

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install --quiet -U spacy



In [None]:
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 5.2 MB/s 
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 2.2.5
    Uninstalling en-core-web-sm-2.2.5:
      Successfully uninstalled en-core-web-sm-2.2.5
Successfully installed en-core-web-sm-3.3.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
!pip install spacy-lookups-data

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
#Imports

import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.collocations import *

import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation


## Data Retrieval and Exploration

In [None]:
#Jakob, you should enter your code here!

## Preprocessing

In [3]:
#Loading a lot of data from csv
"""CSV with over 5000 documents"""

data = pd.read_csv("/content/drive/MyDrive/over_5000_docs_scraped.csv")

In [None]:
#Starting the actual Preprocessing
df_content = data.Content

In [None]:
#Preproc-Function

def cleaning(sentence):
    
    # Basic cleaning
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercasing 
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## removing numbers
    
    # Advanced cleaning
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## removing punctuation
    tokenized_sentence = word_tokenize(sentence) ## tokenizing 
    stop_words = set(stopwords.words('english')) ## defining stopwords
    tokenized_sentence_cleaned = [w for w in tokenized_sentence 
                                  if not w in stop_words] ## remove stopwords
    
    #tokenized_sentence_cleaned = [w for w in tokenized_sentence_cleaned if not w in ignore_list] COMMENTED IGNORE OUT!
    
    sentences = ' '.join(word for word in tokenized_sentence_cleaned)
    
    #spacy
    nlp = spacy.load('en_core_web_sm', disable=["tok2vec", "tagger", "parser", "attribute_ruler"])
    nlp.remove_pipe("lemmatizer")
    nlp.add_pipe("lemmatizer", config={"mode": "lookup"}).initialize()
    doc = nlp(sentences)
    lemmatized = " ".join([token.lemma_ for token in doc])
    
    return lemmatized

In [12]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [13]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# Applying Cleaning Function

clean_txt = df_content.apply(cleaning)

In [None]:
len(clean_txt)

5147

In [None]:
clean_txt_df = pd.DataFrame(clean_txt)

In [None]:
#Save cleaned txt as csv

clean_txt_df.to_csv('/content/drive/MyDrive/5000_docs_clean_txt.csv')

In [4]:
clean_txt_df = pd.read_csv('/content/drive/MyDrive/5000_docs_clean_txt.csv')

In [5]:
clean_txt_df

Unnamed: 0.1,Unnamed: 0,Content
0,0,financial crisis expose weakness transparency ...
1,1,condition make fertiliser available internal m...
2,2,regulation ec aim establish maintain high unif...
3,3,council regulation ec substantially amend seve...
4,4,regulation ec provide authorisation additive u...
...,...,...
5142,5142,commission regulation ec lay detail rule imple...
5143,5143,article regulation eu establish criterium gran...
5144,5144,annex regulation eu establish union list subst...
5145,5145,commission implement regulation eu provide ope...


In [6]:
clean_txt = clean_txt_df['Content']

In [7]:
all_content = ''.join(clean_txt)

In [8]:
from collections import Counter

# split() returns list of all the words in the string
split_it = all_content.split()

# Pass the split_it list to instance of Counter class.
Counters_found = Counter(split_it)
#print(Counters)

# most_common() produces k frequently encountered
# input values and their respective counts.
most_occur = Counters_found.most_common(100)


In [None]:
most_occur

In [8]:
# list used to remove  most frequent words

ignore_list = ['shall', 'may', 'regulation', 'article', 'union', 'state', 'eu', 'official',  'member', 'commission', 'accordance', 'european', 
               'column', 'costum', 'nomenclature', 'directive', 'council', 'journal', 'information', 'agency', 'mssg', 'etf', 'mdssg', 'ltd', 'annex', 
              'reg', 'solas', 'resmsc', 'hsc', 'ed', 'res', 'incl', 'corr', 'msccirc', 'msc', 'ec', 'eec', 'no', 'en', 'code', 'ii', 'iii',
               'xi', 'goi', 'pepp', 'imo', 'mm', 'unka', 'prask', 'tsg']

In [9]:
#Function to get rid of these terms

def ignore(sentence):
  tokenized_sentence = word_tokenize(sentence) ## tokenizing 
  cleaned  = [w for w in tokenized_sentence if not w in ignore_list]
  sentence_cleaned = ' '.join(word for word in cleaned)
  return sentence_cleaned

In [10]:
#Applying function

txt_clean = clean_txt.apply(ignore)

In [11]:
txt_clean.head()

0    financial crisis expose weakness transparency ...
1    condition make fertiliser available internal m...
2    aim establish maintain high uniform level civi...
3    substantially amend several time interest clar...
4    provide authorisation additive use animal nutr...
Name: Content, dtype: object

In [12]:
#Saving txt_clean

txt_clean.to_csv('/content/drive/MyDrive/5000_txt_clean.csv')

In [13]:
#Loading txt_clean

txt_clean = pd.read_csv('/content/drive/MyDrive/5000_txt_clean.csv')

In [7]:
txt_clean

Unnamed: 0.1,Unnamed: 0,Content
0,0,financial crisis expose weakness transparency ...
1,1,condition make fertiliser available internal m...
2,2,aim establish maintain high uniform level civi...
3,3,substantially amend several time interest clar...
4,4,provide authorisation additive use animal nutr...
...,...,...
5142,5142,lay detail rule implement system additional im...
5143,5143,establish criterium grant tariff preference ge...
5144,5144,annex establish list substance may add one cat...
5145,5145,implement provide open annual import tariff qu...


In [14]:
data.Content = txt_clean['Content']

In [9]:
data.head()

Unnamed: 0.1,Unnamed: 0,title,cellar,date,dir_code,dir_1,dir_2,dir_3,dir_4,dir_5,dir_6,Content
0,0,Regulation (EU) No 600/2014 of the European Pa...,3b729ddf-f1f7-11e3-8cd4-01aa75ed71a1,2014-06-12,6202025.0,Right of establishment and freedom to provide ...,Sectoral application,Service activities,Stock exchanges and other securities markets,,,financial crisis expose weakness transparency ...
1,1,Regulation (EU) 2019/1009 of the European Parl...,e351eb07-9713-11e9-9369-01aa75ed71a1,2019-06-25,133019.0,Industrial policy and internal market,Internal market: approximation of laws,Fertilisers,,,,condition make fertiliser available internal m...
2,2,Commission Regulation (EU) No 139/2014 of 12 F...,3cc00bdc-955e-11e3-8c34-01aa75ed71a1,2014-02-14,74030.0,Transport policy,Air transport,Air safety,,,,aim establish maintain high uniform level civi...
3,3,Regulation (EU) 2018/196 of the European Parli...,fa9532bb-12e6-11e8-9253-01aa75ed71a1,2018-02-16,2303040.0,Customs Union and free movement of goods,Application of the Common Customs Tariff,Tariff derogations,Reintroduction of customs duties,,,substantially amend several time interest clar...
4,4,Commission Implementing Regulation (EU) 2021/4...,61d76918-8b7a-11eb-b85c-01aa75ed71a1,2021-03-23,35010.0,Agriculture,Approximation of laws and health measures,Animal feedingstuffs,,,,provide authorisation additive use animal nutr...


In [10]:
#Transforming Series in List

txt_clean = txt_clean.Content.tolist()

# Topic-Modelling with BERTopic




In [23]:
#PIP-installing BERTtopic

!pip install bertopic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [15]:
from bertopic import BERTopic #BERTtopic-model: https://github.com/MaartenGr/BERTopic

# Topic-Modelling Entire Data

### BERTopic-model

In [13]:
#Training

topic_model_all = BERTopic(language="english", calculate_probabilities=True, verbose=True)
topics_all, probs = topic_model_all.fit_transform(txt_clean)

Batches:   0%|          | 0/161 [00:00<?, ?it/s]

2022-06-08 14:28:12,782 - BERTopic - Transformed documents to Embeddings
2022-06-08 14:28:42,182 - BERTopic - Reduced dimensionality
2022-06-08 14:28:45,461 - BERTopic - Clustered reduced embeddings


In [14]:
topic_model_all.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,845,-1_activity_market_producer_datum
1,0,226,0_institution_exposure_credit_resolution
2,1,135,1_animal_establishment_disease_consignment
3,2,132,2_biocidal_producttype_active_substance
4,3,127,3_fish_vessel_catch_fishery
...,...,...,...
129,128,11,128_de_vinhais_pgi_register
130,129,11,129_office_irregularity_directorgeneral_dollar
131,130,11,130_name_register_therefrom_enter
132,131,10,131_licence_quantity_coefficient_importer


In [15]:
topic_model_all.visualize_barchart()

In [None]:
topic_model_all.get_topics()

In [None]:
topic_model_all.visualize_topics()

In [None]:
topic_model_all.visualize_hierarchy()

### LDA-Model

In [None]:
# bigram vectorization

vectorizer_n_gram = TfidfVectorizer(ngram_range = (1,1)) # BI-GRAMS
cleaned_vectorizer_n_gram = vectorizer_n_gram.fit_transform(txt_clean)

In [None]:
df_animal = pd.DataFrame(cleaned_vectorizer_n_gram.toarray(), columns=vectorizer_n_gram.get_feature_names_out())

In [None]:
#Topic model function from ML-10-lecture
def print_topics(model, vectorizer, top_words):
    for idx, topic in enumerate(model.components_):
        print("-"*20)
        print("Topic %d:" % (idx))
        print([(vectorizer_n_gram.get_feature_names_out()[i], round(topic[i],2))
                        for i in topic.argsort()[:-top_words - 1:-1]])

In [None]:
# Instantiating the LDA 
n_components = 10
lda_model = LatentDirichletAllocation(n_components=n_components, max_iter = 100)

# Fitting the LDA on the vectorized documents
lda_model.fit(df_animal)

LatentDirichletAllocation(max_iter=100, n_components=2)

In [None]:
print_topics(lda_model, cleaned_vectorizer_n_gram, top_words=20)

--------------------
Topic 0:
[('additive', 154.85), ('ec', 152.87), ('use', 143.37), ('product', 133.5), ('annex', 130.22), ('provide', 128.76), ('authority', 123.43), ('information', 116.81), ('measure', 114.51), ('set', 107.09), ('column', 104.06), ('import', 99.36), ('substance', 95.04), ('follow', 93.17), ('may', 93.11), ('good', 92.79), ('period', 89.33), ('application', 89.15), ('animal', 87.43), ('apply', 86.52)]
--------------------
Topic 1:
[('import', 151.91), ('value', 144.45), ('standard', 143.94), ('amendment', 131.03), ('implement', 111.66), ('fix', 106.31), ('name', 103.42), ('journal', 98.58), ('day', 87.9), ('enter', 86.71), ('register', 85.62), ('specification', 80.07), ('uruguay', 73.6), ('xvi', 72.33), ('annex', 71.92), ('variable', 70.71), ('negotiation', 70.48), ('multilateral', 68.53), ('round', 68.4), ('stipulate', 66.57)]


#Creating a function to speed up Modelling of Sub_Directories

In [108]:
!pip install hdbscan

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [16]:
import numpy as np
from umap import UMAP

from typing import List
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

def model_to_figure(data, dir, name):
  df_selec = data.loc[data[f'dir_{dir}'] == name]
  txt = df_selec.Content
  txt = txt.tolist()
  
  #Training model
  #umap_model = UMAP(init='random')
  
  model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
  topics, probs = model.fit_transform(txt)

  #Building Dict
  dictionary = {"Sub_dir_name": name, "get_topic": model.get_topics(), "topic_freq": model.get_topic_freq(), "topic_sizes": model.topic_sizes}

  #Getting Embeddings for the visualize_topic_function
  # Select topics based on top_n and topics args
  freq_df = model.get_topic_freq()
  topics = sorted(freq_df.Topic.to_list())

  # Extract topic words and their frequencies
  topic_list = sorted(topics)
  frequencies = [model.topic_sizes[topic] for topic in topic_list]
  words = [" | ".join([word[0] for word in model.get_topic(topic)[:5]]) for topic in topic_list]

  # Embed c-TF-IDF into 2D
  all_topics = sorted(list(model.get_topics().keys()))
  indices = np.array([all_topics.index(topic) for topic in topics])
  if len(topics) > 1:
    embeddings_1 = model.c_tf_idf.toarray()[indices]
    embeddings = MinMaxScaler().fit_transform(embeddings_1)
    embeddings = UMAP(n_neighbors=2, n_components=2, metric='hellinger').fit_transform(embeddings)
  else:
    embeddings = []

  if len(topics) != 1:
    distance_matrix = 1 - cosine_similarity(embeddings_1)
  else:
    distance_matrix = []

  return dictionary, embeddings, distance_matrix

##Bar-Chart Function

In [44]:
import itertools
import numpy as np
from typing import List

import plotly.graph_objects as go
from plotly.subplots import make_subplots

def bert_bar(topic_freq, get_topic,
             topics: List[int] = None,
             top_n_topics: int = None,
             n_words: int = 5,
             width: int = 250,
             height: int = 250) -> go.Figure:
  
  
  colors = itertools.cycle(["#D55E00", "#0072B2", "#CC79A7", "#E69F00", "#56B4E9", "#009E73", "#F0E442"])
  
  # Select topics based on top_n and topics args
  freq_df = topic_freq
  #freq_df = freq_df.loc[freq_df.Topic != -1, :]
  if topics is not None:
      topics = list(topics)
  elif top_n_topics is not None:
      topics = sorted(freq_df.Topic.to_list()[:top_n_topics])
  else:
      topics = sorted(freq_df.Topic.to_list())
  
  print(topics)

  # Initialize figure
  subplot_titles = [f"Topic {topic}" for topic in topics]
  columns = 4
  rows = int(np.ceil(len(topics) / columns))
  fig = make_subplots(rows=rows,
                      cols=columns,
                      shared_xaxes=False,
                      horizontal_spacing=.1,
                      vertical_spacing=.4 / rows if rows > 1 else 0,
                      subplot_titles=subplot_titles)

  # Add barchart for each topic
  row = 1
  column = 1
  for topic in topics:
    words = [word + "  " for word, _ in get_topic[topic]][:n_words][::-1]
    scores = [score for _, score in get_topic[topic]][:n_words][::-1]
    
    fig.add_trace(
        go.Bar(x=scores,
                   y=words,
                   orientation='h',
                   marker_color=next(colors)),
        row=row, col=column)

    if column == columns:
        column = 1
        row += 1
    else:
        column += 1

  # Stylize graph
  fig.update_layout(
      template="plotly_white",
      showlegend=False,
      title={
            'text': "<b>Topic Word Scores",
            'x': .5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(
                size=22,
                color="Black")
      },
      width=width*4,
      height=height*rows if rows > 1 else height * 1.3,
      hoverlabel=dict(
          bgcolor="white",
          font_size=16,
          font_family="Rockwell"
      ),
  )

  fig.update_xaxes(showgrid=True)
  fig.update_yaxes(showgrid=True)

  return fig


##Topics Function


In [18]:
import numpy as np
import pandas as pd
from typing import List

import plotly.express as px
import plotly.graph_objects as go


def visualize_topics(topic_freq, topic_sizes, get_topic, embeddings,
                     topics: List[int] = None,
                     top_n_topics: int = None,
                     width: int = 650,
                     height: int = 650) -> go.Figure:
    """ Visualize topics, their sizes, and their corresponding words

    This visualization is highly inspired by LDAvis, a great visualization
    technique typically reserved for LDA.

    Arguments:
        topic_model: A fitted BERTopic instance.
        topics: A selection of topics to visualize
        top_n_topics: Only select the top n most frequent topics
        width: The width of the figure.
        height: The height of the figure.

    Usage:

    To visualize the topics simply run:

    ```python
    topic_model.visualize_topics()
    ```

    Or if you want to save the resulting figure:

    ```python
    fig = topic_model.visualize_topics()
    fig.write_html("path/to/file.html")
    ```
    <iframe src="../../getting_started/visualization/viz.html"
    style="width:1000px; height: 680px; border: 0px;""></iframe>
    """
    # Select topics based on top_n and topics args
    freq_df = topic_freq
    #freq_df = freq_df.loc[freq_df.Topic != -1, :]
    if topics is not None:
        topics = list(topics)
    elif top_n_topics is not None:
        topics = sorted(freq_df.Topic.to_list()[:top_n_topics])
    else:
        topics = sorted(freq_df.Topic.to_list())

    # Extract topic words and their frequencies
    topic_list = sorted(topics)
    frequencies = [topic_sizes[topic] for topic in topic_list]
    words = [" | ".join([word[0] for word in get_topic[topic][:5]]) for topic in topic_list]

    # Visualize with plotly
    df = pd.DataFrame({"x": embeddings[:, 0], "y": embeddings[:, 1],
                       "Topic": topic_list, "Words": words, "Size": frequencies})
    return _plotly_topic_visualization(df, topic_list, width, height)


def _plotly_topic_visualization(df: pd.DataFrame,
                                topic_list: List[str],
                                width: int,
                                height: int):
    """ Create plotly-based visualization of topics with a slider for topic selection """

    def get_color(topic_selected):
        if topic_selected == -1:
            marker_color = ["#B0BEC5" for _ in topic_list]
        else:
            marker_color = ["red" if topic == topic_selected else "#B0BEC5" for topic in topic_list]
        return [{'marker.color': [marker_color]}]

    # Prepare figure range
    x_range = (df.x.min() - abs((df.x.min()) * .15), df.x.max() + abs((df.x.max()) * .15))
    y_range = (df.y.min() - abs((df.y.min()) * .15), df.y.max() + abs((df.y.max()) * .15))

    # Plot topics
    fig = px.scatter(df, x="x", y="y", size="Size", size_max=40, template="simple_white", labels={"x": "", "y": ""},
                     hover_data={"Topic": True, "Words": True, "Size": True, "x": False, "y": False})
    fig.update_traces(marker=dict(color="#B0BEC5", line=dict(width=2, color='DarkSlateGrey')))

    # Update hover order
    fig.update_traces(hovertemplate="<br>".join(["<b>Topic %{customdata[0]}</b>",
                                                 "Words: %{customdata[1]}",
                                                 "Size: %{customdata[2]}"]))

    # Create a slider for topic selection
    steps = [dict(label=f"Topic {topic}", method="update", args=get_color(topic)) for topic in topic_list]
    sliders = [dict(active=0, pad={"t": 50}, steps=steps)]

    # Stylize layout
    fig.update_layout(
        title={
            'text': "<b>Intertopic Distance Map",
            'y': .95,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(
                size=22,
                color="Black")
        },
        width=width,
        height=height,
        hoverlabel=dict(
            bgcolor="white",
            font_size=16,
            font_family="Rockwell"
        ),
        xaxis={"visible": False},
        yaxis={"visible": False},
        sliders=sliders
    )

    # Update axes ranges
    fig.update_xaxes(range=x_range)
    fig.update_yaxes(range=y_range)

    # Add grid in a 'plus' shape
    fig.add_shape(type="line",
                  x0=sum(x_range) / 2, y0=y_range[0], x1=sum(x_range) / 2, y1=y_range[1],
                  line=dict(color="#CFD8DC", width=2))
    fig.add_shape(type="line",
                  x0=x_range[0], y0=sum(y_range) / 2, x1=x_range[1], y1=sum(y_range) / 2,
                  line=dict(color="#9E9E9E", width=2))
    fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10)
    fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10)
    fig.data = fig.data[::-1]

    return fig


## Hierachy viz

In [19]:
import numpy as np
from scipy.cluster.hierarchy import linkage
from typing import List
from sklearn.metrics.pairwise import cosine_similarity

import plotly.graph_objects as go
import plotly.figure_factory as ff


def visualize_hierarchy(topic_freq, get_topic, distance_matrix,
                        orientation: str = "left",
                        topics: List[int] = None,
                        top_n_topics: int = None,
                        width: int = 1000,
                        height: int = 600) -> go.Figure:
    """ Visualize a hierarchical structure of the topics

    A ward linkage function is used to perform the
    hierarchical clustering based on the cosine distance
    matrix between topic embeddings.

    Arguments:
        topic_model: A fitted BERTopic instance.
        orientation: The orientation of the figure.
                     Either 'left' or 'bottom'
        topics: A selection of topics to visualize
        top_n_topics: Only select the top n most frequent topics
        width: The width of the figure. Only works if orientation is set to 'left'
        height: The height of the figure. Only works if orientation is set to 'bottom'

    Returns:
        fig: A plotly figure
    """

    # Select topics based on top_n and topics args
    freq_df = topic_freq
    #freq_df = freq_df.loc[freq_df.Topic != -1, :]
    if topics is not None:
        topics = list(topics)
    elif top_n_topics is not None:
        topics = sorted(freq_df.Topic.to_list()[:top_n_topics])
    else:
        topics = sorted(freq_df.Topic.to_list())

    # Select embeddings
    #all_topics = sorted(list(get_topic.keys()))
    #indices = np.array([all_topics.index(topic) for topic in topics])
    #embeddings = embeds[indices]

    # Create dendogram
    distance_matrix = distance_matrix
    fig = ff.create_dendrogram(distance_matrix,
                               orientation=orientation,
                               linkagefun=lambda x: linkage(x, "ward"),
                               color_threshold=1)

    # Create nicer labels
    axis = "yaxis" if orientation == "left" else "xaxis"
    new_labels = [[[str(topics[int(x)]), None]] + get_topic[topics[int(x)]]
                  for x in fig.layout[axis]["ticktext"]]
    new_labels = ["_".join([label[0] for label in labels[:4]]) for labels in new_labels]
    new_labels = [label if len(label) < 30 else label[:27] + "..." for label in new_labels]

    # Stylize layout
    fig.update_layout(
        plot_bgcolor='#ECEFF1',
        template="plotly_white",
        title={
            'text': "<b>Hierarchical Clustering",
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(
                size=22,
                color="Black")
        },
        hoverlabel=dict(
            bgcolor="white",
            font_size=16,
            font_family="Rockwell"
        ),
    )

    # Stylize orientation
    if orientation == "left":
        fig.update_layout(height=200+(15*len(topics)),
                          width=width,
                          yaxis=dict(tickmode="array",
                                     ticktext=new_labels))
        
        # Fix empty space on the bottom of the graph
        y_max = max([trace['y'].max()+5 for trace in fig['data']])
        y_min = min([trace['y'].min()-5 for trace in fig['data']])
        fig.update_layout(yaxis=dict(range=[y_min, y_max]))

    else:
        fig.update_layout(width=200+(15*len(topics)),
                          height=height,
                          xaxis=dict(tickmode="array",
                                     ticktext=new_labels))
    return fig


#Getting necessary Data for Viz of all dir_1 Dics

In [20]:
import pickle

In [21]:
dir_1 = ["General, financial and institutional matters",
         "Customs Union and free movement of goods",
         "Agriculture",
         #"Fisheries", 
         #"Freedom of movement for workers and social policy",
         #"Right of establishment and freedom to provide services",
         "Transport policy",
         "Competition policy", 
         "Taxation",
         #"Economic and monetary policy and free movement of capital",
         "External relations",
         #"Energy",
         "Industrial policy and internal market",
         "Regional policy and coordination of structural instruments",
         "Environment, consumers and health protection",
         #"Science, information, education and culture",
         #"Law relating to undertakings"
         #"Common Foreign and Security Policy",
         #"Area of freedom, security and justice"
         #"People's Europe" 
         ]

In [22]:
#Function to creat viz_data

def get_viz_data(data, dir, directory):
  topics = []
  embeddings = []
  distances = []

  for name in directory:
    temp = name.split(' ')
    nam = temp[0].replace(',', '')
    embeds = nam+'_embeds'
    dist = nam+'_dist'

    print("* * *")
    print(nam, embeds, dist)

    nombre_1 = embeds
    nombre_2 = dist

    nam, embeds, dist = model_to_figure(data, dir, name)
    
    embed = {f'{nombre_1}': embeds}
    distance = {f'{nombre_2}': dist}

    topics.append(nam)
    embeddings.append(embed)
    distances.append(distance)

  return pd.DataFrame(topics), embeddings, distances

In [23]:
topics_dir1_df, embeddings_lst, distances_lst = get_viz_data(data, 1, dir_1)

* * *
General General_embeds General_dist


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2022-06-08 15:02:31,798 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:02:39,492 - BERTopic - Reduced dimensionality
2022-06-08 15:02:39,507 - BERTopic - Clustered reduced embeddings


* * *
Customs Customs_embeds Customs_dist


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

2022-06-08 15:02:48,126 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:02:51,773 - BERTopic - Reduced dimensionality
2022-06-08 15:02:51,804 - BERTopic - Clustered reduced embeddings


* * *
Agriculture Agriculture_embeds Agriculture_dist


Batches:   0%|          | 0/42 [00:00<?, ?it/s]

2022-06-08 15:03:07,613 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:03:13,646 - BERTopic - Reduced dimensionality
2022-06-08 15:03:13,798 - BERTopic - Clustered reduced embeddings


* * *
Transport Transport_embeds Transport_dist


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2022-06-08 15:03:25,547 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:03:28,393 - BERTopic - Reduced dimensionality
2022-06-08 15:03:28,408 - BERTopic - Clustered reduced embeddings


* * *
Competition Competition_embeds Competition_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:03:37,321 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:03:40,009 - BERTopic - Reduced dimensionality
2022-06-08 15:03:40,019 - BERTopic - Clustered reduced embeddings


* * *
Taxation Taxation_embeds Taxation_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:03:45,988 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:03:48,376 - BERTopic - Reduced dimensionality
2022-06-08 15:03:48,384 - BERTopic - Clustered reduced embeddings


* * *
External External_embeds External_dist


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

2022-06-08 15:03:59,541 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:04:02,917 - BERTopic - Reduced dimensionality
2022-06-08 15:04:02,953 - BERTopic - Clustered reduced embeddings


* * *
Industrial Industrial_embeds Industrial_dist


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

2022-06-08 15:04:16,509 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:04:19,839 - BERTopic - Reduced dimensionality
2022-06-08 15:04:19,868 - BERTopic - Clustered reduced embeddings


* * *
Regional Regional_embeds Regional_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:04:29,520 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:04:31,908 - BERTopic - Reduced dimensionality
2022-06-08 15:04:31,917 - BERTopic - Clustered reduced embeddings


* * *
Environment Environment_embeds Environment_dist


Batches:   0%|          | 0/36 [00:00<?, ?it/s]

2022-06-08 15:04:41,559 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:04:46,681 - BERTopic - Reduced dimensionality
2022-06-08 15:04:46,803 - BERTopic - Clustered reduced embeddings


In [131]:
topics_dir1_df = pd.read_pickle('/content/drive/MyDrive/topics_dir1_df.pkl')

In [35]:
topics_dir1_df

Unnamed: 0,Sub_dir_name,get_topic,topic_freq,topic_sizes
0,"General, financial and institutional matters","{-1: [('financial', 0.04624870146657455), ('re...",Topic Count 0 -1 60,{-1: 60}
1,Customs Union and free movement of goods,"{0: [('custom', 0.040475725724029056), ('good'...",Topic Count 0 0 101 1 1 7...,"{0: 101, 1: 78, 2: 42, 3: 35, 4: 34, 5: 22, 6:..."
2,Agriculture,"{-1: [('product', 0.021337793306979914), ('use...",Topic Count 0 -1 238 1 0 ...,"{-1: 238, 0: 95, 1: 89, 2: 62, 3: 51, 4: 51, 5..."
3,Transport policy,"{-1: [('safety', 0.060093690323938055), ('port...",Topic Count 0 0 44 1 1 2...,"{0: 44, 1: 27, -1: 21, 2: 12}"
4,Competition policy,"{-1: [('aid', 0.15022166428861436), ('grant', ...",Topic Count 0 -1 13,{-1: 13}
5,Taxation,"{-1: [('apply', 0.060815607544691466), ('use',...",Topic Count 0 -1 22,{-1: 22}
6,External relations,"{-1: [('gsp', 0.3934576905947057), ('annex', 0...",Topic Count 0 0 258 1 1 4...,"{0: 258, 1: 49, 2: 31, 3: 24, 4: 15, 5: 14, 6:..."
7,Industrial policy and internal market,"{-1: [('list', 0.30949523899557874), ('prodcom...",Topic Count 0 0 123 1 1 9...,"{0: 123, 1: 91, 2: 72, 3: 26, 4: 24, 5: 17, 6:..."
8,Regional policy and coordination of structural...,"{-1: [('programme', 0.07835551446878854), ('fu...",Topic Count 0 -1 26,{-1: 26}
9,"Environment, consumers and health protection","{-1: [('name', 0.05054435930257884), ('registe...",Topic Count 0 0 225 1 -1 ...,"{0: 225, -1: 174, 1: 57, 2: 57, 3: 47, 4: 46, ..."


In [171]:
with open('/content/drive/MyDrive/embeddings_dir1.pkl', 'rb') as handle:
    embeddings_lst = pickle.load(handle)

In [135]:
with open('/content/drive/MyDrive/distances_dir1.pkl', 'rb') as handle:
    distances_lst = pickle.load(handle)

In [24]:
bert_bar(topics_dir1_df.iloc[1].topic_freq, topics_dir1_df.iloc[1].get_topic)

[0, 1, 2, 3, 4, 5]


In [25]:
visualize_topics(topics_dir1_df.iloc[1].topic_freq, topics_dir1_df.iloc[1].topic_sizes, topics_dir1_df.iloc[1].get_topic, embeddings_lst[1]['Customs_embeds'])

In [26]:
visualize_hierarchy(topics_dir1_df.iloc[1].topic_freq, topics_dir1_df.iloc[1].get_topic, distances_lst[1]['Customs_dist'])

In [27]:
topics_dir1_df.to_pickle('/content/drive/MyDrive/topics_dir1_df.pkl')

In [28]:
with open('/content/drive/MyDrive/embeddings_dir1.pkl', 'wb') as handle:
    pickle.dump(embeddings_lst, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [29]:
with open('/content/drive/MyDrive/distances_dir1.pkl', 'wb') as handle:
    pickle.dump(distances_lst, handle, protocol=pickle.HIGHEST_PROTOCOL)

##Getting necessary Data for Viz of all *dir_2* Dics

In [16]:
overview_df = pd.read_csv('/content/drive/MyDrive/over_2800_docs_scraped.csv')

In [None]:
overview_df.head()

In [None]:
overview_df.dir_2.unique()

In [34]:
dir_2 = [#'Sectoral application', 
         #'Inland transport', 
         'Commercial policy',
         'Social policy', 
         'Health protection', 
         #'Environment',
         'Financial and budgetary provisions', 
         #'Economic policy',
         'Shipping', 
         #'Transport infrastructure',
         #'Economic and social cohesion fund', 
         #'Trans-European networks',
       'Development policy',
       #'General principles', 
       #'programmes and statistics',
       'Industrial policy: sectoral operations', 
       #'Education and training',
       'Industrial policy: general, programmes, statistics and research',
       'General', 
       'Provisions governing the institutions',
       #'European citizenship', 
       'Application of the Common Customs Tariff',
       #'Common fisheries policy',
       #'Products subject to market organisation',
       #'Approximation of laws and health measures', 
       #'Basic provisions',
       #'Agricultural structures', 
       'Consumers', 
       'Air transport',
       #'Basic customs instruments',
       #'Principles, objectives and tasks of the Treaties',
       'Agricultural structural funds',
       #'Action in favour of countries in transition',
       'Dissemination of information', 
       #'Free movement of persons',
       'Police and judicial cooperation in criminal and customs matters',
       'Judicial cooperation in civil matters',
       #'Internal market: policy relating to undertakings', 
       #'Culture',
       'General principles and programmes', 
       #'Intellectual property law',
       'External relations', 
       'Internal market: approximation of laws',
       'Statistics', 
       #'Monetary measures',
       'Coordination of structural instruments',
       'Bilateral agreements with non-member countries', 
       'Programmes',
       #'Company law', 
       #'Specific customs rules', 
       'General customs rules',
       #'Indirect taxation', 
       #'Science', 
       #'Free movement of capital',
       #'Prevention of tax evasion and avoidance', 
       #'Monetary policy',
       #'Other sources of energy', 
       #'Multilateral relations',
       #'Nuclear energy', 
       #'Protection of animals',
       #'State aids and other subsidies', 
       #'Competition principles',
       #'Public contracts', 
       #'Principles and conditions', 
       'Electricity',
       #'Restrictive practices',
       #'European Regional Development Fund (ERDF)',
       #'Freedom of movement for workers', 
       #'Oil and gas',
       #'Products not subject to market organisation', 
       #'Economic and commercial law'
       ]

In [35]:
topics_dir2_df, embeddings2_lst, distances2_lst = get_viz_data(data, 2, dir_2)

* * *
Commercial Commercial_embeds Commercial_dist


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

2022-06-08 15:13:01,100 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:13:04,078 - BERTopic - Reduced dimensionality
2022-06-08 15:13:04,114 - BERTopic - Clustered reduced embeddings


* * *
Social Social_embeds Social_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:13:14,918 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:13:17,396 - BERTopic - Reduced dimensionality
2022-06-08 15:13:17,408 - BERTopic - Clustered reduced embeddings


* * *
Health Health_embeds Health_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:13:23,444 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:13:25,821 - BERTopic - Reduced dimensionality
2022-06-08 15:13:25,829 - BERTopic - Clustered reduced embeddings


* * *
Financial Financial_embeds Financial_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:13:32,589 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:13:35,009 - BERTopic - Reduced dimensionality
2022-06-08 15:13:35,018 - BERTopic - Clustered reduced embeddings


* * *
Shipping Shipping_embeds Shipping_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:13:41,137 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:13:43,545 - BERTopic - Reduced dimensionality
2022-06-08 15:13:43,556 - BERTopic - Clustered reduced embeddings


* * *
Development Development_embeds Development_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:13:49,554 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:13:52,375 - BERTopic - Reduced dimensionality
2022-06-08 15:13:52,385 - BERTopic - Clustered reduced embeddings


* * *
Industrial Industrial_embeds Industrial_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:13:58,571 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:14:01,057 - BERTopic - Reduced dimensionality
2022-06-08 15:14:01,066 - BERTopic - Clustered reduced embeddings


* * *
Industrial Industrial_embeds Industrial_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:14:07,337 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:14:09,746 - BERTopic - Reduced dimensionality
2022-06-08 15:14:09,758 - BERTopic - Clustered reduced embeddings


* * *
General General_embeds General_dist


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2022-06-08 15:14:15,936 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:14:18,813 - BERTopic - Reduced dimensionality
2022-06-08 15:14:18,823 - BERTopic - Clustered reduced embeddings


* * *
Provisions Provisions_embeds Provisions_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:14:24,869 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:14:27,316 - BERTopic - Reduced dimensionality
2022-06-08 15:14:27,326 - BERTopic - Clustered reduced embeddings


* * *
Application Application_embeds Application_dist


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

2022-06-08 15:14:34,538 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:14:37,589 - BERTopic - Reduced dimensionality
2022-06-08 15:14:37,617 - BERTopic - Clustered reduced embeddings


* * *
Consumers Consumers_embeds Consumers_dist


Batches:   0%|          | 0/33 [00:00<?, ?it/s]

2022-06-08 15:14:49,157 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:14:53,839 - BERTopic - Reduced dimensionality
2022-06-08 15:14:53,958 - BERTopic - Clustered reduced embeddings


* * *
Air Air_embeds Air_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:15:03,382 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:15:06,278 - BERTopic - Reduced dimensionality
2022-06-08 15:15:06,292 - BERTopic - Clustered reduced embeddings


* * *
Agricultural Agricultural_embeds Agricultural_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:15:12,707 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:15:15,144 - BERTopic - Reduced dimensionality
2022-06-08 15:15:15,158 - BERTopic - Clustered reduced embeddings


* * *
Dissemination Dissemination_embeds Dissemination_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:15:21,170 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:15:23,599 - BERTopic - Reduced dimensionality
2022-06-08 15:15:23,609 - BERTopic - Clustered reduced embeddings


* * *
Police Police_embeds Police_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:15:29,542 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:15:32,416 - BERTopic - Reduced dimensionality
2022-06-08 15:15:32,426 - BERTopic - Clustered reduced embeddings


* * *
Judicial Judicial_embeds Judicial_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:15:38,509 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:15:40,897 - BERTopic - Reduced dimensionality
2022-06-08 15:15:40,906 - BERTopic - Clustered reduced embeddings


* * *
General General_embeds General_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:15:47,181 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:15:50,067 - BERTopic - Reduced dimensionality
2022-06-08 15:15:50,078 - BERTopic - Clustered reduced embeddings


* * *
External External_embeds External_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:15:56,157 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:15:58,564 - BERTopic - Reduced dimensionality
2022-06-08 15:15:58,574 - BERTopic - Clustered reduced embeddings


* * *
Internal Internal_embeds Internal_dist


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

2022-06-08 15:16:06,970 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:16:09,853 - BERTopic - Reduced dimensionality
2022-06-08 15:16:09,881 - BERTopic - Clustered reduced embeddings


* * *
Statistics Statistics_embeds Statistics_dist


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2022-06-08 15:16:19,469 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:16:21,882 - BERTopic - Reduced dimensionality
2022-06-08 15:16:21,891 - BERTopic - Clustered reduced embeddings


* * *
Coordination Coordination_embeds Coordination_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:16:27,992 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:16:30,425 - BERTopic - Reduced dimensionality
2022-06-08 15:16:30,435 - BERTopic - Clustered reduced embeddings


* * *
Bilateral Bilateral_embeds Bilateral_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:16:36,331 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:16:39,199 - BERTopic - Reduced dimensionality
2022-06-08 15:16:39,210 - BERTopic - Clustered reduced embeddings


* * *
Programmes Programmes_embeds Programmes_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:16:45,137 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:16:47,525 - BERTopic - Reduced dimensionality
2022-06-08 15:16:47,535 - BERTopic - Clustered reduced embeddings


* * *
General General_embeds General_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:16:53,640 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:16:56,058 - BERTopic - Reduced dimensionality
2022-06-08 15:16:56,068 - BERTopic - Clustered reduced embeddings


* * *
Electricity Electricity_embeds Electricity_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:17:02,182 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:17:05,009 - BERTopic - Reduced dimensionality
2022-06-08 15:17:05,019 - BERTopic - Clustered reduced embeddings


In [36]:
topics_dir2_df.to_pickle('/content/drive/MyDrive/topics_dir2_df.pkl')

In [37]:
with open('/content/drive/MyDrive/embeddings_dir2.pkl', 'wb') as handle:
    pickle.dump(embeddings2_lst, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [38]:
with open('/content/drive/MyDrive/distances_dir2.pkl', 'wb') as handle:
    pickle.dump(distances2_lst, handle, protocol=pickle.HIGHEST_PROTOCOL)

##Getting necessary Data for Viz of all *dir_3* Dics

In [17]:
overview_df.dir_3.unique()

array(['Service activities', 'Market operation', 'Trade protection',
       'Employment and unemployment', nan, 'Pollution and nuisances',
       'Instruments of economic policy', 'Financial support',
       'European Social Fund (ESF)', 'General',
       'Information technology, telecommunications and data-processing',
       'Research and technological development',
       'General provisions and programmes', 'Economic and monetary union',
       'Programmes and statistics', 'Tariff derogations',
       'Conservation of resources', 'Sheepmeat and goatmeat',
       'Beef and veal', 'Animal health and zootechnics',
       'Common agricultural policy mechanisms', 'Safety at sea',
       'Social and structural measures',
       'Consumer information, education and representation', 'Air safety',
       'Milk products', 'Plant health', 'Animal feedingstuffs',
       'Origin of goods', 'Generalised system of preferences',
       'Trade arrangements', 'Protection of health and safety',
     

In [39]:
dir_3 = [#'Service activities', 
         'Market operation', 
         #'Trade protection',
         #'Employment and unemployment', 
         'Pollution and nuisances',
       'Instruments of economic policy', 
       #'Financial support',
       #'European Social Fund (ESF)', 
       'General',
       'Information technology, telecommunications and data-processing',
       #'Research and technological development',
       #'General provisions and programmes', 
       #'Economic and monetary union',
       #'Programmes and statistics', 
       #'Tariff derogations',
       #'Conservation of resources', 
       #'Sheepmeat and goatmeat',
       #'Beef and veal', 
       #'Animal health and zootechnics',
       #'Common agricultural policy mechanisms', 
       #'Safety at sea',
       'Social and structural measures',
       'Consumer information, education and representation', 
       'Air safety',
       'Milk products', 
       'Plant health', 
       #'Animal feedingstuffs',
       'Origin of goods', 
       'Generalised system of preferences',
       'Trade arrangements', 
       #'Protection of health and safety',
       #'Working conditions', 
       'Structural measures',
       #'European Agricultural Fund for Rural Development',
       'European Agricultural Guarantee Fund',
       #'Immigration and the right of nationals of third countries',
       'Crossing external borders', 
       'Market organisation',
       'Rational utilisation and conservation of energy',
       #'Multilateral relations', 
       'Proprietary medicinal products',
       'Motor vehicles', 
       #'Foodstuffs', 
       'Protection of economic interests',
       'Tariff classification', 
       #'Customs tariffs', 
       #'Cereals', 
       'Wine',
       #'Rice', 
       #'Eggs and poultry', 
       #'Structural harmonisation',
       'General social provisions', 
       'Other commercial policy measures',
       #'European countries', 
       #'Judicial cooperation in criminal matters',
       #'Asylum policy', 
       #'Movement of goods', 
       'Fresh fruit and vegetables',
       'Common customs territory',
       #'Other sectors for approximation of laws', 
       #'Turnover tax/VAT',
       #'Dried fodder', 
       'Oils and fats',
       'Arrangements covering more than one market organisation',
       'Dangerous substances', 
       #'Own resources', 
       #'Research sectors',
       #'Elimination of internal border controls',
       'Space, environment and natural resources', 
       #'Customs cooperation',
       #'Institutional monetary provisions',
       'Institutional economic provisions',
       #'African, Caribbean and Pacific (ACP) Group of States',
       'Agreements with non-member countries',
       #'Indirect instruments of monetary policy',
       #'Approximation of certain social provisions',
       #'General, programmes', 
       #'Other spheres of multilateral cooperation',
       #'Seeds and seedlings', 
       #'Nuclear research', 
       #'Excise duties', 
       #'Hops',
       #'National aid', 
       'Agricultural and forestry tractors', 
       #'Cosmetics',
       #'Accountancy data network', 
       #'Fertilisers',
       #'Multilateral cooperation for protection of the environment, wild fauna and flora and natural resources',
       #'Supervision procedures', 
       #'Financial and economic Aid',
       #'User tariffs', 
       #'Budget', 
       #'Raw tobacco', 
       #'Police cooperation',
       #'Supplies and stocks',
       #'Products processed from fruit and vegetables',
       #'Aid to developing countries', 
       #'Peas and beans',
       #'Other measures relating to nuclear energy',
       #'Other economic and commercial provisions', 
       'Sugar'
       #'Court of Justice'
       ]

In [40]:
topics_dir3_df, embeddings3_lst, distances3_lst = get_viz_data(data, 3, dir_3)

* * *
Market Market_embeds Market_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:17:11,327 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:17:13,752 - BERTopic - Reduced dimensionality
2022-06-08 15:17:13,761 - BERTopic - Clustered reduced embeddings


* * *
Pollution Pollution_embeds Pollution_dist


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2022-06-08 15:17:20,111 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:17:22,597 - BERTopic - Reduced dimensionality
2022-06-08 15:17:22,610 - BERTopic - Clustered reduced embeddings


* * *
Instruments Instruments_embeds Instruments_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:17:29,039 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:17:31,969 - BERTopic - Reduced dimensionality
2022-06-08 15:17:31,977 - BERTopic - Clustered reduced embeddings


* * *
General General_embeds General_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:17:38,087 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:17:40,477 - BERTopic - Reduced dimensionality
2022-06-08 15:17:40,487 - BERTopic - Clustered reduced embeddings


* * *
Information Information_embeds Information_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:17:46,638 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:17:49,080 - BERTopic - Reduced dimensionality
2022-06-08 15:17:49,093 - BERTopic - Clustered reduced embeddings


* * *
Social Social_embeds Social_dist


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2022-06-08 15:17:55,334 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:17:58,217 - BERTopic - Reduced dimensionality
2022-06-08 15:17:58,230 - BERTopic - Clustered reduced embeddings


* * *
Consumer Consumer_embeds Consumer_dist


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

2022-06-08 15:18:06,011 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:18:10,404 - BERTopic - Reduced dimensionality
2022-06-08 15:18:10,511 - BERTopic - Clustered reduced embeddings


* * *
Air Air_embeds Air_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:18:19,589 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:18:22,474 - BERTopic - Reduced dimensionality
2022-06-08 15:18:22,483 - BERTopic - Clustered reduced embeddings


* * *
Milk Milk_embeds Milk_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:18:28,523 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:18:30,959 - BERTopic - Reduced dimensionality
2022-06-08 15:18:30,969 - BERTopic - Clustered reduced embeddings


* * *
Plant Plant_embeds Plant_dist


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

2022-06-08 15:18:38,247 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:18:41,322 - BERTopic - Reduced dimensionality
2022-06-08 15:18:41,358 - BERTopic - Clustered reduced embeddings


* * *
Origin Origin_embeds Origin_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:18:50,252 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:18:52,666 - BERTopic - Reduced dimensionality
2022-06-08 15:18:52,675 - BERTopic - Clustered reduced embeddings


* * *
Generalised Generalised_embeds Generalised_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:18:58,503 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:19:00,909 - BERTopic - Reduced dimensionality
2022-06-08 15:19:00,919 - BERTopic - Clustered reduced embeddings


* * *
Trade Trade_embeds Trade_dist


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2022-06-08 15:19:06,921 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:19:09,858 - BERTopic - Reduced dimensionality
2022-06-08 15:19:09,867 - BERTopic - Clustered reduced embeddings


* * *
Structural Structural_embeds Structural_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:19:15,781 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:19:18,242 - BERTopic - Reduced dimensionality
2022-06-08 15:19:18,252 - BERTopic - Clustered reduced embeddings


* * *
European European_embeds European_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:19:24,339 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:19:26,806 - BERTopic - Reduced dimensionality
2022-06-08 15:19:26,817 - BERTopic - Clustered reduced embeddings


* * *
Crossing Crossing_embeds Crossing_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:19:32,983 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:19:35,395 - BERTopic - Reduced dimensionality
2022-06-08 15:19:35,406 - BERTopic - Clustered reduced embeddings


* * *
Market Market_embeds Market_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:19:41,314 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:19:44,249 - BERTopic - Reduced dimensionality
2022-06-08 15:19:44,260 - BERTopic - Clustered reduced embeddings


* * *
Rational Rational_embeds Rational_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:19:50,274 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:19:52,708 - BERTopic - Reduced dimensionality
2022-06-08 15:19:52,721 - BERTopic - Clustered reduced embeddings


* * *
Proprietary Proprietary_embeds Proprietary_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:19:58,698 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:20:01,135 - BERTopic - Reduced dimensionality
2022-06-08 15:20:01,145 - BERTopic - Clustered reduced embeddings


* * *
Motor Motor_embeds Motor_dist


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2022-06-08 15:20:07,752 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:20:10,721 - BERTopic - Reduced dimensionality
2022-06-08 15:20:10,731 - BERTopic - Clustered reduced embeddings


* * *
Protection Protection_embeds Protection_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:20:17,025 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:20:19,434 - BERTopic - Reduced dimensionality
2022-06-08 15:20:19,444 - BERTopic - Clustered reduced embeddings


* * *
Tariff Tariff_embeds Tariff_dist


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

2022-06-08 15:20:26,427 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:20:29,419 - BERTopic - Reduced dimensionality
2022-06-08 15:20:29,441 - BERTopic - Clustered reduced embeddings


* * *
Wine Wine_embeds Wine_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:20:38,475 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:20:40,903 - BERTopic - Reduced dimensionality
2022-06-08 15:20:40,914 - BERTopic - Clustered reduced embeddings


* * *
General General_embeds General_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:20:46,802 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:20:49,314 - BERTopic - Reduced dimensionality
2022-06-08 15:20:49,324 - BERTopic - Clustered reduced embeddings


* * *
Other Other_embeds Other_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:20:55,208 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:20:58,141 - BERTopic - Reduced dimensionality
2022-06-08 15:20:58,150 - BERTopic - Clustered reduced embeddings


* * *
Fresh Fresh_embeds Fresh_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:21:04,110 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:21:06,592 - BERTopic - Reduced dimensionality
2022-06-08 15:21:06,602 - BERTopic - Clustered reduced embeddings


* * *
Common Common_embeds Common_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:21:12,689 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:21:15,154 - BERTopic - Reduced dimensionality
2022-06-08 15:21:15,163 - BERTopic - Clustered reduced embeddings


* * *
Oils Oils_embeds Oils_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:21:21,061 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:21:24,043 - BERTopic - Reduced dimensionality
2022-06-08 15:21:24,053 - BERTopic - Clustered reduced embeddings


* * *
Arrangements Arrangements_embeds Arrangements_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:21:30,198 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:21:32,633 - BERTopic - Reduced dimensionality
2022-06-08 15:21:32,643 - BERTopic - Clustered reduced embeddings


* * *
Dangerous Dangerous_embeds Dangerous_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:21:38,667 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:21:41,108 - BERTopic - Reduced dimensionality
2022-06-08 15:21:41,117 - BERTopic - Clustered reduced embeddings


* * *
Space Space_embeds Space_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:21:46,970 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:21:49,427 - BERTopic - Reduced dimensionality
2022-06-08 15:21:49,439 - BERTopic - Clustered reduced embeddings


* * *
Institutional Institutional_embeds Institutional_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:21:55,659 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:21:58,622 - BERTopic - Reduced dimensionality
2022-06-08 15:21:58,632 - BERTopic - Clustered reduced embeddings


* * *
Agreements Agreements_embeds Agreements_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:22:04,528 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:22:07,045 - BERTopic - Reduced dimensionality
2022-06-08 15:22:07,054 - BERTopic - Clustered reduced embeddings


* * *
Agricultural Agricultural_embeds Agricultural_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:22:13,173 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:22:15,566 - BERTopic - Reduced dimensionality
2022-06-08 15:22:15,575 - BERTopic - Clustered reduced embeddings


* * *
Sugar Sugar_embeds Sugar_dist


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-06-08 15:22:21,549 - BERTopic - Transformed documents to Embeddings
2022-06-08 15:22:24,480 - BERTopic - Reduced dimensionality
2022-06-08 15:22:24,491 - BERTopic - Clustered reduced embeddings


In [41]:
topics_dir3_df.to_pickle('/content/drive/MyDrive/topics_dir3_df.pkl')

In [42]:
with open('/content/drive/MyDrive/embeddings_dir3.pkl', 'wb') as handle:
    pickle.dump(embeddings3_lst, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [43]:
with open('/content/drive/MyDrive/distances_dir3.pkl', 'wb') as handle:
    pickle.dump(distances3_lst, handle, protocol=pickle.HIGHEST_PROTOCOL)

##Directories that we could not model

In [152]:
dir_1_not = ["Fisheries",
         "Freedom of movement for workers and social policy",
         "Right of establishment and freedom to provide services",
         "Economic and monetary policy and free movement of capital",
         "Energy",
         "Science, information, education and culture",
         "Law relating to undertakings"
         "Common Foreign and Security Policy",
         "Area of freedom, security and justice"
         "People's Europe" 
         ]

In [None]:
dir_2_not = ['Sectoral application', 
         'Inland transport', 
         'Environment',
         'Economic policy',
         'Transport infrastructure',
         'Economic and social cohesion fund', 
         'Trans-European networks',
       'General principles', 
       'programmes and statistics',
       'Education and training',
       'European citizenship', 
       'Common fisheries policy',
       'Products subject to market organisation',
       'Basic provisions',
       'Basic customs instruments',
       'Principles, objectives and tasks of the Treaties',
       'Action in favour of countries in transition',
       'Free movement of persons',
       'Internal market: policy relating to undertakings', 
       'Culture',
       'Intellectual property law',
       'Monetary measures',
       'Company law', 
       'Specific customs rules', 
       'Indirect taxation', 
       'Science', 
       'Free movement of capital',
       'Prevention of tax evasion and avoidance', 
       'Monetary policy',
       'Other sources of energy', 
       'Multilateral relations',
       'Nuclear energy', 
       'Protection of animals',
       'State aids and other subsidies', 
       'Competition principles',
       'Public contracts', 
       'Principles and conditions', 
       'Restrictive practices',
       'European Regional Development Fund (ERDF)',
       'Freedom of movement for workers', 
       'Oil and gas',
       'Products not subject to market organisation', 
       'Economic and commercial law'
       ]

In [None]:
dir_3_not = [#'Service activities', 
         #'Trade protection',
         #'Employment and unemployment', 
       #'Financial support',
       #'European Social Fund (ESF)', 
       #'Research and technological development',
       #'General provisions and programmes', 
       #'Economic and monetary union',
       #'Programmes and statistics', 
       #'Tariff derogations',
       #'Conservation of resources', 
       #'Sheepmeat and goatmeat',
       #'Beef and veal', 
       #'Animal health and zootechnics',
       #'Common agricultural policy mechanisms', 
       #'Safety at sea',
       #'Animal feedingstuffs',
       #'Protection of health and safety',
       #'Working conditions', 
       #'European Agricultural Fund for Rural Development',
       #'Immigration and the right of nationals of third countries',
       #'Multilateral relations', 
       #'Foodstuffs', 
       #'Customs tariffs', 
       #'Cereals', 
       #'Rice', 
       #'Eggs and poultry', 
       #'Structural harmonisation',
       #'European countries', 
       #'Judicial cooperation in criminal matters',
       #'Asylum policy', 
       #'Movement of goods', 
       #'Other sectors for approximation of laws', 
       #'Turnover tax/VAT',
       #'Dried fodder', 
       #'Own resources', 
       #'Research sectors',
       #'Elimination of internal border controls',
       #'Customs cooperation',
       #'Institutional monetary provisions',
       #'African, Caribbean and Pacific (ACP) Group of States',
       #'Indirect instruments of monetary policy',
       #'Approximation of certain social provisions',
       #'General, programmes', 
       #'Other spheres of multilateral cooperation',
       #'Seeds and seedlings', 
       #'Nuclear research', 
       #'Excise duties', 
       #'Hops',
       #'National aid', 
       #'Cosmetics',
       #'Accountancy data network', 
       #'Fertilisers',
       #'Multilateral cooperation for protection of the environment, wild fauna and flora and natural resources',
       #'Supervision procedures', 
       #'Financial and economic Aid',
       #'User tariffs', 
       #'Budget', 
       #'Raw tobacco', 
       #'Police cooperation',
       #'Supplies and stocks',
       #'Products processed from fruit and vegetables',
       #'Aid to developing countries', 
       #'Peas and beans',
       #'Other measures relating to nuclear energy',
       #'Other economic and commercial provisions', 
       #'Court of Justice'
       ]