# Semantic Networks

Use semantic networks to study the interactions between categories and terms.

In [1]:
import sys

# Directory added to the import search path so that the `bunkatech` package can
# be imported by the cells below. The path is relative, so it is resolved
# against the directory the kernel was started in.
BUNKATECH_PATH = "../../bunkatech"

# Guarded so that re-running this cell does not keep appending duplicate entries.
if BUNKATECH_PATH not in sys.path:
    sys.path.append(BUNKATECH_PATH)

In [2]:
import warnings

import pandas as pd
from bunkatech.networks import SemanticNetworks

# Blanket filter: every warning raised from here on is suppressed.
warnings.filterwarnings('ignore')

# Read the CSV using its first column as the index, then keep a random sample
# of 2000 rows; random_state pins the sample so re-runs pick the same rows.
data = pd.read_csv('../data/imdb.csv', index_col=[0]).sample(2000, random_state=42)

In [3]:
def explode_categories(frame, columns=('genre', 'country'), sep=', '):
    """Return a long-format copy of ``frame`` with one value per category cell.

    Keeps only the 'imdb' and 'description' columns plus ``columns``, drops every
    row that has a missing value in any of them, splits each cell of ``columns``
    on ``sep`` and explodes the resulting lists, so a row holding
    ``'Drama, Comedy'`` becomes two rows.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain 'imdb', 'description' and every name in ``columns``.
        Non-missing cells of ``columns`` must be strings.
    columns : sequence of str
        Columns holding ``sep``-separated values, exploded in the given order.
    sep : str
        Separator between the values inside one cell.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``frame`` itself is left untouched. Index labels are
        repeated for rows that come from the same original row.
    """
    long_frame = frame[['imdb', 'description', *columns]].copy().dropna()
    # Split every column first, then explode them one after the other.
    for column in columns:
        long_frame[column] = long_frame[column].apply(lambda cell: cell.split(sep))
    for column in columns:
        long_frame = long_frame.explode(column)
    return long_frame


# Optional preprocessing, disabled by default (this step was previously kept
# inside a string literal and never executed). Set the flag to True to replace
# `data` with its exploded version.
# NOTE(review): presumably only needed when 'genre' / 'country' are used as
# network variables -- confirm before enabling.
EXPLODE_CATEGORIES = False

if EXPLODE_CATEGORIES:
    data = explode_categories(data)

In [4]:
# Build the SemanticNetworks object from the sampled frame. Keywords are grouped
# by their prefix; the values are exactly those of the original call.
nets = SemanticNetworks(
    # Input frame and the columns holding the text and the document id.
    data=data,
    text_var='description',
    index_var='imdb',
    language="en",
    # Term extraction.
    extract_terms=True,
    terms_path=None,
    sample_size_terms=2000,
    terms_limit=2000,
    terms_ngrams=(2, 2),
    terms_ents=False,
    terms_ncs=False,
    terms_include_pos=["NOUN", "PROPN", "ADJ"],
    terms_include_types=["PERSON", "ORG"],
    terms_multiprocessing=True,
    # Term embeddings (switched off here).
    terms_embedding=False,
    terms_embedding_model="all-MiniLM-L6-v2",
    terms_embeddings_path=None,
    # Document embeddings (switched off here).
    docs_embedding=False,
    docs_embedding_model="all-MiniLM-L6-v2",
    docs_embeddings_path=None,
    docs_dimension_reduction=5,
    docs_multiprocessing=True,
)

  0%|                                                                                 | 0/1991 [00:00<?, ?it/s]2023-04-27 11:43:23,798 - INFO : loaded 'en_core_web_sm' spaCy language pipeline
2023-04-27 11:43:23,799 - INFO : loaded 'en_core_web_sm' spaCy language pipeline
2023-04-27 11:43:23,804 - INFO : loaded 'en_core_web_sm' spaCy language pipeline
2023-04-27 11:43:23,817 - INFO : loaded 'en_core_web_sm' spaCy language pipeline
2023-04-27 11:43:23,818 - INFO : loaded 'en_core_web_sm' spaCy language pipeline
2023-04-27 11:43:23,819 - INFO : loaded 'en_core_web_sm' spaCy language pipeline
2023-04-27 11:43:23,820 - INFO : loaded 'en_core_web_sm' spaCy language pipeline
2023-04-27 11:43:23,821 - INFO : loaded 'en_core_web_sm' spaCy language pipeline
100%|█████████████████████████████████████████████████████████████████████| 1991/1991 [00:07<00:00, 269.67it/s]


#### Draw the Networks

In [7]:
# Fit the network and draw it. Inline notes restate the original author's
# comments; the figure-appearance grouping is by keyword name only.
fig = nets.fit_draw(
    top_n=500,
    variables=['main form'],
    global_filter=0.2,  # minimum cosine for two nodes to be connected
    n_neighbours=8,  # minimum number of neighbours a node is linked to
    method="node2vec",  # Node2Vec or force_directed algorithm
    n_cluster=15,
    bin_number=30,
    black_hole_force=1,  # extra force that makes the clusters aggregate
    # Figure appearance.
    color="community",  # 'community' for clusters or 'entity' for node_type
    size="size",
    symbol="entity",
    textfont_size=9,
    edge_size=0.5,
    height=2000,
    width=2000,
    template="plotly_white",
)

Computing transition probabilities: 100%|██████████████████████████████████| 404/404 [00:00<00:00, 8212.73it/s]
Generating walks (CPU: 1): 100%|██████████| 1/1 [00:00<00:00, 20.67it/s]
Generating walks (CPU: 2): 100%|██████████| 1/1 [00:00<00:00, 20.64it/s]
Generating walks (CPU: 3): 100%|██████████| 1/1 [00:00<00:00, 20.80it/s]
Generating walks (CPU: 4): 100%|██████████| 1/1 [00:00<00:00, 20.70it/s]
Generating walks (CPU: 5): 100%|██████████| 1/1 [00:00<00:00, 20.91it/s]
Generating walks (CPU: 6): 100%|██████████| 1/1 [00:00<00:00, 20.95it/s]
Generating walks (CPU: 7): 100%|██████████| 1/1 [00:00<00:00, 20.02it/s]
Generating walks (CPU: 8): 100%|██████████| 1/1 [00:00<00:00, 19.81it/s]
Generating walks (CPU: 9): 100%|██████████| 1/1 [00:00<00:00, 20.68it/s]
Generating walks (CPU: 10): 100%|██████████| 1/1 [00:00<00:00, 20.91it/s]


#### Save the plot as an HTML file

In [8]:
import plotly

# Write the figure to an HTML file; auto_open=True also opens it in the browser.
# The call returns the file name, which becomes this cell's output.
html_path = 'filename.html'
plotly.offline.plot(fig, filename=html_path, auto_open=True)

'filename.html'