# Semantic Networks

Use networks to study the interactions between categories and terms

In [1]:
import sys

# Put ../../bunkatech (resolved against the current working directory) at the
# end of the module search path; the next cell imports `bunkatech` and
# presumably relies on this entry to find it.
sys.path.extend(["../../bunkatech"])

In [2]:
from bunkatech.networks import SemanticNetworks
import pandas as pd
import warnings

# Silence all warnings raised from here on. The bunkatech import above has
# already executed, so anything it warned about is still shown.
warnings.filterwarnings("ignore")

# Read ../data/imdb.csv using its first column as the index, then keep a
# fixed-seed random subset of 2000 rows under the name `data`.
data = pd.read_csv("../data/imdb.csv", index_col=[0]).sample(
    n=2000,
    random_state=42,
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Disabled preprocessing, kept inside a bare string literal so it never runs.
# If it were enabled it would keep the imdb/description/genre/country columns,
# drop rows with missing values, split the ', '-separated genre and country
# fields into lists, and explode both so each row carries a single genre and
# a single country.
# NOTE(review): the string is the cell's last expression, so the notebook
# echoes it back as the cell output.
'''
data = data[['imdb', 'description', 'genre', 'country']].copy().dropna()
data['genre'] = data['genre'].apply(lambda x: x.split(', '))
data['country'] = data['country'].apply(lambda x: x.split(', '))
data = data.explode('genre')
data = data.explode('country')

'''

"\ndata = data[['imdb', 'description', 'genre', 'country']].copy().dropna()\ndata['genre'] = data['genre'].apply(lambda x: x.split(', '))\ndata['country'] = data['country'].apply(lambda x: x.split(', '))\ndata = data.explode('genre')\ndata = data.explode('country')\n\n"

In [4]:
# Build the SemanticNetworks object from the sampled frame. Every argument is
# passed by keyword, so the grouping below is for readability only.
nets = SemanticNetworks(
    # Input frame; text_var / index_var name the columns to use
    # (presumably the text and the document identifier -- confirm in bunkatech).
    data=data,
    text_var="description",
    index_var="imdb",
    language="en",
    # Term extraction settings.
    extract_terms=True,
    sample_size_terms=2000,
    terms_limit=2000,
    terms_ngrams=(2, 2),
    terms_ents=False,
    terms_ncs=False,
    terms_include_pos=["NOUN", "PROPN", "ADJ"],
    terms_include_types=["PERSON", "ORG"],
    terms_path=None,
    # Term embedding settings.
    terms_embedding=True,
    terms_embedding_model="all-MiniLM-L6-v2",
    terms_embeddings_path=None,
    terms_multiprocessing=True,
    # Document embedding settings.
    docs_embedding=True,
    docs_embedding_model="all-MiniLM-L6-v2",
    docs_embeddings_path=None,
    docs_dimension_reduction=5,
    docs_multiprocessing=True,
)

  0%|                                                                                   | 0/1991 [00:00<?, ?it/s]2022-11-18 17:55:06,719 - INFO : loaded 'en_core_web_sm' spaCy language pipeline
2022-11-18 17:55:06,766 - INFO : loaded 'en_core_web_sm' spaCy language pipeline
2022-11-18 17:55:06,805 - INFO : loaded 'en_core_web_sm' spaCy language pipeline
2022-11-18 17:55:06,806 - INFO : loaded 'en_core_web_sm' spaCy language pipeline
2022-11-18 17:55:06,845 - INFO : loaded 'en_core_web_sm' spaCy language pipeline
2022-11-18 17:55:06,882 - INFO : loaded 'en_core_web_sm' spaCy language pipeline
  0%|                                                                         | 1/1991 [00:05<3:06:49,  5.63s/it]2022-11-18 17:55:06,955 - INFO : loaded 'en_core_web_sm' spaCy language pipeline
2022-11-18 17:55:06,979 - INFO : loaded 'en_core_web_sm' spaCy language pipeline
100%|███████████████████████████████████████████████████████████████████████| 1991/1991 [00:08<00:00, 241.43it/s]
2022-11-18 1

Start Embedding...


2022-11-18 17:55:12,206 - INFO : Use pytorch device: cpu
2022-11-18 17:55:12,207 - INFO : CUDA is not available. Start 4 CPU worker
2022-11-18 17:55:12,207 - INFO : Start multi-process pool on devices: cpu, cpu, cpu, cpu


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


Fri Nov 18 17:55:23 2022 Finding Nearest Neighbors
Fri Nov 18 17:55:24 2022 Finished Nearest Neighbor Search
Fri Nov 18 17:55:25 2022 Construct embedding


Epochs completed: 100%| █████████████████████████████████████████████████████████████████████████ 500/500 [00:01]

Fri Nov 18 17:55:28 2022 Finished embedding





#### Draw the Networks

In [5]:
# Run fit_draw on the network object; the returned figure is kept in `fig`
# so the export cell further down can write it to disk.
fig = nets.fit_draw(
    top_n=500,
    variables=["main form"],
    global_filter=0.2,  # minimum cosine for two nodes to be connected
    n_neighbours=8,  # minimum number of neighbours a node is directed to
    method="node2vec",  # use the Node2Vec or the force_directed algorithm
    n_cluster=15,
    bin_number=30,
    black_hole_force=1,  # extra force that makes the clusters aggregate
    color="community",  # 'community' for clusters or 'entity' for node_type
    size="size",
    symbol="entity",
    textfont_size=9,
    edge_size=0.5,
    height=2000,
    width=2000,
    template="plotly_dark",
)

Computing transition probabilities: 100%|███████████████████████████████████| 404/404 [00:00<00:00, 12899.07it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)





huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Computing transition probabilities: 100%|████████████████████████████████████| 419/419 [00:00<00:00, 3025.04it/s]

Generating walks (CPU: 1):   0%|          | 0/1 [00:00<?, ?it/s]
Generating walks (CPU: 2):   0%|          | 0/1 [00:00<?, ?it/s]
Generating walks (CPU: 1): 100%|██████████| 1/1 [00:00<00:00, 12.41it/s]
Generating walks (CPU: 2): 100%|██████████| 1/1 [00:00<00:00, 12.64it/s]
Generating walks (CPU: 5):   0%|          | 0/1 [00:00<?, ?it/s]
Generating walks (CPU: 3): 100%|██████████| 1/1 [00:00<00:00, 12.50it/s]
Generating walks (CPU: 4): 100%|██████████| 1/1 [00:00<00:00, 12.72it/s]
Generating walks (CPU: 6): 100%|██████████| 1/1 [00:00<00:00, 12.77it/s]
Generating walks (CPU: 7): 100%|██████████| 1/1 [00:00<00:00, 12.80it/s]
Generating walks (CPU: 10): 100%|██████████| 1/1 [00:00<00:00, 12.93it/s]

#### Save the plot as an HTML file

In [6]:
import plotly

# Write the figure to filename.html; auto_open=True additionally asks plotly
# to open the saved file in a web browser. The call's return value is left as
# the cell's last expression, so the notebook displays it.
plotly.offline.plot(
    fig,
    filename="filename.html",
    auto_open=True,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'filename.html'










