# Semantic Networks

Use networks to study the interactions between categories and terms

In [1]:
from bunkatech.networks import SemanticNetworks
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
import warnings
warnings.filterwarnings('ignore')
 
    
# Location of the IMDb CSV; the file's first column is used as the index.
DATA_PATH = '../data/imdb.csv'
# Maximum number of rows to draw and the seed that makes the draw repeatable.
SAMPLE_SIZE = 2000
SEED = 42

data_raw = pd.read_csv(DATA_PATH, index_col = [0])
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so cap the request at the rows available.
data = data_raw.sample(n = min(SAMPLE_SIZE, len(data_raw)), random_state = SEED)

In [2]:
# Disabled preprocessing step: the code is wrapped in a string literal, so it
# is never executed. If run, it would keep the four listed columns, drop rows
# with missing values, split 'genre' and 'country' on ', ' and explode both
# columns.
# NOTE(review): being the only expression in the cell, the string is echoed
# back as the cell output — restore it as real code or delete the cell once it
# is decided whether this step is needed.
'''
data = data[['imdb', 'description', 'genre', 'country']].copy().dropna()
data['genre'] = data['genre'].apply(lambda x: x.split(', '))
data['country'] = data['country'].apply(lambda x: x.split(', '))
data = data.explode('genre')
data = data.explode('country')

'''

"\ndata = data[['imdb', 'description', 'genre', 'country']].copy().dropna()\ndata['genre'] = data['genre'].apply(lambda x: x.split(', '))\ndata['country'] = data['country'].apply(lambda x: x.split(', '))\ndata = data.explode('genre')\ndata = data.explode('country')\n\n"

In [3]:
# Term-related keyword arguments (the extraction switch, the ``terms_*``
# options and the term sample size).
terms_options = dict(
    extract_terms=True,
    terms_embedding=True,
    sample_size_terms=2000,
    terms_limit=2000,
    terms_ents=False,
    terms_ngrams=(2, 2),
    terms_ncs=False,
    terms_include_pos=["NOUN", "PROPN", "ADJ"],
    terms_include_types=["PERSON", "ORG"],
    terms_embedding_model="all-MiniLM-L6-v2",
    terms_path=None,
    terms_embeddings_path=None,
    terms_multiprocessing=True,
)

# Document-related keyword arguments (the ``docs_*`` options).
docs_options = dict(
    docs_embedding=True,
    docs_embedding_model="all-MiniLM-L6-v2",
    docs_dimension_reduction=5,
    docs_embeddings_path=None,
    docs_multiprocessing=True,
)

# Build the network object from the sampled frame: 'description' is passed as
# the text column and 'imdb' as the index column.
nets = SemanticNetworks(
    data=data,
    text_var="description",
    index_var="imdb",
    language="en",
    **terms_options,
    **docs_options,
)

  0%|                                                                 | 0/1991 [00:00<?, ?it/s]2022-03-30 15:43:21,884 - INFO : loaded 'en_core_web_sm' spaCy language pipeline
2022-03-30 15:43:21,899 - INFO : loaded 'en_core_web_sm' spaCy language pipeline
2022-03-30 15:43:21,911 - INFO : loaded 'en_core_web_sm' spaCy language pipeline
2022-03-30 15:43:21,912 - INFO : loaded 'en_core_web_sm' spaCy language pipeline
2022-03-30 15:43:21,918 - INFO : loaded 'en_core_web_sm' spaCy language pipeline
2022-03-30 15:43:21,923 - INFO : loaded 'en_core_web_sm' spaCy language pipeline
2022-03-30 15:43:21,937 - INFO : loaded 'en_core_web_sm' spaCy language pipeline
2022-03-30 15:43:21,942 - INFO : loaded 'en_core_web_sm' spaCy language pipeline
100%|█████████████████████████████████████████████████████| 1991/1991 [00:08<00:00, 222.79it/s]
2022-03-30 15:43:24,628 - INFO : Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2022-03-30 15:43:30,743 - INFO : Use pytorch device: cpu


Batches:   0%|          | 0/63 [00:00<?, ?it/s]

2022-03-30 15:43:38,842 - INFO : Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Start Embedding...


2022-03-30 15:43:45,762 - INFO : Use pytorch device: cpu
2022-03-30 15:43:45,763 - INFO : CUDA is not available. Start 4 CPU worker
2022-03-30 15:43:45,763 - INFO : Start multi-process pool on devices: cpu, cpu, cpu, cpu


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2022-03-30 15:43:50,940 - INFO : Chunk data into packages of size 50


UMAP(n_components=5, verbose=True)
Wed Mar 30 15:44:05 2022 Construct fuzzy simplicial set
Wed Mar 30 15:44:07 2022 Finding Nearest Neighbors
Wed Mar 30 15:44:08 2022 Finished Nearest Neighbor Search
Wed Mar 30 15:44:10 2022 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

Wed Mar 30 15:44:12 2022 Finished embedding


#### Draw the Networks

In [7]:
# Network and clustering parameters.
network_options = {
    "top_n": 500,
    "variables": ["main form"],
    "global_filter": 0.2,     # minimum cosine for two nodes to be connected
    "n_neighbours": 8,        # minimum number of neighbours a node is directed to
    "method": "node2vec",     # Node2Vec or force_directed algorithm
    "n_cluster": 15,
    "bin_number": 30,
    "black_hole_force": 1,    # extra force that makes the clusters aggregate
}

# Plot appearance parameters.
appearance_options = {
    "color": "community",     # 'community' for clusters or 'entity' for node_type
    "size": "size",
    "symbol": "entity",
    "textfont_size": 9,
    "edge_size": 0.5,
    "height": 2000,
    "width": 2000,
    "template": "plotly_dark",
}

fig = nets.fit_draw(**network_options, **appearance_options)

Computing transition probabilities:   0%|          | 0/398 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1/1 [00:00<00:00, 11.10it/s]
Generating walks (CPU: 2): 100%|██████████| 1/1 [00:00<00:00, 11.19it/s]
Generating walks (CPU: 3): 100%|██████████| 1/1 [00:00<00:00, 11.18it/s]
Generating walks (CPU: 4): 100%|██████████| 1/1 [00:00<00:00, 10.86it/s]
Generating walks (CPU: 5): 100%|██████████| 1/1 [00:00<00:00, 11.11it/s]
Generating walks (CPU: 6): 100%|██████████| 1/1 [00:00<00:00, 11.20it/s]
Generating walks (CPU: 7): 100%|██████████| 1/1 [00:00<00:00, 11.18it/s]
Generating walks (CPU: 8): 100%|██████████| 1/1 [00:00<00:00, 11.23it/s]
Generating walks (CPU: 9): 100%|██████████| 1/1 [00:00<00:00, 10.91it/s]
Generating walks (CPU: 10): 100%|██████████| 1/1 [00:00<00:00, 11.30it/s]
2022-03-30 15:59:22,413 - INFO : collecting all words and their counts
2022-03-30 15:59:22,414 - INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-03-30 15:59:22,445 - INFO : collected 398 word types from a corpus of 318400 raw words and 398

Computing transition probabilities:   0%|          | 0/413 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1/1 [00:00<00:00, 10.34it/s]
Generating walks (CPU: 2): 100%|██████████| 1/1 [00:00<00:00, 10.56it/s]
Generating walks (CPU: 3): 100%|██████████| 1/1 [00:00<00:00, 10.41it/s]
Generating walks (CPU: 4): 100%|██████████| 1/1 [00:00<00:00, 10.61it/s]
Generating walks (CPU: 5): 100%|██████████| 1/1 [00:00<00:00, 10.42it/s]
Generating walks (CPU: 6): 100%|██████████| 1/1 [00:00<00:00, 10.52it/s]
Generating walks (CPU: 7): 100%|██████████| 1/1 [00:00<00:00, 10.15it/s]
Generating walks (CPU: 8): 100%|██████████| 1/1 [00:00<00:00, 10.53it/s]
Generating walks (CPU: 9): 100%|██████████| 1/1 [00:00<00:00, 10.71it/s]
Generating walks (CPU: 10): 100%|██████████| 1/1 [00:00<00:00, 10.85it/s]
2022-03-30 15:59:35,554 - INFO : collecting all words and their counts
2022-03-30 15:59:35,555 - INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-03-30 15:59:35,585 - INFO : collected 413 word types from a corpus of 330400 raw words and 413

#### Save the plot as an HTML file

In [9]:
import plotly.offline

# Target file for the exported figure.
OUTPUT_HTML = 'filename.html'

# Write the figure to OUTPUT_HTML; auto_open=True asks plotly to open the file
# once it is saved. The call is left as the cell's last expression so the
# returned file name is displayed.
plotly.offline.plot(fig, filename=OUTPUT_HTML, auto_open=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'filename.html'