# NetworkX

In [5]:
import pandas as pd
import numpy as np
import networkx as nx
from networkx.algorithms import node_classification
import matplotlib.pyplot as plt

## Import dataframe from CSV

In [6]:
# Full corpus: the combined dataframe covering the years 2011 through 2021.
df_main = pd.read_csv("csv_data/combined_2021_2011.csv")

# Topic phrases and their frequencies; the first CSV column is used as the row index.
df_topics = pd.read_csv("csv_data/PhraseFrequency.csv", index_col=0)
# Plain Python list holding only the 'phrase' column.
topicList = df_topics.phrase.tolist()

# Smaller 2020-only inputs used for the test run further down.
df_2020_topics = pd.read_csv("csv_data/df_2020_topics.csv")
df_2020 = pd.read_csv("csv_data/df_2020_filtered.csv")

In [29]:
# Preview the 2020 topic table (one row per phrase with its frequency).
df_2020_topics

Unnamed: 0.1,Unnamed: 0,phrase,frequency
0,0,machine learning,181
1,1,computer science,161
2,2,deep learning,56
3,3,covid-19 pandemic,54
4,4,edge computing,50
...,...,...,...
3277,3277,cryptography distributed,3
3278,3278,distributed cryptography,3
3279,3279,uhf rfid,3
3280,3280,rfid tags,3


## Generate nodes of topic phrases

In [30]:
def _row_text(value):
    """Return one abstract/title cell as a plain string ('' when the cell is missing)."""
    if isinstance(value, str):
        return value
    # pandas reads an empty CSV cell as float NaN (NaN != NaN); treat it as "no text"
    # instead of letting ''.join() raise a TypeError on a float.
    if value is None or (isinstance(value, float) and value != value):
        return ''
    # Any other iterable of strings (e.g. an already-parsed token list) is concatenated.
    return ''.join(value)


def node_generator(df, topicList, nodeNumber):
    """Build an undirected co-occurrence graph of topic phrases.

    Each of the first ``nodeNumber`` phrases becomes a node whose ``weight``
    attribute is the phrase's frequency. Two phrases are connected when both
    occur (as substrings) in the same row's abstract + title text; the edge
    ``weight`` is the number of rows in which that happens.

    Args:
        df: the main dataframe; must provide 'abstract' and 'title' columns.
        topicList: table of topics providing 'phrase' and 'frequency' columns
            (a dataframe, not a plain list of phrases).
        nodeNumber: number of nodes to generate, i.e. how many leading rows of
            ``topicList`` are considered.

    Returns:
        networkx.Graph with weighted nodes and weighted edges.
    """
    # Create an undirected graph
    G = nx.Graph()

    # Take the leading rows positionally so a non-default dataframe index
    # (e.g. after filtering) cannot break or mis-pair the lookups.
    phrases = list(topicList['phrase'][:nodeNumber])
    frequencies = list(topicList['frequency'][:nodeNumber])

    # The text columns hold stringified lists, so drop ',', '[', ']' and "'".
    # Each row is cleaned exactly once instead of once per phrase pair.
    # Abstract and title are joined without a separator, as before.
    strip_chars = str.maketrans('', '', ",[]'")
    texts = []
    if len(df) > 0:
        texts = [
            (_row_text(abstract) + _row_text(title)).translate(strip_chars)
            for abstract, title in zip(df['abstract'], df['title'])
        ]

    # For every phrase, remember which rows mention it; a pair's co-occurrence
    # count is then just the size of the intersection of two row sets.
    rows_with_phrase = [
        {row for row, text in enumerate(texts) if phrase in text}
        for phrase in phrases
    ]

    for i, phrase in enumerate(phrases):
        # Create the node together with its frequency
        G.add_node(phrase, weight=frequencies[i])

        for j in range(i + 1, len(phrases)):
            count = len(rows_with_phrase[i] & rows_with_phrase[j])
            # Only connect topics that occur together at least once
            if count > 0:
                G.add_edge(phrase, phrases[j], weight=count)

    return G

## Graph NetworkX

In [55]:
def graph_networkX(G):
    """Draw G with node labels and with each edge annotated by its 'weight'.

    Args:
        G: a networkx graph whose edges all carry a 'weight' attribute.
    """
    # Compute the node coordinates once so nodes and edge labels line up
    layout = nx.spring_layout(G)

    # Nodes, edges and node labels
    nx.draw(G, layout, with_labels=True)

    # Collect the weight of every edge, keyed by the edge itself
    edge_weights = {}
    for edge in G.edges:
        edge_weights[edge] = G.edges[edge]['weight']

    # Overlay the weights on the edges using the same coordinates
    nx.draw_networkx_edge_labels(G, layout, edge_labels=edge_weights)

### Testing with data from 2020

In [50]:
# Build the 2020 co-occurrence graph from the first 10 rows of df_2020_topics
# (nodeNumber=10 limits how many topic phrases become nodes).
G_2020 = node_generator(df_2020, df_2020_topics, 10)
# Uncomment to inspect the weighted edges:
# G_2020.edges(data=True)

In [58]:
# PageRank score per topic node; networkx uses the 'weight' edge attribute by default,
# so the co-occurrence counts influence the ranking.
nx.pagerank(G_2020)

{'machine learning': 0.2547209648644971,
 'computer science': 0.0691283362504466,
 'deep learning': 0.11214974452290022,
 'covid-19 pandemic': 0.05716540183563763,
 'edge computing': 0.06818123547062672,
 'wireless networks': 0.09236014021610728,
 'artificial intelligence': 0.1353541095336169,
 'reinforcement learning': 0.10083061950958351,
 'iot devices': 0.06221373465149535,
 'social media': 0.0478957131450886}

In [None]:
# Set the default figure size (width, height in inches) before drawing.
# rcParams is global, so this size also applies to later figures in the session.
plt.rcParams['figure.figsize'] = [30,25]
graph_networkX(G_2020)

In [51]:
# Same graph construction, but with the first 300 topic phrases as nodes.
# NOTE: the author recorded a runtime of about 1.5 hours for this cell.
G1_2020 = node_generator(df_2020, df_2020_topics, 300)

In [60]:
# List every edge with its attribute dict; 'weight' is the co-occurrence count
# stored by node_generator.
G1_2020.edges(data=True)

EdgeDataView([('machine learning', 'computer science', {'weight': 8}), ('machine learning', 'deep learning', {'weight': 15}), ('machine learning', 'covid-19 pandemic', {'weight': 3}), ('machine learning', 'edge computing', {'weight': 9}), ('machine learning', 'wireless networks', {'weight': 11}), ('machine learning', 'artificial intelligence', {'weight': 18}), ('machine learning', 'reinforcement learning', {'weight': 10}), ('machine learning', 'iot devices', {'weight': 8}), ('machine learning', 'social media', {'weight': 4}), ('machine learning', 'underrepresented groups', {'weight': 8}), ('machine learning', 'public health', {'weight': 3}), ('machine learning', 'cloud computing', {'weight': 8}), ('machine learning', 'learning techniques', {'weight': 19}), ('machine learning', 'internet things', {'weight': 10}), ('machine learning', 'cyber-physical systems', {'weight': 7}), ('machine learning', 'data analytics', {'weight': 8}), ('machine learning', 'edge devices', {'weight': 6}), ('mac

In [59]:
# PageRank score per topic node of the 300-phrase graph; networkx uses the
# 'weight' edge attribute by default.
nx.pagerank(G1_2020)

{'machine learning': 0.023713308843125412,
 'computer science': 0.01286021049747699,
 'deep learning': 0.008321047542602954,
 'covid-19 pandemic': 0.007377188805262914,
 'edge computing': 0.006257023853065386,
 'wireless networks': 0.008180005205607611,
 'artificial intelligence': 0.010769203931452078,
 'reinforcement learning': 0.006633298309800529,
 'iot devices': 0.0076092163679093465,
 'social media': 0.0027527862204767355,
 'underrepresented groups': 0.009758887483411634,
 'public health': 0.005260966805821124,
 'cloud computing': 0.0054683919692828965,
 'learning techniques': 0.007131283761531179,
 'internet things': 0.009355822121425248,
 'cyber-physical systems': 0.005523367971302397,
 'data analytics': 0.00588557360681374,
 'edge devices': 0.004235320741476775,
 'learning algorithms': 0.005896631869414407,
 'big data': 0.004197929245220132,
 'computer information': 0.001323741546711047,
 'wireless network': 0.010389292233718596,
 'science engineering': 0.005223189176749759,
 '

In [None]:
# Use a much larger default figure size (width, height in inches) for the
# 300-phrase graph. rcParams is global, so this persists for later figures.
plt.rcParams['figure.figsize'] = [100,80]
graph_networkX(G1_2020)

### Using the entire dataset from 2011 to 2021

In [None]:
# node_generator indexes its second argument by the 'phrase' and 'frequency'
# columns, so it needs the topics dataframe; the plain phrase list `topicList`
# would raise a TypeError there.
# NOTE(review): assumes PhraseFrequency.csv also has a 'frequency' column — confirm.
G_total = node_generator(df_main, df_topics, 20)

In [None]:
# Draw the full-dataset (2011-2021) graph with its edge weights as labels.
graph_networkX(G_total)