> ## Package Imports
> Import all necessary packages up front to avoid repeated import statements.

In [4]:
import os
import pandas as pd
import networkx as nx
from pyvis.network import Network

> ## Global Variables and General Purpose Functions

In [3]:
"""
    GLOBAL VARIABLES
"""

# directories the notebook reads from and writes to
INPUT = "../input"
OUTPUT = "../output"

# entries that are skipped when scanning a data directory
EXCLUDES = [
    ".DS_Store",
    "domain_sample.csv",
]

# columns of the main url dataframe
COLUMNS = [
    "Source",
    "Target",
    "NodeWeight",
    "EdgeWeight",
    "Protocol",
    "Subdomain",
    "DomainName",
    "Extension",
    "Path",
    "Metadata",
]

# columns of the per-domain metadata
METACOLUMNS = [
    "Domain",
    "IpAddress",
    "Status",
    "Message",
    "Continent",
    "ContinentCode",
    "Country",
    "Region",
    "RegionName",
    "City",
    "Zip",
    "Lat",
    "Lon",
    "Timezone",
    "Isp",
    "As",
    "AsName",
    "Scrapable",
]

# default lower bound for edge weights (0 keeps every edge)
EDGE_WEIGHT_BOUNDARY = 0


In [15]:
"""GENERAL PURPOSE FUNCTIONS"""

def read_urls(path: str) -> pd.DataFrame:
    """Collect every ``urls.json`` found under ``path`` into one link table.

    Each entry of ``path`` that is not listed in ``EXCLUDES`` is treated as a
    source domain; its ``urls.json`` is read and every row becomes one link
    from that domain (``Source``) to ``DomainName.Extension`` (``Target``).
    Links from a domain to itself are dropped.

    Parameters
    ----------
    path : str
        Directory that holds one folder per domain.

    Returns
    -------
    pd.DataFrame
        Rows sorted by ``Source`` with a fresh 0..n-1 index. The columns
        listed in ``COLUMNS`` come first, followed by any extra columns
        present in the json files.

    Notes
    -----
    As a side effect the result is written to
    ``{OUTPUT}/csv/urls_as_dataframe.csv`` (the directory is created if it
    does not exist yet).
    """

    # per-domain frames are collected here and concatenated once at the end
    frames = []

    # scan the directory we actually read from (sorted for a stable order)
    for domain in sorted(os.listdir(path)):

        if domain in EXCLUDES:
            continue

        try:
            # read url.json file to dataframe
            df_temp = pd.read_json(f"{path}/{domain}/urls.json", dtype="str")
        except (OSError, ValueError):
            # no urls.json in this entry, or it cannot be parsed: skip it
            continue

        # add additional columns
        df_temp["NodeWeight"] = 1
        df_temp["EdgeWeight"] = 1

        # add source column with folder name
        df_temp["Source"] = domain

        # remove port number from extension
        df_temp["Extension"] = df_temp["Extension"].str.split(":", n=1).str[0]

        # merge domain_name and extension column into one and save as target column
        df_temp["Target"] = df_temp["DomainName"] + "." + df_temp["Extension"]

        frames.append(df_temp)

    if frames:
        df = pd.concat(frames, ignore_index=True)

        # keep the documented columns first (missing ones are added empty)
        ordered = COLUMNS + [col for col in df.columns if col not in COLUMNS]
        df = df.reindex(columns=ordered)
    else:
        # nothing could be read: return an empty table with the expected columns
        df = pd.DataFrame(columns=COLUMNS)

    # remove self links
    df = df[df.Source != df.Target]

    # sort value in asc order according to Source (stable, so the row order
    # inside one source is reproducible)
    df = df.sort_values(by=["Source"], kind="mergesort")

    # correct index order
    df = df.reset_index(drop=True)

    # store the dataframe into csv
    os.makedirs(f"{OUTPUT}/csv", exist_ok=True)
    df.to_csv(f"{OUTPUT}/csv/urls_as_dataframe.csv")

    return df


def find_language_count(languages: pd.Series, language: str) -> tuple:
    """Count the entries of ``languages`` that mention ``language``.

    The comparison is a case-insensitive substring match, so a value such as
    ``" 'az'}"`` counts towards ``"az"``. Missing values are ignored because
    ``value_counts`` drops them; other non-string values are compared through
    their string form instead of raising.

    Parameters
    ----------
    languages : pd.Series
        One language value per row.
    language : str
        Language keyword to look for.

    Returns
    -------
    tuple
        ``(language, count)`` with ``language`` exactly as passed in and
        ``count`` as a plain ``int``.
    """

    # lower-case the keyword once so both sides of the match are normalised
    needle = language.lower()

    # initialize counter
    counter = 0

    for key, value in languages.value_counts().items():
        # add up the occurrences of every distinct value containing the keyword
        if needle in str(key).lower():
            counter += int(value)

    return (language, counter)

## Load URL Data

In [10]:

if not os.path.exists(f"{OUTPUT}/csv/urls_as_dataframe.csv"):

    # no cached csv yet: ask where the raw data lives ...
    path_to_data = input("Path to the data: ")

    # ... and build the dataframe from the url files
    df = read_urls(path_to_data)

else:

    # reuse the csv written by an earlier run
    df = pd.read_csv(f"{OUTPUT}/csv/urls_as_dataframe.csv", index_col=0)

# preview the first rows
df.head()

Unnamed: 0,Source,Target,NodeWeight,EdgeWeight,Protocol,Subdomain,DomainName,Extension,Path,Metadata
0,000.az,texnar.az,1,1,https,www,texnar,az,/az/muellifler/eysar-ahmedov,{'Context': '\n\r\t\n2022 yaş\n\r\t\nIT Manage...
1,000.az,texnar.az,1,1,https,www,texnar,az,/az/teqler/İT sərgi,{'Context': '\n\r\t\nqısa məlumat\n\r\t\nqısa ...
2,000.az,texnar.az,1,1,https,www,texnar,az,/az/teqler/TV,{'Context': '\n\r\t\nApple tvOS üçün də cuzi y...
3,000.az,texnar.az,1,1,https,www,texnar,az,/az/teqler/Apple Pencil,{'Context': '\n\r\t\nApple iPadOS 14-ü cüzi ye...
4,000.az,texnar.az,1,1,https,www,texnar,az,/az/teqler/OS,{'Context': '\n\r\t\nYeni Mac Studio haqqında ...


## Analyze Language Data

In [19]:
# keep only the text after the last ":" of each metadata string
languages = df["Metadata"].map(lambda meta: meta.rsplit(":", 1)[-1])

# preview the extracted values
languages.head()

0     'az'}
1     'az'}
2     'az'}
3     'az'}
4     'az'}
Name: Metadata, dtype: object

## VISUALIZATION

In [15]:
# calculate edge weights: number of rows (links) per Source -> Target pair
edge_weights = df.groupby(["Source", "Target"]).EdgeWeight.count()

# calculate node weights as incoming links
node_weights = df.groupby("Target").NodeWeight.sum()

# nodes that are never a Target have no incoming links and are therefore
# missing from node_weights; give each of them a fixed weight of 1
all_nodes = pd.Index(pd.concat([df.Source, df.Target]).dropna().unique())
inactive_nodes = all_nodes.difference(node_weights.index, sort=False)

# the reindexed series must be assigned back, otherwise the extra nodes are lost
node_weights = node_weights.reindex(
    node_weights.index.append(inactive_nodes), fill_value=1
)

# find max of the edge weights
edge_weights_max = edge_weights.max()

# find max of the node weights
node_weights_max = node_weights.max()

## Generate Graph

In [11]:
# start from an empty directed graph; the cell that adds edges and nodes
# fills it in, with each edge pointing from a Source to a Target
graph = nx.DiGraph()

In [31]:
# use the standard deviation of the edge weights as the boundary value
# NOTE(review): the filter below compares against the literal 1, not against
# EDGE_WEIGHT_BOUNDARY, so the boundary only ends up in the output file name
# -- confirm which threshold is intended
EDGE_WEIGHT_BOUNDARY = edge_weights.std()

# nodes that take part in at least one added edge
included_nodes = {}

# one edge per (source, target) pair, weighted relative to the heaviest edge
for (_from, _to), link_count in edge_weights.items():
    if link_count >= 1:
        graph.add_edge(_from, _to, weight=round(link_count / edge_weights_max, 2))

        # remember both endpoints
        included_nodes[_from] = True
        included_nodes[_to] = True


# attach a title (the part of the name before the first dot) to every
# weighted node that is part of an edge
for node in node_weights.index:
    if node in included_nodes:
        graph.add_node(node, title=node.split(".")[0])

## Apply Page Rank

In [13]:
# compute a PageRank score for every node of the graph (alpha is the damping
# parameter passed to networkx)
# NOTE(review): the `{}` printed by the next cell suggests this ran while the
# graph was still empty -- run it after the cell that adds the edges
page_rank = nx.pagerank(graph, alpha=0.9)

In [23]:
# order the PageRank scores from highest to lowest
page_rank_desc_sorted = dict(
    sorted(page_rank.items(), key=lambda item: item[1], reverse=True)
)

# show the ranking
print(page_rank_desc_sorted)

{}


## Visualize with PyVis

In [32]:
# build a full-width, full-height directed PyVis network
nt = Network(height="100%", width="100%", directed=True)

# copy the nodes and edges over from the networkx graph
nt.from_nx(graph)

# write the html file; its name carries the current edge weight boundary
nt.show(f"{OUTPUT}/html/graph_with_boundary_{EDGE_WEIGHT_BOUNDARY}.html")