In [1]:
import cudf
import cugraph
import networkx as nx
import community as community_louvain
import plotly.express as px
import cupy as cp


In [None]:


# Column names applied to the header-less data file, in file order.
# 42 names in total; the final one is "label".
# The grouping below is only for readability and is based on the names
# themselves (NOTE(review): confirm against the dataset's own field listing).
columns = [
    name
    for group in (
        # Per-connection fields.
        (
            "duration", "protocol_type", "service", "flag",
            "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
        ),
        # Login / privilege related counters and flags.
        (
            "hot", "num_failed_logins", "logged_in", "num_compromised",
            "root_shell", "su_attempted", "num_root", "num_file_creations",
            "num_shells", "num_access_files", "num_outbound_cmds",
            "is_host_login", "is_guest_login",
        ),
        # Count / rate columns without the dst_host_ prefix.
        (
            "count", "srv_count", "serror_rate", "srv_serror_rate",
            "rerror_rate", "srv_rerror_rate", "same_srv_rate",
            "diff_srv_rate", "srv_diff_host_rate",
        ),
        # Count / rate columns with the dst_host_ prefix.
        (
            "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate",
            "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
            "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
            "dst_host_srv_serror_rate", "dst_host_rerror_rate",
            "dst_host_srv_rerror_rate",
        ),
        # Target column.
        ("label",),
    )
    for name in group
]
# Load the raw CSV into a cuDF DataFrame. header=None means the first line
# of the file is read as data, and the column names come from `columns`.
df = cudf.read_csv(
    "kddcup.data",
    header=None,
    names=columns,
)

In [3]:
# Sanity-check the load by printing the first three rows.
# Only the last bare expression of a cell is displayed, so a standalone
# `df.columns` placed before this print would be evaluated and discarded;
# the preview below already shows the column names in its header.
print(df.head(3))

   duration protocol_type service flag  src_bytes  dst_bytes  land  \
0         0           tcp    http   SF        215      45076     0   
1         0           tcp    http   SF        162       4528     0   
2         0           tcp    http   SF        236       1228     0   

   wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
0               0       0    0  ...                   0   
1               0       0    0  ...                   1   
2               0       0    0  ...                   2   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                     0.0                     0.0   
1                     1.0                     0.0   
2                     1.0                     0.0   

   dst_host_same_src_port_rate  dst_host_srv_diff_host_rate  \
0                          0.0                          0.0   
1                          1.0                          0.0   
2                          0.5                          0.0   

   dst_host_serr

In [None]:
# Build the edge list from three columns of the loaded frame:
#   src_bytes -> "src", dst_bytes -> "dst", duration -> "weights".
# NOTE(review): this makes byte counts the vertex ids and connection duration
# the edge weight — confirm this is the intended graph model.
edges_df = df[["src_bytes", "dst_bytes", "duration"]].rename(
    columns={"src_bytes": "src", "dst_bytes": "dst", "duration": "weights"}
)

In [None]:
# Undirected cuGraph graph populated from the edge list; the "weights"
# column is attached as the edge attribute.
G = cugraph.Graph(directed=False)
G.from_cudf_edgelist(
    edges_df,
    source="src",
    destination="dst",
    edge_attr="weights",
)

In [None]:
# Graph Analysis
# Compute a PageRank score for every vertex of G using cuGraph's default
# parameters, then print the resulting (vertex, pagerank) table.
# NOTE(review): vertex ids are the raw src_bytes/dst_bytes values used to
# build the edge list, so scores rank byte-count values, not hosts.

pagerank_scores = cugraph.pagerank(G)
print(pagerank_scores)

           vertex  pagerank
0             330  0.003373
1             331  0.003498
2             329  0.005085
3             332  0.004492
4             328  0.003575
...           ...       ...
22472   621568663  0.000257
22473   693375640  0.000015
22474  1167519497  0.000233
22475  1309937401  0.000084
22476  1379963888  0.000243

[22477 rows x 2 columns]




In [8]:
# Run Louvain community detection on G. The result is unpacked into a
# per-vertex table (columns: vertex, partition) and a single modularity score,
# both of which are printed below.
louvain_results, modularity_score = cugraph.louvain(G)
print("Louvain Clustering Results:\n", louvain_results)
print("Modularity Score:", modularity_score)

Louvain Clustering Results:
            vertex  partition
0             330          0
1             331          0
2             329         45
3             332         34
4             328          0
...           ...        ...
22472   621568663         83
22473   693375640         83
22474  1167519497         83
22475  1309937401         83
22476  1379963888         83

[22477 rows x 2 columns]
Modularity Score: 0.9873023210735701


In [None]:
# Scatter plot of the Louvain clusters using a ForceAtlas2 layout computed on
# the GPU.

# Give the Louvain output plot-friendly column names. `nodes_df` is an alias
# of the renamed frame, not a copy.
louvain_results = louvain_results.rename(
    columns={"vertex": "node", "partition": "cluster"}
)
nodes_df = louvain_results

# ForceAtlas2 yields one (vertex, x, y) row per vertex; copy those columns
# into a fresh frame keyed by "node" so it can be joined with the clusters.
positions = cugraph.force_atlas2(G)
positions_df = cudf.DataFrame(
    {
        out_col: cp.asarray(positions[layout_col])
        for out_col, layout_col in (("node", "vertex"), ("x", "x"), ("y", "y"))
    }
)
visualization_df = positions_df.merge(nodes_df, on="node")

# Plotly works on pandas data, so the joined frame is moved to the host first.
fig = px.scatter(
    visualization_df.to_pandas(),
    x="x",
    y="y",
    color="cluster",
    hover_data=["node"],
    labels={"cluster": "Cluster"},
    title="Louvain Clustering Visualization",
)
fig.update_traces(
    marker={"size": 10, "line": {"width": 2, "color": "DarkSlateGrey"}}
)
fig.update_layout(showlegend=True)
fig.show()

In [None]:
# Scatter plot of the Louvain clusters using NetworkX's spring layout (CPU).

# Repeated here so the cell does not depend on another cell having already
# renamed the columns. NOTE(review): rename() on already-renamed columns is
# expected to be a no-op — confirm for the cuDF version in use.
louvain_results = louvain_results.rename(columns={"vertex": "node", "partition": "cluster"})

# Pull the edge list off the GPU as plain Python lists for NetworkX.
edges = G.view_edge_list()
src = edges["src"].to_arrow().to_pylist()
dst = edges["dst"].to_arrow().to_pylist()
weights = edges["weights"].to_arrow().to_pylist()

# add_weighted_edges_from stores the third tuple element under the "weight"
# key, the same attribute a per-edge add_edge(s, d, weight=w) would set.
nx_graph = nx.Graph()
nx_graph.add_weighted_edges_from(zip(src, dst, weights))

# spring_layout starts from random positions; pin the seed so the figure is
# identical on every Restart & Run All.
# NOTE(review): this layout runs on the CPU and may be slow for graphs with
# tens of thousands of vertices.
positions = nx.spring_layout(nx_graph, seed=42)

# positions maps node -> (x, y); flatten it into a frame keyed by "node".
positions_df = cudf.DataFrame({
    "node": list(positions.keys()),
    "x": [pos[0] for pos in positions.values()],
    "y": [pos[1] for pos in positions.values()]
})

visualization_df = positions_df.merge(louvain_results, on="node")

# Visualize clustering using Plotly (expects pandas, hence to_pandas()).
fig = px.scatter(
    visualization_df.to_pandas(),
    x="x",
    y="y",
    color="cluster",
    title="Louvain Clustering Visualization with Spring Layout",
    labels={"cluster": "Cluster"},
    hover_data=["node"]
)
fig.update_traces(marker=dict(size=10, line=dict(width=2, color='DarkSlateGrey')))
fig.update_layout(showlegend=True)
fig.show()