Install required packages

In [None]:
# Optional one-time setup: uncomment the line below to install the packages this notebook uses.
#!pip install pandas scikit-learn sentence-transformers pyvis networkx xlsxwriter

The sentence-transformers package needs torch, tensorflow or jax available as a backend.
I used torch.

In [None]:
# Optional: uncomment to install torch, the backend used for sentence-transformers in this notebook.
#!pip install torch

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Relative path of the CSV file the issues are read from.
ISSUES_FILE = './all_issues.csv'

# Read the CSV into a DataFrame and preview its first rows.
issue_data = pd.read_csv(filepath_or_buffer=ISSUES_FILE)
issue_data.head()

In [None]:
# Derive two helper columns from the raw CSV fields:
#   name - "<ID> - <WG>": used later as node label, merge key and query key
#   text - "<Title> - <Description>": the string that gets embedded
separator = " - "
issue_data["name"] = issue_data["ID"].add(separator).add(issue_data["WG"])
issue_data["text"] = issue_data["Title"].add(separator).add(issue_data["Description"])
issue_data.head()

In [None]:
# Keep a copy of every row (duplicates included); it is merged with the
# cluster assignment further down.
issue_data_all = issue_data.copy()

# Keep only the first row of each (ID, WG) pair and renumber the index 0..n-1,
# so index labels coincide with row positions from here on.
issue_data = issue_data.drop_duplicates(subset=['ID', 'WG'], ignore_index=True)
issue_data.head()

Use a pretrained language model to compute semantic textual similarity.

In [None]:
from sentence_transformers import SentenceTransformer

# Identifier of the pretrained sentence-embedding model to load.
MODEL_NAME = 'all-MiniLM-L12-v1'
model = SentenceTransformer(MODEL_NAME)

In [None]:
# Encode all issue texts in one call so the model can batch them internally,
# instead of invoking it once per row through `apply`.
text_embeddings = model.encode(issue_data["text"].tolist())

# Store one embedding vector per row. Building the Series on the frame's own
# index keeps every vector aligned with the issue it was computed from.
issue_data["Embedding"] = pd.Series(list(text_embeddings), index=issue_data.index)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all issue embeddings:
# similarity[i, j] compares the issue in row i with the issue in row j.
embedding_rows = list(issue_data["Embedding"])
similarity = cosine_similarity(embedding_rows)

Add edges to the graph.
An edge exists from (i) to (j) iff the issue represented by (j) is the issue most similar to (i).

In [None]:
import networkx as nx

# Directed nearest-neighbour graph with one node per row of `issue_data`.
# Nodes are keyed by the row's index label; edges below are keyed by row
# position. Both coincide because the frame was re-indexed 0..n-1 when the
# duplicates were dropped.
components_graph = nx.DiGraph()

components_graph.add_nodes_from([
    (tup.Index, {
        'title': tup.text,
        'label': tup.name,
        'group': tup.WG,
        'size': 25
    })
    for tup in issue_data.itertuples()
])

# Every issue gets exactly one outgoing edge, to its most similar *other* issue.
# The issue itself is masked out explicitly rather than assumed to sort first:
# when two issues have (near-)identical texts, the other issue can tie with or
# outrank the self-similarity, which would otherwise yield a self-loop and hide
# the real neighbour.
issue_count = len(issue_data.index)
if issue_count > 1:  # a lone issue has no neighbour to link to
    for src in range(issue_count):
        candidates = np.array(similarity[src], dtype=float)  # copy, `similarity` stays intact
        candidates[src] = -np.inf
        dest = int(np.argmax(candidates))
        weight_val = float(similarity[src, dest])
        components_graph.add_edge(src, dest, weight=weight_val, value=weight_val, arrowStrikethrough=False)

# PageRank is computed on an undirected multigraph built from the directed
# edges, weighted by similarity.
rankings = nx.pagerank(nx.MultiGraph(components_graph), weight='weight')

# Rescale the ranks linearly into [15, 25] and store them as 'importance'.
# Guards: an empty graph has no ranks at all, and when every node has the same
# rank the spread is zero, so all nodes get the lower bound instead of dividing
# by zero.
if rankings:
    _min = min(rankings.values())
    _max = max(rankings.values())
    rank_spread = _max - _min

    for node_id, rank in rankings.items():
        relative_rank = (rank - _min) / rank_spread if rank_spread > 0 else 0.0
        components_graph.nodes[node_id]['importance'] = 15 + relative_rank * 10

Visualize the resulting graph.

In [None]:
from pyvis import network as net

# Directed pyvis network configured for display inside the notebook.
nt = net.Network(
    notebook=True,
    height='900px',
    width='100%',
    directed=True,
)

# Parameters for the Barnes-Hut layout.
nt.barnes_hut(
    spring_length=100,
    spring_strength=0.1,
    central_gravity=8,
    overlap=1,
)

# Load the networkx graph into pyvis and write/show it as an HTML page.
nt.from_nx(components_graph)
nt.show('test.html')

Identify weakly connected components (i.e. clusters) in the graph and save them in a new dataframe.

In [None]:
# Weakly connected components, largest first, so component number 1 is the
# biggest cluster.
clusters_by_size = sorted(
    nx.weakly_connected_components(components_graph),
    key=len,
    reverse=True,
)

# One small frame per cluster: the member labels, their importance and the
# 1-based cluster number.
cluster_frames = []
for cluster_number, members in enumerate(clusters_by_size, start=1):
    member_ids = list(members)
    cluster_frames.append(pd.DataFrame({
        'name': [components_graph.nodes[member]['label'] for member in member_ids],
        'importance': [components_graph.nodes[member]['importance'] for member in member_ids],
        'component': cluster_number
    }))

components_df = pd.concat(cluster_frames)
components_df.head()

In [None]:
# Attach cluster number and importance to every original row (duplicates
# included). The join runs on the columns both frames share; of the columns
# created in this notebook that is `name` (assumes the CSV itself has no
# `importance` or `component` column, TODO confirm).
issue_data_final = pd.merge(left=issue_data_all, right=components_df, how="inner")
issue_data_final.head()

Export the components to Excel for easier sharing.

In [None]:
# One worksheet per cluster, in ascending cluster number; within a sheet the
# rows are ordered by descending importance and the helper columns are removed.
with pd.ExcelWriter('identified_clusters.xlsx') as writer:
    for component_number, cluster_rows in issue_data_final.groupby('component'):
        ordered_rows = cluster_rows.sort_values(
            ['component', 'importance'],
            ascending=[True, False],
        )
        component_data = ordered_rows.drop(columns=['name', 'text', 'importance', 'component'])

        component_data.to_excel(writer, index=None, sheet_name=f"Cluster {component_number}")

*Additional:* query the dataset for the issues most similar to any given issue.

In [None]:
queryId = "E1 - technical"
top_K = 5

# Row position of the queried issue. Positions (not index labels) are used
# throughout because `similarity` is addressed by position.
query_positions = np.flatnonzero((issue_data['name'] == queryId).to_numpy())
if query_positions.size == 0:
    raise ValueError(f"No issue named {queryId!r} found in issue_data")
index_row = int(query_positions[0])

# Rank all issues by descending similarity and drop the query itself
# explicitly. It is not guaranteed to sort first when another issue has
# (near-)identical text.
ranked_positions = np.argsort(-similarity[index_row])
similar_indices = ranked_positions[ranked_positions != index_row][:top_K]

print(f"Query: {issue_data.iloc[index_row].text}")
print("-----")
print("Most similar issues:")
for position, row in zip(similar_indices, issue_data.iloc[similar_indices].itertuples()):
    print(f"{row.name}: {row.text}")
    print(f"(Similarity: {similarity[index_row, position]:.4f})")
    print("")