In [None]:
def get_dir_path(s: str) -> str:
    """Return the HTML output path for the chart identified by ``s``.

    Despite the name, the result is a file path, not a directory:
    ``docs/keyword_analysis/all_<s>.html``.

    Args:
        s: Chart identifier used as the file-name suffix.

    Returns:
        Relative path of the HTML file for that chart.
    """
    # An f-string always formats ``s`` as one value; ``"..." % s`` would try to
    # unpack a tuple or dict argument instead of formatting it.
    return f"docs/keyword_analysis/all_{s}.html"

# Figure sizes in pixels, unpacked into `update_layout(**...)` by the plotting cells.
proportions = {"width": 1600, "height": 1000}
# Larger canvas for figures that need more room.
xl_proportions = {"width": 2000, "height": 1400}

In [None]:
from requests import head
import pandas as pd
from pathlib import Path
import re

# Load every keyword-match CSV. sorted() makes the concatenation order reproducible,
# because glob yields paths in an arbitrary order.
files = sorted(Path("metadata/keywords").glob("*.csv"))
frames = []
for file in files:
    try:
        frames.append(pd.read_csv(file))
    except Exception as exc:
        # Best effort: report and skip unreadable files, but (unlike a bare
        # `except:`) let KeyboardInterrupt / SystemExit through and show the cause.
        print(f"unable to read file {file}: {exc!r}")

if not frames:
    # pd.concat([]) would fail with a message that does not mention the input folder.
    raise FileNotFoundError("no readable CSV files found in metadata/keywords")

# ignore_index gives the combined frame a unique 0..n-1 index instead of repeating
# each file's own row numbers.
df = pd.concat(frames, ignore_index=True)

# Keep only the second dot-separated part of `source`.
df['source'] = df['source'].apply(lambda x: x.split('.')[1])
# "<author>/<repo>/<version>" identifies a project; built column-wise rather than row by row.
df['project'] = df['author'].astype(str) + "/" + df['repo'].astype(str) + "/" + df['version'].astype(str)
# Strip the "#:~:text..." suffix so URLs that differ only in that suffix share one base_url.
df['base_url'] = df['url'].apply(lambda x: re.sub(r'#:~:text.+', '', x))
df.head()

In [None]:
# Matches per (quality attribute, project), sorted by project and then count, both descending.
summary_df = (
    df.groupby(["quality_attribute", "project"])
    .size()
    .reset_index(name="count")
    .sort_values(["project", "count"], ascending=False)
)
summary_df.to_csv(
    "metadata/repo_info/summary_by_project_and_quality_attribute.csv",
    index=False,
)
summary_df

In [None]:
# Matches per project, largest first.
summary_df = (
    df.groupby(["project"])
    .size()
    .reset_index(name="count")
    .sort_values(["count"], ascending=False)
)
summary_df.to_csv(
    "metadata/repo_info/summary_by_project.csv",
    index=False,
)
summary_df

In [None]:
# Matches per quality attribute, largest first.
summary_df = (
    df.groupby(["quality_attribute"])
    .size()
    .reset_index(name="count")
    .sort_values(["count"], ascending=False)
)
summary_df.to_csv(
    "metadata/repo_info/summary_by_quality_attribute.csv",
    index=False,
)
summary_df

In [None]:
# Same per-quality-attribute counts as the summary above; the cell displays their total.
summary_df = (
    df.groupby(["quality_attribute"])
    .size()
    .reset_index(name="count")
    .sort_values(["count"], ascending=False)
)
summary_df['count'].sum()

In [None]:
import plotly.express as px

# Keyword matches per quality attribute, source and project
matrix_df = (
    df.groupby(['quality_attribute', 'source', 'project'])
    .size()
    .reset_index(name='count')
)

# Grouped bars (one colour per source) with one animation frame per project
fig = px.bar(
    matrix_df,
    x='quality_attribute',
    y='count',
    color='source',
    barmode='group',
    animation_frame="project",
    labels={'quality_attribute': 'Quality Attribute', 'count': 'Keyword Matches'},
    title='Source vs. Quality Attribute Matrix',
)

fig.update_layout(**proportions)
fig.show()

fig.write_html(get_dir_path("source_vs_quality_attr_matrix"))


In [None]:
# Derive a new frame with the sentence length; `assign` returns a copy, so the shared
# `df` does not silently gain a `sentence_length` column for every later cell.
scatted_df = df.assign(sentence_length=df['sentence'].str.len())

# Per (quality attribute, source, matched word): number of matches and mean sentence length.
scatted_df = (
    scatted_df
    .groupby(['quality_attribute', 'source', 'matched_word'])
    .agg(count=('keyword', 'size'), avg_sentence_length=('sentence_length', 'mean'))
    .reset_index()
)

# Bubble chart: x = mean sentence length, y = matched word, bubble area = match count.
scat = px.scatter(scatted_df, x='avg_sentence_length', y='matched_word', color='source', size='count', title='Scatter Plot of Quality Attributes by Source')
scat.update_traces(marker=dict(sizemode='area', sizeref=1, sizemin=1))
scat.update_layout(xaxis=dict(tickangle=45), legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1.02), **proportions)
scat.show()

In [None]:
# Keyword counts coloured by source, one animation frame per project
fig = px.histogram(
    df,
    y='keyword',
    color='source',
    animation_frame="project",
    title='Histogram of Keyword Matches by Source',
)
fig.update_yaxes(categoryorder='total ascending')  # order the keyword axis by total count
fig.update_layout(width=1600, height=2000)
fig.show()

fig.write_html(get_dir_path("histogram_of_keyword_matches_by_source_animated"))

In [None]:
# Keyword counts over the whole data set, coloured by source
fig = px.histogram(
    df,
    y='keyword',
    color='source',
    title='Histogram of Keyword Matches by Source',
)
fig.update_yaxes(categoryorder='total ascending')  # order the keyword axis by total count
fig.update_layout(width=1600, height=2000)
fig.show()

fig.write_html(get_dir_path("histogram_of_keyword_matches_by_source"))

In [None]:
# Keyword counts coloured by quality attribute. The title now names the dimension that is
# actually used for colour (and in the output file name) instead of "Source".
fig = px.histogram(df, y='keyword', color='quality_attribute', title='Histogram of Keyword Matches by Quality Attribute')
fig.update_yaxes(categoryorder='total ascending')  # order the keyword axis by total count
fig.update_layout(width=1600, height=2000)
fig.show()

fig.write_html(get_dir_path("histogram_of_keyword_matches_by_quality_attribute"))

In [None]:
# Keyword matches per (source, quality attribute)
stacked_df = (
    df.groupby(['source', 'quality_attribute'])
    .size()
    .reset_index(name='count')
)

# Stacked bar chart: one bar per quality attribute, coloured by source
fig = px.bar(
    stacked_df,
    x='quality_attribute',
    y='count',
    color='source',
    labels={'source': 'source', 'count': 'Keyword Matches'},
    title='Stacked Bar Chart of Quality Attributes by Source',
)
fig.update_layout(**proportions)
fig.show()

fig.write_html(get_dir_path("stacked_bar_chart_quality_attr_to_sources"))


In [None]:
# Matches per (quality attribute, project, keyword, matched word)
treemap_df = (
    df.groupby(['quality_attribute', 'project', 'keyword', 'matched_word'])
    .size()
    .reset_index(name='count')
)

# Treemap nested project > quality attribute > keyword > matched word,
# coloured by quality attribute
fig = px.treemap(
    treemap_df,
    path=['project', 'quality_attribute', 'keyword', 'matched_word'],
    values='count',
    color="quality_attribute",
    color_discrete_sequence=px.colors.qualitative.Pastel,
    title='Matched Word Frequency per Quality Attribute Treemap',
)

fig.update_layout(**proportions)
fig.show()

fig.write_html(get_dir_path("treemap_quality_attr_to_matched_words"))


In [None]:
# Matches per (quality attribute, project, keyword, matched word)
treemap_df = (
    df.groupby(['quality_attribute', 'project', 'keyword', 'matched_word'])
    .size()
    .reset_index(name='count')
)

# Treemap nested quality attribute > project > keyword > matched word,
# coloured by project
fig = px.treemap(
    treemap_df,
    path=['quality_attribute', 'project', 'keyword', 'matched_word'],
    values='count',
    color="project",
    color_discrete_sequence=px.colors.qualitative.Pastel,
    title='Matched Word Frequency per Quality Attribute Treemap',
)

fig.update_layout(**proportions)
fig.show()

fig.write_html(get_dir_path("treemap_quality_attr_to_matched_words_by_project"))


In [None]:
# Matches per (quality attribute, source, project, keyword, matched word)
treemap_df = (
    df.groupby(['quality_attribute', 'source', "project", 'keyword', 'matched_word'])
    .size()
    .reset_index(name='count')
)

# Treemap nested source > quality attribute > project > keyword > matched word,
# coloured by quality attribute
fig = px.treemap(
    treemap_df,
    path=['source', 'quality_attribute', 'project', 'keyword', 'matched_word'],
    values='count',
    color='quality_attribute',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    title='Matched Word Frequency per Quality Attribute Treemap',
)

fig.update_layout(**proportions)
fig.show()

fig.write_html(get_dir_path("treemap_quality_attr_to_matched_words_with_source"))


In [None]:
# Matches per (quality attribute, project, source, keyword, matched word, sentence)
treemap_df = (
    df.groupby(['quality_attribute', 'project', 'source', 'keyword', 'matched_word', 'sentence'])
    .size()
    .reset_index(name='count')
)

# Treemap nested quality attribute > project > source > keyword > matched word > sentence,
# coloured by source
fig = px.treemap(
    treemap_df,
    path=['quality_attribute', 'project', 'source', 'keyword', 'matched_word', 'sentence'],
    values='count',
    color='source',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    title='Matched Word Frequency per Quality Attribute Treemap',
)

fig.update_layout(**proportions)
fig.show()

fig.write_html(get_dir_path("treemap_quality_attr_to_matched_words_with_source_and_sentences"))


In [None]:
# Matches per (quality attribute, project); this frame feeds the radar chart.
radar_df = (
    df.groupby(['quality_attribute', 'project'])
    .size()
    .reset_index(name="count")
)
radar_df.head(15)

In [None]:
# Closed radar of match counts per quality attribute, one animation frame per project.
# The title says "by Project" because `radar_df` is grouped by project and has no
# source column.
fig = px.line_polar(radar_df, r="count", theta='quality_attribute', animation_frame="project", line_close=True,
                    title='Quality Attribute Radar Chart by Project', markers=True)

fig.update_traces(fill='toself')
fig.update_layout(**proportions)
fig.show()

fig.write_html(get_dir_path("radar_chart_quality_attr"))

In [None]:
import networkx as nx
import plotly.graph_objects as go

# One edge per distinct (keyword, quality attribute) pair
edges = df[['keyword', 'quality_attribute']].drop_duplicates()
G = nx.from_pandas_edgelist(edges, 'keyword', 'quality_attribute')

# spring_layout starts from random positions; a fixed seed keeps the picture
# identical between runs.
pos = nx.spring_layout(G, seed=42)

# Edge coordinates as one polyline; the None entries break the line between edges.
edge_x, edge_y = [], []
for start, end in G.edges():
    x0, y0 = pos[start]
    x1, y1 = pos[end]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]
edge_trace = go.Scatter(x=edge_x, y=edge_y, line=dict(width=1, color='#888'), hoverinfo='none', mode='lines')

# Colour value 1 marks quality-attribute nodes, 0 marks keyword nodes.
# A set gives constant-time membership tests instead of scanning the column per node.
quality_attributes = set(edges['quality_attribute'])
node_names = list(G.nodes())
node_trace = go.Scatter(
    x=[pos[name][0] for name in node_names],
    y=[pos[name][1] for name in node_names],
    text=node_names,
    mode='markers+text',
    hoverinfo='text',
    marker=dict(
        showscale=True,
        color=[1 if name in quality_attributes else 0 for name in node_names],
        size=10,
        colorbar=dict(thickness=15),
    ),
)

fig = go.Figure(data=[edge_trace, node_trace], layout=go.Layout(title='Keyword and Quality Attribute Network Graph'))
fig.update_layout(**proportions)
fig.show()


In [None]:
# Keyword matches per (quality attribute, source)
hist_df = (
    df.groupby(['quality_attribute', 'source'])
    .size()
    .reset_index(name='count')
)

# Side-by-side bars per source for every quality attribute
fig = px.histogram(
    hist_df,
    x='quality_attribute',
    y='count',
    color='source',
    barmode='group',
    title='Comparative Histogram of Keyword Matches by Source',
)

fig.update_layout(**proportions)
fig.show()

fig.write_html(get_dir_path("comparative_histogram_quality_attr_to_sources"))


In [None]:
# Matches per (quality attribute, source, keyword); preview of the first rows.
sunburst_df = (
    df.groupby(['quality_attribute', 'source', 'keyword'])
    .size()
    .reset_index(name='count')
)
sunburst_df.head()


In [None]:
import plotly.express as px

# Matches per (quality attribute, source, keyword)
sunburst_df = (
    df.groupby(['quality_attribute', 'source', 'keyword'])
    .size()
    .reset_index(name='count')
)

# Sunburst with rings quality attribute > source > keyword
fig = px.sunburst(
    sunburst_df,
    path=['quality_attribute', 'source', 'keyword'],
    values='count',
    title='Parallel Sets Diagram of Quality Attributes, Sources, and Keywords',
)
fig.update_layout(**proportions)
fig.show()
fig.write_html(get_dir_path("parallel_sets_diagram_quality_attr_to_keywords"))


In [None]:
import plotly.express as px

# Matches per (quality attribute, source, matched word)
sunburst_df = (
    df.groupby(['quality_attribute', 'source', 'matched_word'])
    .size()
    .reset_index(name='count')
)

# Sunburst with rings quality attribute > source > matched word
fig = px.sunburst(
    sunburst_df,
    path=['quality_attribute', 'source', 'matched_word'],
    values='count',
    title='Parallel Sets Diagram of Quality Attributes, Sources, and Matched words',
)
fig.update_layout(**proportions)
fig.show()
fig.write_html(get_dir_path("parallel_sets_diagram_quality_attr_to_matched_words"))


In [None]:
import re

# Matches per (keyword, base URL, quality attribute, source, project)
sunburst_df = (
    df.groupby(['keyword', "base_url", 'quality_attribute', 'source', 'project'])
    .size()
    .reset_index(name='count')
)
sunburst_df.sort_values(['keyword', 'source'])

In [None]:
# Number of distinct pages (base_url) per keyword / quality attribute / source / project.
# Computed from `df` instead of regrouping the previous `sunburst_df`, so re-running
# this cell gives the same counts rather than collapsing every count to 1.
sunburst_df = (
    df.groupby(['keyword', 'quality_attribute', 'source', 'project'])['base_url']
    .nunique()
    .reset_index(name='count')
)
sunburst_df.sort_values(['keyword', 'source'])

In [None]:
# Sunburst with rings quality attribute > source > keyword > project.
# The title names the dimensions in `path`.
fig = px.sunburst(sunburst_df, path=['quality_attribute', 'source', 'keyword', 'project'], values='count',
                  title='Parallel Sets Diagram of Quality Attributes, Sources, Keywords, and Projects')
fig.update_layout(**proportions)
fig.show()
# Written under its own name: "parallel_sets_diagram_quality_attr_to_matched_words" is the
# output of the matched-word sunburst, and reusing it here overwrote that chart.
# NOTE(review): update any page that links to the old file expecting this chart.
fig.write_html(get_dir_path("parallel_sets_diagram_quality_attr_to_keywords_by_project"))

In [None]:
import plotly.express as px

# Matches per (quality attribute, source, keyword, matched word)
sunburst_df = (
    df.groupby(['quality_attribute', 'source', 'keyword', 'matched_word'])
    .size()
    .reset_index(name='count')
)

# Sunburst with rings quality attribute > source > keyword > matched word
fig = px.sunburst(
    sunburst_df,
    path=['quality_attribute', 'source', 'keyword', 'matched_word'],
    values='count',
    title='Parallel Sets Diagram of Quality Attributes, Sources, and Matched words',
)
fig.update_layout(**proportions)
fig.show()
fig.write_html(get_dir_path("parallel_sets_diagram_quality_attr_to_keywords_with_matched_words"))


In [None]:
# Matches per (quality attribute, source, keyword)
sankey_df = df.groupby(['quality_attribute', 'source', 'keyword']).size().reset_index(name='count')

# The diagram flows left to right through these columns.
levels = ['quality_attribute', 'source', 'keyword']

# A node is identified by (level, value), not by value alone. Otherwise a keyword spelled
# exactly like a quality attribute or a source would be merged into that node and
# produce a circular link.
node_keys = [(level, value) for level in levels for value in sankey_df[level].unique()]
node_dict = {key: i for i, key in enumerate(node_keys)}  # (level, value) -> node index
nodes = [value for _, value in node_keys]                # node labels, in index order

# One link per distinct pair of adjacent-level nodes, with the counts summed, instead of
# one thin link per (quality attribute, source, keyword) row.
link_frames = []
for left, right in zip(levels, levels[1:]):
    flow = sankey_df.groupby([left, right])['count'].sum().reset_index()
    link_frames.append(pd.DataFrame({
        'source': [node_dict[(left, value)] for value in flow[left]],
        'target': [node_dict[(right, value)] for value in flow[right]],
        'count': flow['count'].tolist(),
    }))
source_target_links = pd.concat(link_frames, ignore_index=True)

# Extract source, target, and count lists for the Sankey plot
sources = source_target_links['source'].tolist()
targets = source_target_links['target'].tolist()
counts = source_target_links['count'].tolist()

# Create Sankey diagram
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=nodes  # One label per node, in index order
    ),
    link=dict(
        source=sources,  # Indices of source nodes
        target=targets,  # Indices of target nodes
        value=counts     # Summed counts as link values
    )
)])

fig.update_layout(title_text="Quality Attributes, Sources, and Keywords Sankey Diagram", font_size=14, **proportions)
fig.show()

fig.write_html(get_dir_path("sankey_quality_attr_to_sources_and_keywords"))

In [None]:
# Matches per (quality attribute, source, matched word)
sankey_df = df.groupby(['quality_attribute', 'source', 'matched_word']).size().reset_index(name='count')

# The diagram flows left to right through these columns.
levels = ['quality_attribute', 'source', 'matched_word']

# A node is identified by (level, value), not by value alone. Otherwise a matched word
# spelled exactly like a quality attribute or a source would be merged into that node
# and produce a circular link.
node_keys = [(level, value) for level in levels for value in sankey_df[level].unique()]
node_dict = {key: i for i, key in enumerate(node_keys)}  # (level, value) -> node index
nodes = [value for _, value in node_keys]                # node labels, in index order

# One link per distinct pair of adjacent-level nodes, with the counts summed, instead of
# one thin link per (quality attribute, source, matched word) row.
link_frames = []
for left, right in zip(levels, levels[1:]):
    flow = sankey_df.groupby([left, right])['count'].sum().reset_index()
    link_frames.append(pd.DataFrame({
        'source': [node_dict[(left, value)] for value in flow[left]],
        'target': [node_dict[(right, value)] for value in flow[right]],
        'count': flow['count'].tolist(),
    }))
source_target_links = pd.concat(link_frames, ignore_index=True)

# Extract source, target, and count lists for the Sankey plot
sources = source_target_links['source'].tolist()
targets = source_target_links['target'].tolist()
counts = source_target_links['count'].tolist()

# Create Sankey diagram
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=nodes  # One label per node, in index order
    ),
    link=dict(
        source=sources,  # Indices of source nodes
        target=targets,  # Indices of target nodes
        value=counts     # Summed counts as link values
    )
)])

fig.update_layout(title_text="Quality Attributes, Sources, and Keywords Sankey Diagram", font_size=14, **xl_proportions)
fig.show()

fig.write_html(get_dir_path("sankey_quality_attr_to_sources_and_matched_words"))

In [None]:
# Keyword matches per (quality attribute, source)
sankey_df = (
    df.groupby(['quality_attribute', 'source'])
    .size()
    .reset_index(name='count')
)

# Node list: every distinct quality attribute followed by every distinct source
nodes = pd.concat([sankey_df['quality_attribute'], sankey_df['source']]).unique()
node_dict = dict(zip(nodes, range(len(nodes))))  # node label -> node index

# Attach the node index of each link end to the grouped frame
sankey_df = sankey_df.assign(
    quality_attr_index=sankey_df['quality_attribute'].map(node_dict),
    source_index=sankey_df['source'].map(node_dict),
)

# Links run from quality attribute to source, weighted by the match count
sources = sankey_df['quality_attr_index'].tolist()
targets = sankey_df['source_index'].tolist()
counts = sankey_df['count'].tolist()

# Build the Sankey trace first, then wrap it in a figure
sankey_trace = go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=nodes,  # node labels, in index order
    ),
    link=dict(
        source=sources,  # index of the quality-attribute node
        target=targets,  # index of the source node
        value=counts,    # match count as link width
    ),
)
fig = go.Figure(data=[sankey_trace])

fig.update_layout(title_text="Quality Attributes to Sources Sankey Diagram", font_size=14, **proportions)
fig.show()

fig.write_html(get_dir_path("sankey_quality_attr_to_sources"))

In [None]:
def _version_key(version):
    """Natural-order sort key: digit runs compare as integers, other text as strings.

    Each token becomes a (kind, value) pair so that an integer is never compared
    directly with a string.
    """
    return tuple(
        (0, int(token)) if token.isdecimal() else (1, token)
        for token in re.split(r'(\d+)', str(version))
        if token
    )


# Aggregate matches by version, source, and quality attribute
trend_df = df.groupby(['version', 'source', 'quality_attribute']).size().reset_index(name='count')

# Plain string sorting puts "1.10" before "1.9"; order the versions naturally instead.
version_order = sorted(trend_df['version'].unique(), key=_version_key)
version_rank = {version: position for position, version in enumerate(version_order)}
trend_df = trend_df.sort_values(by='version', key=lambda col: col.map(version_rank), kind='stable')

# Plot stacked area chart. category_orders pins the x-axis order, which otherwise
# follows the order in which versions are first encountered in the data.
fig = px.area(trend_df, x='version', y='count', color='quality_attribute', line_group='source',
              category_orders={'version': version_order},
              title='Stacked Area Chart of Quality Attribute Trends by Source')
fig.show()

