In [None]:
from constants.abs_paths import AbsDirPath
from processing_pipeline.utilities.data_transformation import load_all_files

# Input directory for this analysis (presumably the merged pipeline output, going by the constant's name — confirm).
in_dir = AbsDirPath.MERGED
# Load every file under in_dir into `df`; later cells treat it as a pandas DataFrame
# (they call .head(), .groupby() and .pivot_table() on it).
df = load_all_files(in_dir)

In [None]:
# Dimensions of the loaded frame.
df.shape

In [None]:
# Peek at the first three rows.
df.head(3)

# Add columns denoting which stages each row passed

In [None]:
# Cumulative funnel flags: each stage flag is AND-ed with the previous one, so a
# row can only pass a stage if it passed every earlier stage.
# A row passed the prefilter when s0_to_eliminate holds any value at all.
df["prefilter_passed"] = df["s0_to_eliminate"].notna()
df["s0_passed"] = df["prefilter_passed"] & df["s0_to_eliminate"].eq(False)
df["s1_passed"] = df["s0_passed"] & df["s1_true_positive"].eq(True)
df["s2_passed"] = df["s1_passed"] & df["s2_related_to_arch"].eq(True)
# Stage 3 needs a real tactic: neither missing nor the literal string "None".
df["s3_passed"] = df["s2_passed"] & df["s3_tactic"].notna() & df["s3_tactic"].ne("None")

In [None]:
# List the column labels of df.
df.columns

# Add column for tactic category

In [None]:
from cfg.tactics.tactic_to_category_map import tactic_to_category_map

# Look up the category of every tactic via the mapping's .get, so a tactic that
# is absent from the map receives .get's default instead of raising.
df["s3_tactic_category"] = df["s3_tactic"].apply(tactic_to_category_map.get)
df.head()

In [None]:
# Summed nsimilar per tactic category.
df.groupby("s3_tactic_category")["nsimilar"].sum()

# RQ1
## Primary - A quantitative analysis of the prevalence of each quality attribute.

In [None]:
# Denominator for the RQ1 percentages: summed nsimilar over rows that passed stage 1.
r1_total_count = df.loc[df["s1_passed"], "nsimilar"].sum()
r1_total_count

In [None]:
# Summed nsimilar per quality attribute among stage-1 passers, largest first,
# plus each attribute's share of the stage-1 total in percent.
r1 = (
    df.loc[df["s1_passed"]]
    .groupby("qa")
    .agg(nsimilar=("nsimilar", "sum"))
    .sort_values("nsimilar", ascending=False)
)
r1["percentage"] = r1["nsimilar"] / r1_total_count * 100
r1

In [None]:
import plotly.express as px
# Pie chart of the nsimilar totals in r1, one slice per index label (r1 is indexed by qa).
px.pie(r1, names=r1.index, values="nsimilar", title="R1")

## Secondary - Repository-Specific Priorities

In [None]:
# Summed nsimilar per repository (rows) and quality attribute (columns),
# restricted to rows that passed stage 1.
r1_2 = df.loc[df["s1_passed"]].pivot_table(
    values="nsimilar",
    index=["repo_id"],
    columns="qa",
    aggfunc="sum",
)
r1_2

In [None]:
# Rescale every repository's row to percentages of that repository's own total.
r1_2_pc = r1_2.div(r1_2.sum(axis="columns"), axis="index").mul(100)
r1_2_pc

In [None]:
# Bar chart of the per-repository quality-attribute percentages.
px.bar(r1_2_pc, title="R1_2")

## Secondary 2 - Keyword Performance Analysis

In [None]:
# Show the first rows of df again before the keyword analysis.
df.head()

In [None]:
# Per-keyword totals of nsimilar, split by whether the row passed stage 1.
# fill_value=0 makes a keyword that never passed (or never failed) show 0 rather
# than NaN, so its passed_ratio is 0 instead of NaN. The reindex guarantees both
# the False and True columns exist even when one outcome never occurs in the
# data, so r1_3[True] below cannot raise KeyError.
r1_3 = df.pivot_table(
    index="keyword", columns="s1_passed", values="nsimilar", aggfunc="sum", fill_value=0
).reindex(columns=[False, True], fill_value=0)
# total: all nsimilar attributed to the keyword, passed or not.
r1_3["total"] = r1_3.sum(axis=1)
# nkw: grand total over all keywords, the denominator for popularity_ratio.
nkw = r1_3.total.sum()
# passed_ratio: fraction of the keyword's nsimilar that passed stage 1.
r1_3["passed_ratio"] = r1_3[True] / r1_3.total
# popularity_ratio: the keyword's share of the grand total.
r1_3["popularity_ratio"] = r1_3.total / nkw
r1_3.sort_values("total", ascending=False)

In [None]:
# One point per keyword: x is the keyword's share of all nsimilar, y is the
# fraction of that keyword's nsimilar that passed stage 1.
px.scatter(r1_3, x="popularity_ratio", y="passed_ratio", hover_name=r1_3.index, hover_data=["passed_ratio", "popularity_ratio", "total"])
# Alternative kept for reference: the same plot, additionally colored by "total".
# px.scatter(r1_3, x="popularity_ratio", y="passed_ratio", hover_name=r1_3.index, hover_data=["passed_ratio", "popularity_ratio", "total"], color="total", color_continuous_scale=px.colors.sequential.Burgyl_r)

# RQ2
## Primary - A strategic analysis of the types of solutions (tactic categories) used for the most important quality attributes. This gives a high-level overview of the solution strategies.

In [None]:
# RQ2 only considers rows that made it through stage 3.
data_rq2 = df.loc[df["s3_passed"]]
data_rq2.shape

In [None]:
# Names of the four quality attributes with the largest summed nsimilar among
# stage-1 passers, in descending order.
top_qas = (
    df.loc[df["s1_passed"]]
    .groupby("qa")["nsimilar"]
    .sum()
    .sort_values(ascending=False)
    .head(4)
    .index.tolist()
)
top_qas

In [None]:
# For each top quality attribute, the three tactic categories with the largest
# summed nsimilar (sorted descending by qa, then by nsimilar).
r2 = data_rq2[data_rq2["qa"].isin(top_qas)]
(
    r2.groupby(["qa", "s3_tactic_category"])["nsimilar"]
    .sum()
    .reset_index()
    .sort_values(["qa", "nsimilar"], ascending=[False, False])
    .groupby("qa")
    .head(3)
)

## Secondary - A detailed "drill-down" to see which specific tactics make up the most important categories identified in the primary analysis.

### TODO

In [None]:
# Drill-down: for each top quality attribute, the three individual tactics with
# the largest summed nsimilar (sorted descending by qa, then by nsimilar).
r2 = data_rq2[data_rq2["qa"].isin(top_qas)]
(
    r2.groupby(["qa", "s3_tactic"])["nsimilar"]
    .sum()
    .reset_index()
    .sort_values(["qa", "nsimilar"], ascending=[False, False])
    .groupby("qa")
    .head(3)
)

# RQ3
## Primary - A comparative analysis of discussion topics across different communication channels (source).

In [None]:
# RQ3 is computed on the rows that passed stage 3.
data_rq3 = df.loc[df["s3_passed"]]
data_rq3.shape

In [None]:
# Summed nsimilar per quality attribute (rows) and source (columns); missing
# combinations count as 0. Each row is then rescaled to percentages of its own total.
r3 = data_rq3.pivot_table(
    values="nsimilar",
    index="qa",
    columns="source",
    aggfunc="sum",
    fill_value=0,
)
totals = r3.sum(axis="columns")
r3 = r3.div(totals, axis="index").mul(100)
r3


In [None]:
# Same breakdown as for quality attributes, but per tactic category: summed
# nsimilar by source, with each row rescaled to percentages of its own total.
r3_2 = data_rq3.pivot_table(
    values="nsimilar",
    index="s3_tactic_category",
    columns="source",
    aggfunc="sum",
    fill_value=0,
)
totals = r3_2.sum(axis="columns")
r3_2 = r3_2.div(totals, axis="index").mul(100)
r3_2

# Other

In [None]:
# Distinct values of the source column.
df.source.unique()

In [None]:
# Distinct values of the qa column.
df.qa.unique()

In [None]:
# Distinct values of the repo_id column.
df.repo_id.unique()

In [None]:
# Column labels of df at this point in the notebook.
df.columns

# Data funnel

In [None]:
# Summed nsimilar for every observed combination of the five stage flags.
funnel = df.groupby(
    ["prefilter_passed", "s0_passed", "s1_passed", "s2_passed", "s3_passed"]
)["nsimilar"].sum()
funnel

In [None]:
import pandas as pd

# Funnel stages in display order, keyed by flag column with the plot label as value.
stages = dict(
    prefilter_passed='1. Prefilter',
    s0_passed='2. Stage 0',
    s1_passed='3. Stage 1',
    s2_passed='4. Stage 2',
    s3_passed='5. Stage 3',
)

# One summary frame per stage: keep the rows whose flag is True, sum nsimilar per
# source, and tag the result with the stage label so the stages can be told
# apart after stacking.
funnel_data_list = [
    df[df[stage_col].eq(True)]
    .groupby('source')['nsimilar']
    .sum()
    .reset_index()
    .assign(Stage=stage_name)
    for stage_col, stage_name in stages.items()
]

# Stack the per-stage summaries into a single long-format frame for plotting.
funnel_plot_df = pd.concat(funnel_data_list, ignore_index=True)

print("Reshaped data for plotting:")
print(funnel_plot_df)

In [None]:
# Funnel chart: one bar per stage sized by summed nsimilar, split by source.
px.funnel(funnel_plot_df,
                x='nsimilar',
                y='Stage',
                color='source',
                title='Funnel Analysis by Source')

In [None]:
# Express every stage-flag combination as a percentage of the overall nsimilar total.
total_items = df["nsimilar"].sum()
funnel_df = funnel.reset_index().assign(
    percentage_of_total=lambda frame: frame["nsimilar"] / total_items * 100
)
funnel_df

## Crosstab

In [None]:
import pandas as pd

# Frequency table of tactic per quality attribute over the stage-3 passers.
# NOTE(review): crosstab counts rows here, whereas the rest of the notebook sums
# nsimilar — confirm the unweighted count is intended.
df_tactics = df.loc[df["s3_passed"].eq(True)]
tactic_crosstab = pd.crosstab(index=df_tactics["qa"], columns=df_tactics["s3_tactic"])
tactic_crosstab

In [None]:
import pandas as pd

# Frequency table of quality attribute per source over the stage-1 passers, with
# an "All" margin row and column.
# NOTE(review): this counts rows rather than summing nsimilar, and it reuses the
# df_tactics / tactic_crosstab names from the previous cell for a different table.
df_tactics = df.loc[df["s1_passed"]]
tactic_crosstab = pd.crosstab(
    index=df_tactics["source"],
    columns=df_tactics["qa"],
    margins=True,
)
tactic_crosstab

In [None]:
# Divide every column by its value in the "All" row, giving column-wise percentages.
tactic_crosstab_totals = tactic_crosstab.loc["All"]
tactic_crosstab.div(tactic_crosstab_totals, axis="columns").mul(100)

In [None]:
# Summed nsimilar per (source, qa), broken down by the raw outcomes of stages 0-2.
df.pivot_table(
    values="nsimilar",
    index=["source", "qa"],
    columns=["s0_to_eliminate", "s1_true_positive", "s2_related_to_arch"],
    aggfunc="sum",
)

In [None]:
# Summed nsimilar per (source, qa), split by the stage-0 outcome, with margin totals.
df.pivot_table(
    values="nsimilar",
    index=["source", "qa"],
    columns=["s0_to_eliminate"],
    aggfunc=["sum"],
    margins=True,
)

In [None]:
# Summed nsimilar for every combination of raw stage outcomes; dropna=False
# keeps the combinations in which a stage value is missing.
df.groupby(
    ["s0_to_eliminate", "s1_true_positive", "s2_related_to_arch", "s3_tactic"],
    dropna=False,
)["nsimilar"].sum()

In [None]:
# Same stage-outcome breakdown, additionally split by source; dropna=False keeps
# the combinations in which a stage value is missing.
df.groupby(
    ["source", "s0_to_eliminate", "s1_true_positive", "s2_related_to_arch", "s3_tactic"],
    dropna=False,
)["nsimilar"].sum()

In [None]:
# Stage-outcome breakdown split by both source and quality attribute;
# dropna=False keeps the combinations in which a stage value is missing.
df.groupby(
    ["source", "qa", "s0_to_eliminate", "s1_true_positive", "s2_related_to_arch", "s3_tactic"],
    dropna=False,
)["nsimilar"].sum()

In [None]:
# Per-source stage-outcome breakdown with the default NaN handling, i.e. without
# the dropna=False used in the earlier per-source cell, so groups with a missing
# key are left out.
df.groupby(
    ["source", "s0_to_eliminate", "s1_true_positive", "s2_related_to_arch", "s3_tactic"]
)["nsimilar"].sum()

In [None]:
# All rows for which a stage-3 tactic value is present.
df.loc[df["s3_tactic"].notna()]