In [1]:
from __future__ import annotations

import warnings
from typing import Any, Literal, Optional

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from scipy import stats
from statsmodels.regression.mixed_linear_model import MixedLMResults

import helpers.project_config as cfg
from visualization.scripts.plot_utils import filter_outliers

##### Data Validation

In [2]:
def validate_data(dataframe: pd.DataFrame, column: str) -> bool:
    """
    Validate that `dataframe[column]` contains only finite numeric values.

    Checks, in order:
      - the column exists
      - the column has a numeric dtype (as judged by
        `pd.api.types.is_numeric_dtype`)
      - no NaN / missing values (including `pd.NA` in nullable dtypes)
      - no +/- inf

    Args:
        dataframe: Frame holding the column to check.
        column: Name of the column to validate.

    Returns:
        True if all checks pass.

    Raises:
        ValueError: If any check fails. The diagnostic (row counts and up to
            five example index labels per problem) is carried in the
            exception message; nothing is printed.
    """
    if column not in dataframe.columns:
        raise ValueError(
            f"Column {column!r} not found. Available columns: {list(dataframe.columns)[:20]}"
        )

    col = dataframe[column]

    if not pd.api.types.is_numeric_dtype(col):
        raise ValueError(
            f"Column {column!r} must be numeric (int/float), got dtype={col.dtype!r}.\n"
            "Hint: convert it first, e.g. "
            f"dataframe[{column!r}] = pd.to_numeric(dataframe[{column!r}], errors='raise')"
        )

    # Use a numeric representation that plays well with pandas nullable dtypes.
    num = pd.to_numeric(col, errors="coerce")

    nan_mask = num.isna().to_numpy(dtype=bool)

    # Ask for a plain floating ndarray with missing entries written as NaN.
    # A bare `to_numpy()` on a nullable column containing pd.NA can return an
    # object array (pandas-version dependent), on which np.isinf raises
    # TypeError instead of letting us report a clean ValueError below.
    target_dtype = "complex128" if num.dtype.kind == "c" else "float64"
    inf_mask = np.isinf(num.to_numpy(dtype=target_dtype, na_value=np.nan))

    n_total = len(num)
    n_nan = int(nan_mask.sum())
    n_inf = int(inf_mask.sum())

    if n_nan or n_inf:
        examples = []
        if n_nan:
            idx = dataframe.index[nan_mask][:5].tolist()
            examples.append(f"NaN/missing: {n_nan} rows (example indices: {idx})")
        if n_inf:
            idx = dataframe.index[inf_mask][:5].tolist()
            examples.append(f"Inf: {n_inf} rows (example indices: {idx})")

        msg = (
            f"Validation failed for column {column!r}.\n"
            f"Rows checked: {n_total}\n"
            + "\n".join(f"- {line}" for line in examples)
            + "\nHints:\n"
              "- Check your preprocessing steps that drop/replace NaN/inf.\n"
        )
        raise ValueError(msg)

    return True


In [3]:
def run_validate_data_smoke_tests() -> None:
    """
    Exercise `validate_data` on a small table of passing and failing frames.

    Each case is a `(frame, failure_message)` pair. A message of None means the
    frame must validate; otherwise `validate_data` must raise ValueError and
    the message is used for the assertion if it does not.
    """
    cases = [
        # Pass: clean numeric column (zeros allowed)
        (pd.DataFrame({"metric": [0, 1, 2.5, 3]}), None),
        # Fail: missing column
        (pd.DataFrame({"x": [1, 2, 3]}), "Expected ValueError for missing column"),
        # Fail: non-numeric dtype
        (
            pd.DataFrame({"metric": ["1", "2", "3"]}),
            "Expected ValueError for non-numeric dtype",
        ),
        # Fail: NaN
        (
            pd.DataFrame({"metric": [1.0, np.nan, 2.0]}),
            "Expected ValueError for NaN values",
        ),
        # Fail: +inf
        (
            pd.DataFrame({"metric": [1.0, np.inf, 2.0]}),
            "Expected ValueError for +inf values",
        ),
        # Fail: -inf
        (
            pd.DataFrame({"metric": [1.0, -np.inf, 2.0]}),
            "Expected ValueError for -inf values",
        ),
        # Pass: pandas nullable int dtype (zeros allowed)
        (pd.DataFrame({"metric": pd.Series([0, 1, 2, 3], dtype="Int64")}), None),
        # Fail: nullable int with NA
        (
            pd.DataFrame({"metric": pd.Series([1, None, 3], dtype="Int64")}),
            "Expected ValueError for NA in nullable int column",
        ),
    ]

    for frame, failure_message in cases:
        if failure_message is None:
            assert validate_data(frame, "metric") is True
            continue

        try:
            validate_data(frame, "metric")
        except ValueError:
            continue
        # Reaching this line means no ValueError was raised for a bad frame.
        assert False, failure_message

    print("✅ validate_data smoke tests passed")


run_validate_data_smoke_tests()


✅ validate_data smoke tests passed


##### Transformation

In [4]:
def log1p(column: "pd.Series | pd.DataFrame") -> "pd.Series | pd.DataFrame":
    """
    Return `log(1 + x)` element-wise for a pandas Series or DataFrame.

    The input is cast to float first, then passed to `np.log1p`; the result
    keeps the input's pandas type and index. Values below -1 are not rejected
    here, so callers must guard against them.

    Args:
        column: Numeric pandas object to transform.

    Returns:
        The transformed values as floats, same container type as the input.
    """
    return np.log1p(column.astype(float))

##### Tests

In [None]:
# Size of the test family used for the multiple-comparison correction.
# NOTE(review): presumably the metric tests run in the Pipeline section;
# update this if the metric lists in the project config change.
NR_OF_TESTS = 12

# Bonferroni-corrected per-test significance threshold:
# family-wise alpha 0.05 split evenly over NR_OF_TESTS (0.05 / 12 ≈ 0.00417).
BONFERRONI_ALPHA = 0.05 / NR_OF_TESTS

In [33]:
def run_mannwhitneyu(
    df: pd.DataFrame,
    metric: str,
    *,
    term_col: str = "term",
    without_terms: tuple[int, ...] = (1, 2),
    with_terms: tuple[int, ...] = (3, 4),
    alternative: Literal["two-sided", "less", "greater"] = "two-sided",
    apply_outlier_filter: bool = True,
) -> stats.MannwhitneyuResult:
    """
    Compare `metric` between two groups of terms with a Mann–Whitney U test.

    Rows whose `term_col` value is in `without_terms` form the "without GenAI"
    sample; rows whose value is in `with_terms` form the "with GenAI" sample.
    A one-sentence summary is printed, with significance judged against the
    module-level `BONFERRONI_ALPHA`.

    Args:
        df: Input DataFrame (a copy is made before any filtering).
        metric: Numeric column to test; must pass `validate_data`.
        term_col: Column holding term labels (default "term").
        without_terms: Terms considered "without GenAI".
        with_terms: Terms considered "with GenAI".
        alternative: "two-sided", "less", or "greater" (passed to scipy).
        apply_outlier_filter: If True, rows are filtered with
            `filter_outliers` before the groups are formed.

    Returns:
        The scipy result object returned by `stats.mannwhitneyu`.

    Raises:
        ValueError: If a required column is missing, validation fails, or
            either group has fewer than two observations after filtering.
    """
    for required in (term_col, metric):
        if required not in df.columns:
            raise ValueError(f"Column {required!r} not found in the DataFrame.")

    validate_data(df, metric)

    # Always operate on a copy, in case filter_outliers touches its input.
    frame = df.copy()
    if apply_outlier_filter:
        frame = filter_outliers(frame, metric)

    terms = frame[term_col]
    baseline = frame.loc[terms.isin(without_terms), metric].dropna()
    treated = frame.loc[terms.isin(with_terms), metric].dropna()

    n_baseline, n_treated = baseline.size, treated.size
    if min(n_baseline, n_treated) < 2:
        raise ValueError(
            f"Not enough data after filtering to run Mann–Whitney U for {metric!r}. "
            f"n_without={n_baseline}, n_with={n_treated}"
        )

    result = stats.mannwhitneyu(baseline, treated, alternative=alternative)

    # The article doubles as the verdict: "showed a ..." vs "showed no ...".
    if result.pvalue < BONFERRONI_ALPHA:
        verdict = "a"
    else:
        verdict = "no"

    print(
        f"A {alternative} Mann–Whitney U test comparing the without-GenAI (n={n_baseline}) "
        f"and with-GenAI (n={n_treated}) groups showed {verdict} statistically significant "
        f"difference in {metric} (U={result.statistic:.6g}, p={result.pvalue:.6g})."
    )
    return result


In [34]:
df = pd.read_csv(cfg.project_level_dataset_path_normalized)

run_mannwhitneyu(df=df, metric='lines_of_code_tslike', apply_outlier_filter=False)

A two-sided Mann–Whitney U test comparing the without-GenAI (n=300) and with-GenAI (n=381) groups showed a statistically significant difference in lines_of_code_tslike (U=28124.5, p=4.79774e-30).


MannwhitneyuResult(statistic=np.float64(28124.5), pvalue=np.float64(4.797737853617924e-30))

In [49]:
def run_mixedlm_genai(
    df: pd.DataFrame,
    metric: str,
    *,
    repo_col: str = "project",
    genai_col: str = "genai_period",
    apply_outlier_filter: bool = True,
    log_transform: bool = True,
    fit_method: str = "lbfgs",
    reml: bool = False,
) -> MixedLMResults:
    """
    Fit a linear mixed-effects model with a random intercept per project/repo.

    Compares the two levels of `genai_col`. By the dataset's convention (not
    checked here) False means "without GenAI" (terms 1–2) and True means
    "with GenAI" (terms 3–4); the printed narrative hard-codes that wording.

    Model (default):
        log1p(metric) ~ C(genai_period) + (1 | repo_col)

    Side effects:
        Prints the statsmodels summary, the back-transformed percentage change
        (log models only) and a narrative paragraph. Significance in the
        narrative is judged against the module-level `BONFERRONI_ALPHA`.
        Emits a `RuntimeWarning` when the fit looks degenerate (not converged,
        non-finite log-likelihood, or non-finite standard error), because the
        reported coefficients are then not trustworthy.

    Note:
        The response column name is interpolated into the formula unquoted.
        With `log_transform=False`, a metric name that is not a valid Python
        identifier is likely to fail formula parsing.

    Args:
        df: Input DataFrame.
        metric: Numeric column to model; must pass `validate_data`.
        repo_col: Grouping column for random intercepts (e.g., project/repo).
        genai_col: GenAI period column (boolean levels expected, since the
            fixed effect is looked up under the "[T.True]" label).
        apply_outlier_filter: If True, filters outliers using `filter_outliers`.
        log_transform: If True, models log1p(metric) to handle skew/zeros.
        fit_method: Optimizer method passed to statsmodels (e.g. "lbfgs").
        reml: If False, fits ML; if True, fits REML.

    Returns:
        The fitted statsmodels MixedLMResults.

    Raises:
        ValueError: If required columns are missing, validation fails, the
            metric has values <= -1 while `log_transform` is on, there is
            insufficient data, or the "[T.True]" fixed effect is absent.
    """
    for col in (metric, repo_col, genai_col):
        if col not in df.columns:
            raise ValueError(f"Column {col!r} not found in the DataFrame.")

    validate_data(df, metric)

    work_df = df[[repo_col, genai_col, metric]].copy()

    if apply_outlier_filter:
        work_df = filter_outliers(work_df, metric)

    work_df = work_df.dropna(subset=[repo_col, genai_col, metric]).copy()

    if log_transform:
        # log1p(-1) is -inf and anything below is NaN, so -1 itself must be
        # rejected too; otherwise a non-finite response reaches the model.
        if (work_df[metric] <= -1).any():
            raise ValueError(
                f"{metric!r} contains values <= -1; cannot apply log1p safely."
            )
        y_col = f"log_{metric}"
        work_df[y_col] = log1p(column=work_df[metric])
    else:
        y_col = metric

    n_total = len(work_df)
    n_without = int((work_df[genai_col] == False).sum())
    n_with = int((work_df[genai_col] == True).sum())
    n_groups = int(work_df[repo_col].nunique())

    if n_without < 5 or n_with < 5 or n_groups < 2:
        raise ValueError(
            "Not enough data to fit MixedLM after filtering.\n"
            f"rows={n_total}, n_without={n_without}, n_with={n_with}, groups={n_groups}"
        )

    formula = f"{y_col} ~ C({genai_col})"
    model = smf.mixedlm(formula, data=work_df, groups=work_df[repo_col])
    fit = model.fit(reml=reml, method=fit_method)

    print(fit.summary())

    # Fixed effect: True vs False
    param_name = f"C({genai_col})[T.True]"
    if param_name not in fit.params.index:
        candidates = [p for p in fit.params.index if "T.True" in p and genai_col in p]
        raise ValueError(
            f"Could not find fixed effect parameter {param_name!r}. "
            f"Available params: {list(fit.params.index)} "
            f"(candidates: {candidates})"
        )

    beta = float(fit.params[param_name])
    se = float(fit.bse[param_name])
    p = float(fit.pvalues[param_name])

    ci = fit.conf_int().loc[param_name]
    ci_low, ci_high = float(ci.iloc[0]), float(ci.iloc[1])

    # A "converged" optimizer can still land on a singular solution (infinite
    # log-likelihood, exploding standard errors). Flag it instead of letting
    # the narrative below present the numbers as an ordinary result.
    fit_looks_degenerate = (
        not getattr(fit, "converged", True)
        or not np.isfinite(float(fit.llf))
        or not np.isfinite(se)
    )
    if fit_looks_degenerate:
        warnings.warn(
            f"MixedLM fit for {metric!r} looks degenerate "
            f"(converged={getattr(fit, 'converged', None)}, llf={fit.llf}, SE={se}); "
            "the reported effect and p-value should not be interpreted.",
            RuntimeWarning,
            stacklevel=2,
        )

    # Empty unless the model was fitted on the log1p scale.
    pct_text = ""
    if log_transform:
        pct_change = (np.exp(beta) - 1) * 100
        pct_ci_low = (np.exp(ci_low) - 1) * 100
        pct_ci_high = (np.exp(ci_high) - 1) * 100
        print(
            f"Approx % change in {metric!r} (log1p scale): {pct_change:.3g}% "
            f"(95% CI [{pct_ci_low:.3g}%, {pct_ci_high:.3g}%])"
        )
        pct_text = (
            f" Back-transformed, this corresponds to an estimated {pct_change:+.2f}% change in {metric!r} "
            f"(95% CI [{pct_ci_low:+.2f}%, {pct_ci_high:+.2f}%])."
        )

    # Build the narrative with parameterized pieces
    effect = (
        "had no effect" if p >= BONFERRONI_ALPHA else
        ("effect was negative" if beta < 0 else "effect was positive")
    )
    significance = "statistically significant" if p < BONFERRONI_ALPHA else "not statistically significant"

    print("------------------------------------------------------")
    print(
        f"A linear mixed-effects model with a random intercept for project was estimated "
        f"({n_groups} projects; N = {n_total:,} commits) to assess whether GenAI access "
        f"(terms 3–4; n = {n_with:,}) is associated with differences in {metric!r} relative to "
        f"no GenAI access (terms 1–2; n = {n_without:,}). "
        f"On the {'log1p' if log_transform else 'raw'} scale, the GenAI period {effect} and was "
        f"{significance} (β = {beta:+.4f}, SE = {se:.4f}, p = {p:.3g}; 95% CI [{ci_low:+.4f}, {ci_high:+.4f}])."
        f"{pct_text}"
    )

    print("------------------------------------------------------")
    return fit


In [50]:
df = pd.read_csv(cfg.commit_level_dataset_path_normalized)

run_mixedlm_genai(df=df, metric='deletions')



               Mixed Linear Model Regression Results
Model:                MixedLM   Dependent Variable:   log_deletions
No. Observations:     82201     Method:               ML           
No. Groups:           681       Scale:                2.6758       
Min. group size:      1         Log-Likelihood:       -157589.8860 
Max. group size:      341       Converged:            Yes          
Mean group size:      120.7                                        
-------------------------------------------------------------------
                        Coef.  Std.Err.    z    P>|z| [0.025 0.975]
-------------------------------------------------------------------
Intercept                2.535    0.020 129.726 0.000  2.497  2.573
C(genai_period)[T.True] -0.021    0.026  -0.836 0.403 -0.072  0.029
Group Var                0.081    0.004                            

Approx % change in 'deletions' (log1p scale): -2.12% (95% CI [-6.92%, 2.93%])
----------------------------------------------------

<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x1fc07c2f080>

Thesis-ready result (Mixed-effects model; random intercept per project):

We fitted a linear mixed-effects model with a random intercept for project (681 projects; N=82,201 commits) to test whether GenAI access (terms 3–4) is associated with differences in deletions compared to no GenAI access (terms 1–2). On the log1p scale, the GenAI period effect was not statistically significant (β = -0.0215, SE = 0.0257, p = 0.403; 95% CI [-0.0717, 0.0288]). This corresponds to an estimated -2.12% change in deletions (95% CI [-6.92%, 2.93%]).

## Pipeline

#### Commit-based metrics (the easy ones)

In [51]:

metrics = cfg.commit_cols

# Read the commit-level data once; run_mixedlm_genai works on its own copy,
# so the same frame can be reused for every metric.
df = pd.read_csv(cfg.commit_level_dataset_path_cleaned)

for col in metrics:
    # Drop only the rows that lack this particular metric, so each model is
    # fitted on every commit for which the metric is available.
    run_mixedlm_genai(df=df.dropna(subset=[col]), metric=col)





               Mixed Linear Model Regression Results
Model:              MixedLM   Dependent Variable:   log_total_lines
No. Observations:   53206     Method:               ML             
No. Groups:         669       Scale:                2.9030         
Min. group size:    3         Log-Likelihood:       -104285.5147   
Max. group size:    258       Converged:            Yes            
Mean group size:    79.5                                           
-------------------------------------------------------------------
                        Coef.  Std.Err.    z    P>|z| [0.025 0.975]
-------------------------------------------------------------------
Intercept                4.035    0.023 173.234 0.000  3.990  4.081
C(genai_period)[T.True] -0.090    0.031  -2.948 0.003 -0.150 -0.030
Group Var                0.109    0.005                            

Approx % change in 'total_lines' (log1p scale): -8.64% (95% CI [-14%, -2.98%])
---------------------------------------------------



                   Mixed Linear Model Regression Results
Model:                MixedLM     Dependent Variable:     log_unit_size_dmm
No. Observations:     41267       Method:                 ML               
No. Groups:           669         Scale:                  0.0811           
Min. group size:      3           Log-Likelihood:         inf              
Max. group size:      200         Converged:              Yes              
Mean group size:      61.7                                                 
---------------------------------------------------------------------------
                        Coef.  Std.Err.    z   P>|z|    [0.025     0.975]  
---------------------------------------------------------------------------
Intercept               0.123 146616.315 0.000 1.000 -287362.574 287362.819
C(genai_period)[T.True] 0.097 206729.242 0.000 1.000 -405181.772 405181.967
Group Var               0.000                                              

Approx % change in 'unit_size_

  pct_ci_high = (np.exp(ci_high) - 1) * 100
  pct_ci_high = (np.exp(ci_high) - 1) * 100


                   Mixed Linear Model Regression Results
Model:                 MixedLM    Dependent Variable:    log_complexity_dmm
No. Observations:      41196      Method:                ML                
No. Groups:            669        Scale:                 0.0852            
Min. group size:       3          Log-Likelihood:        inf               
Max. group size:       200        Converged:             Yes               
Mean group size:       61.6                                                
---------------------------------------------------------------------------
                        Coef.  Std.Err.    z   P>|z|    [0.025     0.975]  
---------------------------------------------------------------------------
Intercept               0.113 133068.916 0.000 1.000 -260810.169 260810.396
C(genai_period)[T.True] 0.268 204294.464 0.000 1.000 -400409.523 400410.058
Group Var               0.000                                              

Approx % change in 'complexity



                   Mixed Linear Model Regression Results
Model:                MixedLM     Dependent Variable:     log_interface_dmm
No. Observations:     41163       Method:                 ML               
No. Groups:           669         Scale:                  0.0826           
Min. group size:      3           Log-Likelihood:         inf              
Max. group size:      200         Converged:              Yes              
Mean group size:      61.5                                                 
---------------------------------------------------------------------------
                        Coef.  Std.Err.    z   P>|z|    [0.025     0.975]  
---------------------------------------------------------------------------
Intercept               0.085 130775.412 0.000 1.000 -256315.012 256315.182
C(genai_period)[T.True] 0.115 275098.695 0.000 1.000 -539183.420 539183.649
Group Var               0.000                                              

Approx % change in 'interface_

  pct_ci_high = (np.exp(ci_high) - 1) * 100


In [10]:
df = pd.read_csv(cfg.commit_level_dataset_path_cleaned)

cols = cfg.commit_cols
df.shape

(57372, 20)

#### Project-based metrics

In [35]:
df = pd.read_csv(cfg.project_level_dataset_path_cleaned)

for col in cfg.project_cols:
    run_mannwhitneyu(df=df, metric=col)

A two-sided Mann–Whitney U test comparing the without-GenAI (n=291) and with-GenAI (n=376) groups showed a statistically significant difference in 3_folders_total_lines (U=31100, p=1.11553e-21).
A two-sided Mann–Whitney U test comparing the without-GenAI (n=291) and with-GenAI (n=374) groups showed a statistically significant difference in 3_folders_sloc (U=33290.5, p=8.26501e-18).
A two-sided Mann–Whitney U test comparing the without-GenAI (n=289) and with-GenAI (n=373) groups showed no statistically significant difference in cc_sum_3folders (U=53331.5, p=0.816436).
A two-sided Mann–Whitney U test comparing the without-GenAI (n=292) and with-GenAI (n=377) groups showed no statistically significant difference in comment_density (U=55256, p=0.931373).


#### Developer-based metrics

In [59]:
df = pd.read_csv(cfg.dev_level_dataset_path_cleaned)
df.columns

Index(['dev', 'project', 'term', 'genai_period', 'commits_per_developer',
       'contribution_per_developer', 'commit_contribution_ratio',
       'relative_code_churn'],
      dtype='object')

In [55]:
df.shape

(1734, 8)

In [60]:
df.dtypes

dev                           float64
project                         int64
term                            int64
genai_period                     bool
commits_per_developer           int64
contribution_per_developer      int64
commit_contribution_ratio     float64
relative_code_churn           float64
dtype: object

In [57]:
df.describe()

Unnamed: 0,dev,project,term,commits_per_developer,contribution,commit_contribution_ratio,relative_code_churn
count,1734.0,1734.0,1734.0,1734.0,1734.0,1734.0,1734.0
mean,0.649366,336.302191,2.624567,28.448674,167700.3,0.337947,0.337947
std,1.246996,196.439257,1.133499,28.447469,1783138.0,0.251344,0.289216
min,-1.0,0.0,1.0,1.0,1.0,0.009524,1.6e-05
25%,-1.0,164.0,2.0,4.0,5375.0,0.0625,0.074065
50%,1.0,337.0,3.0,21.5,12418.0,0.355996,0.25968
75%,2.0,511.0,4.0,43.0,49806.0,0.533065,0.545565
max,2.0,680.0,4.0,230.0,41144530.0,0.970588,0.99992


In [62]:
metrics = cfg.dev_cols

# Read the developer-level data once; run_mannwhitneyu copies the frame
# before filtering, so it is safe to reuse across metrics.
df = pd.read_csv(cfg.dev_level_dataset_path_cleaned)

for col in metrics:
    run_mannwhitneyu(df=df, metric=col)

A two-sided Mann–Whitney U test comparing the without-GenAI (n=761) and with-GenAI (n=970) groups showed no statistically significant difference in commits_per_developer (U=369422, p=0.973942).
A two-sided Mann–Whitney U test comparing the without-GenAI (n=745) and with-GenAI (n=938) groups showed a statistically significant difference in contribution_per_developer (U=275539, p=8.69682e-14).
A two-sided Mann–Whitney U test comparing the without-GenAI (n=761) and with-GenAI (n=973) groups showed no statistically significant difference in commit_contribution_ratio (U=388201, p=0.0823604).
A two-sided Mann–Whitney U test comparing the without-GenAI (n=761) and with-GenAI (n=973) groups showed no statistically significant difference in relative_code_churn (U=386228, p=0.121993).


## Random

In [None]:
# Load the commit-level data explicitly. When the notebook is run top to
# bottom, `df` still holds the developer-level frame at this point, which has
# no `merge_commit` column.
# NOTE(review): the recorded shape (57372, 20) below matches the cleaned
# commit-level dataset — confirm this is the intended source.
df = pd.read_csv(cfg.commit_level_dataset_path_cleaned)

x = df[df['merge_commit'] == True]
x['diff_lines'].describe()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: diff_lines, dtype: float64

In [None]:
x['diff_lines'].value_counts().head(10)

Series([], Name: count, dtype: int64)

In [None]:
x[x['total_lines'] < 10]['total_lines'].shape

(0,)

In [None]:
ndf = df[df['merge_commit'] == False]
ndf.shape

(57372, 20)

In [None]:
ndf['commit_hash'].nunique()

57372