In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyRegressor

In [2]:
# Load the full result set and keep only the rows that enter the regression:
# Strategy other than 'No Corpus', Model other than 'none', and Corpus Type
# other than 'merge'.
df = pd.read_parquet("full_results.parquet")
keep_rows = (
    (df['Strategy'] != 'No Corpus')
    & (df['Model'] != 'none')
    & (df['Corpus Type'] != 'merge')
)
df = df[keep_rows]


def _zscore(values):
    """Center a series on its mean and scale it by its standard deviation."""
    return (values - values.mean()) / values.std()


# Standardize 'Coverage Delta' separately within each 'Function' group.
df['Coverage Delta'] = df.groupby('Function')['Coverage Delta'].transform(_zscore)

# Discard rows whose standardized value is NaN (for example when a group's
# standard deviation is zero or undefined).
df = df.dropna(subset=['Coverage Delta'])

In [3]:
# Fixed seed for the train/test split so that Restart & Run All reproduces the
# same metrics (the split was previously unseeded and changed on every run).
RANDOM_STATE = 4

# Predictors and regression target used for every per-function model.
FEATURE_COLUMNS = ['Temperature', 'Strategy', 'Model', 'Corpus Type']
TARGET_COLUMN = 'Coverage Delta'

# One entry per function:
# [function, test_mse, null_mse, null_mse - test_mse, test_r2, |null_r2|, test_r2 - null_r2]
result_rows = []

for func in df['Function'].unique():
    print(f"=== {func} ===")

    # Rows of this function only, restricted to the modelling columns and
    # re-indexed from zero.
    df_filtered = (
        df.loc[df['Function'] == func, FEATURE_COLUMNS + [TARGET_COLUMN]]
        .reset_index(drop=True)
    )
    print(len(df_filtered))

    # Setting up the features and target variable
    X = df_filtered[FEATURE_COLUMNS]
    y = df_filtered[TARGET_COLUMN]

    # One-hot encode the categorical predictors. For each of them one explicit
    # category is dropped, so the remaining coefficients are measured relative
    # to that dropped category. 'Temperature' is passed through unchanged.
    # NOTE(review): an explicit `drop` value presumably has to be present in
    # the training data of every function - confirm against the data.
    column_transformer = ColumnTransformer(
        [
            ('model_ohe', OneHotEncoder(drop=['GPT-3']), ['Model']),  # baseline: 'GPT-3'
            ('strategy_ohe', OneHotEncoder(drop=['LLM Only']), ['Strategy']),  # baseline: 'LLM Only'
            ('corpus_type_ohe', OneHotEncoder(drop=['simple']), ['Corpus Type']),  # baseline: 'simple'
        ],
        remainder='passthrough'
    )

    pipeline = Pipeline(steps=[
        ('preprocessor', column_transformer),
        ('regressor', LinearRegression())
    ])

    # Null model: always predicts the mean of the training target.
    null_model = DummyRegressor(strategy='mean')

    # Splitting data into training and test sets (seeded for reproducibility)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE
    )

    # Fitting the regression model and the null model on the same training data
    pipeline.fit(X_train, y_train)
    null_model.fit(X_train, y_train)

    # Coefficients and intercept of the fitted linear regression. These names
    # stay bound after the loop and are read by the formula cell further down,
    # where they refer to the last function processed.
    linear_model = pipeline.named_steps['regressor']
    coefficients = linear_model.coef_
    intercept = linear_model.intercept_

    # Names of the features after transformation, aligned with `coefficients`
    feature_names = column_transformer.get_feature_names_out()

    print("Intercept:", intercept)
    print("Coefficients:")
    for name, coeff in zip(feature_names, coefficients):
        print(f"{name}: {coeff}")
    print("\n")

    # Held-out predictions of both models
    y_test_pred = pipeline.predict(X_test)
    y_null_pred = null_model.predict(X_test)

    # Evaluating the regression model
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    # Evaluating the null model
    null_mse = mean_squared_error(y_test, y_null_pred)
    null_r2 = r2_score(y_test, y_null_pred)

    print(f"Test MSE for Regression Model: {test_mse}")
    print(f"Test MSE for Null Model: {null_mse}")

    print(f"Test R^2 for Regression Model: {test_r2}")
    print(f"Test R^2 for Null Model: {null_r2}")

    # NOTE(review): the stored null R^2 is its absolute value, while the R^2
    # difference uses the signed value - confirm that this is intended.
    result_rows.append([
        func,
        test_mse, null_mse, null_mse - test_mse,
        test_r2, abs(null_r2), test_r2 - null_r2,
    ])

=== PIL.Image.open ===
107438
Intercept: -1.2457272000900845
Coefficients:
model_ohe__Model_Claude-Instant: 0.10119814639712633
model_ohe__Model_Claude-Opus: 0.2062427379370928
model_ohe__Model_GPT-4: 1.8780682429232658
model_ohe__Model_Gemini-1.0: 0.30002433738044687
strategy_ohe__Strategy_LLM + Fuzzing: 0.7465449401614337
corpus_type_ohe__Corpus Type_complex: 0.11078689386997899
remainder__Temperature: -0.2823464125624846


Test MSE for Regression Model: 0.04576207564562818
Test MSE for Null Model: 1.0086519995359753
Test R^2 for Regression Model: 0.9546301496353765
Test R^2 for Null Model: -6.87563430745719e-06
=== ast.literal_eval ===
104422
Intercept: -1.3942411451653383
Coefficients:
model_ohe__Model_Claude-Instant: 1.0398520787981473
model_ohe__Model_Claude-Opus: 1.1932520050819673
model_ohe__Model_GPT-4: 0.3386271658389421
model_ohe__Model_Gemini-1.0: 0.6920233848113272
strategy_ohe__Strategy_LLM + Fuzzing: 1.899136195176422
corpus_type_ohe__Corpus Type_complex: -0.373831248984

In [4]:
# Turn the per-function metric rows into a table, one row per function.
result_columns = [
    "function",
    "test_mse", "null_mse", "diff_mse",
    "test_r2", "null_r2", "diff_r2",
]
df_results = pd.DataFrame(data=result_rows, columns=result_columns)

In [5]:
# Print one LaTeX table row per function:
# function name, test MSE, null-model MSE and test R^2 (two decimals each).
for _, row in df_results.iterrows():
    # Underscores have to be escaped for LaTeX. The backslash is doubled because
    # '\_' is not a valid Python escape sequence and makes recent Python
    # versions emit a DeprecationWarning/SyntaxWarning; the printed text is
    # unchanged.
    fname = row['function'].replace('_', '\\_')
    print(f"{fname} & "
          f"{row['test_mse']:.2f} & {row['null_mse']:.2f} & "
          f"{row['test_r2']:.2f} \\\\")

PIL.Image.open & 0.05 & 1.01 & 0.95 \\
ast.literal\_eval & 0.48 & 1.00 & 0.53 \\
cgi.parse\_header & 0.81 & 0.99 & 0.19 \\
cgi.parse\_multipart & 0.77 & 0.99 & 0.23 \\
configparser.read\_string & 0.12 & 1.00 & 0.88 \\
django.core.deserialize & 0.71 & 1.01 & 0.30 \\
email.message\_from\_string & 0.28 & 1.00 & 0.72 \\
email.parser.parsebytes & 0.24 & 0.99 & 0.76 \\
email.parser.Parser.parsestr & 0.24 & 0.99 & 0.76 \\
email.utils.parseaddr & 0.21 & 1.00 & 0.79 \\
email.utils.parsedate & 0.26 & 0.98 & 0.73 \\
exrex.getone & 0.27 & 1.00 & 0.73 \\
fnmatch.filter & 0.53 & 1.00 & 0.48 \\
ftplib.FTP & 0.69 & 0.91 & 0.25 \\
glob.glob & 0.39 & 1.01 & 0.62 \\
html.parser.HTMLParser.feed & 0.43 & 0.99 & 0.57 \\
json.loads & 0.06 & 1.01 & 0.94 \\
paramiko.SSHClient.connect & 0.55 & 1.01 & 0.45 \\
plistlib.dumps & 0.52 & 0.96 & 0.46 \\
requests.get & 0.03 & 1.00 & 0.97 \\
scipy.optimize.minimize & 0.00 & 0.99 & 1.00 \\
shlex.split & 0.31 & 1.01 & 0.70 \\
smtplib.SMTP & 0.17 & 1.02 & 0.83 \\
sunau.ope

In [6]:
# Mean of every metric column across all functions.
metric_columns = ['test_mse', 'null_mse', 'diff_mse', 'test_r2', 'null_r2', 'diff_r2']
averages = df_results[metric_columns].mean()

# Report each average with two decimal places.
print("Column Averages:")
for column in averages.index:
    print(f"{column}: {averages[column]:.2f}")

Column Averages:
test_mse: 0.34
null_mse: 1.00
diff_mse: 0.66
test_r2: 0.66
null_r2: 0.00
diff_r2: 0.66


In [7]:
# Render a fitted linear model as one formula string:
#   y = <intercept> + <coef>*<feature> - <coef>*<feature> ...
# NOTE: `intercept`, `coefficients` and `feature_names` are the values left
# behind by the last iteration of the per-function fitting loop, so the formula
# describes only the last function that was processed.
terms = [f"y = {intercept:.2f}"]

# Every term uses the same "<sign> <magnitude>*<feature>" layout; previously
# only negative terms carried the '*' separator and a double space followed
# the intercept.
for coeff, name in zip(coefficients, feature_names):
    sign = '+' if coeff >= 0 else '-'
    terms.append(f"{sign} {abs(coeff):.2f}*{name}")

formula = " ".join(terms)
formula

'y = -2.08  + 0.10model_ohe__Model_Claude-Instant + 0.32model_ohe__Model_Claude-Opus + 1.54model_ohe__Model_GPT-4 - 0.06*model_ohe__Model_Gemini-1.0 + 2.01strategy_ohe__Strategy_LLM + Fuzzing + 0.32corpus_type_ohe__Corpus Type_complex + 0.19remainder__Temperature'