In [1]:
import pandas
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [2]:
FEATURES_LIST_FILE = "data/features"
RAW_DATA_CSV = "data/repro.csv"
BUILD_TAG = "BUILD?"
REPRODUCTIBLE_TAG = "REPRO?"
CONFIG_PREFIX = "CONFIG_"

# Feature names, one per line. Materialised as a list (not a lazy `map`) so the
# names stay available if this variable is inspected or reused later.
with open(FEATURES_LIST_FILE, 'r') as stream:
    lines = [raw_line.strip() for raw_line in stream]

repro = pandas.read_csv(RAW_DATA_CSV)
repro_len = len(repro)

# One column per feature, pre-filled with 0 for every row of the CSV; options
# that never appear as `CONFIG_<name>=<value>` in a config file keep that 0.
table = {elt: [0] * repro_len for elt in lines}

table[BUILD_TAG] = list(repro["build?"])
table[REPRODUCTIBLE_TAG] = list(repro["reproducible?"])

# Config files are numbered from 1 (configs/0001_randconfig, ...), while the
# rows of `table` are 0-based, hence the `i - 1` below.
for i in range(1, repro_len + 1):
    config = f"configs/{i:04d}_randconfig"
    with open(config, 'r') as stream:
        for line in stream:
            # Comment lines and anything else that is not an assignment of a
            # CONFIG_ option are ignored.
            if not line.startswith(CONFIG_PREFIX):
                continue
            # Split on the FIRST '=' only: the value may itself contain '='
            # (e.g. a quoted string option), which previously made the
            # two-name unpacking raise ValueError.
            name, value = line.strip().split('=', 1)
            name = name[len(CONFIG_PREFIX):]
            # Raises KeyError when the option is not listed in
            # FEATURES_LIST_FILE (unchanged from the original behaviour).
            table[name][i - 1] = value

df = pandas.DataFrame.from_dict(table)

In [None]:
# Integer codes for the string values 'y' and 'm'; any other value is left
# untouched by the replacement below.
encoding_map = {
    'y': 1,
    'm': 2,
}

# Substitute the mapped values across the whole frame.
df = df.replace(encoding_map)

# Drop columns that hold a single distinct value, but never the label columns:
# later cells filter on `BUILD?` and read `REPRO?`, so pruning them here (e.g.
# when every build succeeded) would break those cells.
label_columns = [BUILD_TAG, REPRODUCTIBLE_TAG]
constant_columns = df.columns[df.nunique() == 1].drop(label_columns, errors='ignore')
df = df.drop(columns=constant_columns)


In [None]:
# Remove every column whose dtype is still 'object'.
object_columns = df.select_dtypes(include=['object']).columns
df = df.drop(columns=object_columns)

In [None]:
# Keep only the rows whose `BUILD?` column equals True.
build_filter = "`BUILD?` == True"
df = df.query(build_filter)

In [None]:
# Target: the 'REPRO?' column. Features: every remaining column.
y = df['REPRO?']
X = df.drop(columns=['REPRO?'])

# Apply the y/m mapping to the features, then fill missing values with 0.
X_encoded = X.replace(encoding_map)
X_encoded = X_encoded.fillna(0)

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

# Cell output: shapes of the training and test feature matrices.
(X_train.shape, X_test.shape)


In [None]:
# Show the frame without its 'REPRO?' column; the result is only displayed,
# `df` itself is left unchanged.
df.drop('REPRO?', axis=1)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Build the classifier with a fixed seed and fit it on the training split
# (`fit` returns the estimator itself, so `clf` is the fitted model).
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out split: predictions first, then the fraction that match.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Cell output: the accuracy on the test split.
accuracy


In [None]:
import graphviz
from sklearn.tree import export_graphviz

# Column names of the training features, used to label the tree's split nodes.
ft_names = list(X_train.columns)

# Produce the DOT source as a string (out_file=None) instead of writing a file.
dot_data = export_graphviz(
    clf,
    out_file=None,
    feature_names=ft_names,
    class_names=['Not Repro', 'Repro'],
    filled=True,
    rounded=True,
    special_characters=True,
)

# Wrap the DOT text in a graphviz Source and render it as a PNG named after
# "decision_tree_repro".
graph = graphviz.Source(dot_data)
graph.render(filename="decision_tree_repro", format="png")


In [None]:
# Bare expression: makes the `graph` object this cell's output.
graph