In [1]:
import pandas

In [2]:
lines = None
with open("data/features", 'r') as stream:
    lines = map(lambda x: x.strip(), stream.readlines())

In [3]:
repro = pandas.read_csv("data/repro.csv")

In [4]:
table = dict()
for elt in lines: 
    table[elt] = [None for _ in range(1000)]
table["BUILD?"] = list(repro["build?"])
table["REPRO?"] = list(repro["reproducible?"])

In [5]:
for i in range(1, 1001):
    config = f"configs/{i:04d}_randconfig"
    with open(config, 'r') as stream:
        for line in stream:
            if line.startswith('#'):
                continue
            if line.startswith("CONFIG_"):
                name, value = line.strip().split('=')
                name = name[7:]
                table[name][i-1] = value

In [6]:
df = pandas.DataFrame.from_dict(table)

In [7]:
df

Unnamed: 0,104_QUAD_8,60XX_WDT,64BIT,6LOWPAN,6LOWPAN_DEBUGFS,6LOWPAN_GHC_EXT_HDR_DEST,6LOWPAN_GHC_EXT_HDR_FRAG,6LOWPAN_GHC_EXT_HDR_HOP,6LOWPAN_GHC_EXT_HDR_ROUTE,6LOWPAN_GHC_ICMPV6,...,ZSTD_COMPRESS,ZSTD_DECOMPRESS,ZSWAP,ZSWAP_COMPRESSOR_DEFAULT,ZSWAP_COMPRESSOR_DEFAULT_LZO,ZSWAP_DEFAULT_ON,ZSWAP_ZPOOL_DEFAULT,ZSWAP_ZPOOL_DEFAULT_ZBUD,BUILD?,REPRO?
0,,,y,y,,,,,,,...,y,y,,,,,,,True,True
1,,y,y,,,,,,,,...,,m,,,,,,,True,True
2,,y,y,,,,,,,,...,y,y,,,,,,,True,False
3,,y,y,,,,,,,,...,y,y,,,,,,,True,True
4,,,y,,,,,,,,...,,y,,,,,,,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,,,y,,,,,,,,...,m,y,,,,,,,True,False
996,,y,y,,,,,,,,...,y,y,,,,,,,True,True
997,,,y,,,,,,,,...,y,y,,,,,,,True,True
998,,,y,,,,,,,,...,y,y,,,,,,,True,True


In [8]:
constant_columns = df.columns[df.nunique() == 1]
constant_columns

Index(['64BIT', '6LOWPAN_DEBUGFS', '8139_OLD_RX_RESET', '8139TOO_8129',
       '8139TOO_PIO', '8139TOO_TUNE_TWISTER', '88EU_AP_MODE', '9P_FSCACHE',
       '9P_FS_POSIX_ACL', '9P_FS_SECURITY',
       ...
       'ZRAM_DEF_COMP_ZSTD', 'ZRAM_MEMORY_TRACKING', 'ZRAM_WRITEBACK',
       'ZSMALLOC_STAT', 'ZSWAP', 'ZSWAP_COMPRESSOR_DEFAULT',
       'ZSWAP_COMPRESSOR_DEFAULT_LZO', 'ZSWAP_DEFAULT_ON',
       'ZSWAP_ZPOOL_DEFAULT', 'ZSWAP_ZPOOL_DEFAULT_ZBUD'],
      dtype='object', length=4483)

In [9]:
df.drop(constant_columns, axis=1, inplace=True)
df

Unnamed: 0,104_QUAD_8,60XX_WDT,6LOWPAN,6LOWPAN_GHC_EXT_HDR_DEST,6LOWPAN_GHC_EXT_HDR_FRAG,6LOWPAN_GHC_EXT_HDR_HOP,6LOWPAN_GHC_EXT_HDR_ROUTE,6LOWPAN_GHC_ICMPV6,6LOWPAN_GHC_UDP,6LOWPAN_NHC,...,ZPA2326_I2C,ZPA2326_SPI,ZPOOL,ZRAM,ZRAM_DEF_COMP,ZSMALLOC,ZSTD_COMPRESS,ZSTD_DECOMPRESS,BUILD?,REPRO?
0,,,y,,,,,,,,...,,,,,,y,y,y,True,True
1,,y,,,,,,,,,...,,,m,,,,,m,True,True
2,,y,,,,,,,,,...,y,,y,,,y,y,y,True,False
3,,y,,,,,,,,,...,y,y,y,,,,y,y,True,True
4,,,,,,,,,,,...,y,,y,,,,,y,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,,,,,,,,,,,...,,,y,m,"""lzo-rle""",y,m,y,True,False
996,,y,,,,,,,,,...,y,y,y,,,y,y,y,True,True
997,,,,,,,,,,,...,y,,,,,y,y,y,True,True
998,,,,,,,,,,,...,m,,m,,,m,y,y,True,True


In [10]:
encoding_map = {
    'y': 1,
    'm': 2,
    None : 0
}



In [11]:
# Drop columns with datatype 'object'
df = df.replace(encoding_map)
df = df.drop(df.select_dtypes(include=['object']).columns, axis=1)


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Drop the 'Unnamed: 0' column
# df.drop(columns=['Unnamed: 0'], inplace=True)

# Separate the target variable and features
X = df.drop(columns=['REPRO?'])
y = df['REPRO?']

# One-hot encode the features
# encoder = OneHotEncoder(drop='first', sparse=False)
# X_encoded = encoder.fit_transform(X)

# Define the mapping
encoding_map = {
    'y': 1,
    'm': 2
}

# Apply the mapping to the dataframe, filling NaN with 0
X_encoded = X.replace(encoding_map).fillna(0)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape


((800, 8189), (200, 8189))

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Initialize the DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=42)

# Train the model
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
accuracy


0.645

In [14]:
from sklearn.tree import export_graphviz
import graphviz

ft_names = X_train.columns.tolist()

# Export the decision tree to DOT format
dot_data = export_graphviz(clf, out_file=None, 
                           feature_names=ft_names, 
                           class_names=['Not Repro', 'Repro'], 
                           filled=True, rounded=True, 
                           special_characters=True)

# Render and save the visualization using graphviz
graph = graphviz.Source(dot_data)
graph.render("decision_tree_repro", format="png")


'decision_tree_repro.png'