In [1]:
%config Completer.use_jedi = False
# fixes firefox tab completion

In [2]:
# Importing the required libraries
import numpy as np
import pandas as pd
from math import log2, sqrt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from scipy.fft import fft, ifft
from scipy.special import erfc

In [3]:
# Concatenate data
def concatenateData(df, num_concats):
    if num_concats == 0: # do nothing
        return df.rename(columns={'binary_number': 'Concatenated_Data'})
    new_df = pd.DataFrame({
        'Concatenated_Data': [''] * (len(df) // num_concats), 
        'label': [0] * (len(df) // num_concats)
    })

    # Loop through each group of num_concats rows and concatenate their 'binary_number' strings
    for i in range(0, len(df), num_concats):
        new_df.iloc[i // num_concats, 0] = ''.join(df['binary_number'][i:i + num_concats])
        new_df.iloc[i // num_concats, 1] = df['label'][i]

    return new_df

# Calculate Shannon entropy for each concatenated binary sequence
def shannon_entropy(binary_string):
    if len(binary_string) % 2 != 0:
        raise ValueError("Binary string length must be a multiple of 2.")
    
    patterns = ['00', '10', '11', '01']
    frequency = {pattern: 0 for pattern in patterns}
    
    for i in range(0, len(binary_string), 2):
        segment = binary_string[i:i+2]
        if segment in patterns:
            frequency[segment] += 1
    
    total_segments = sum(frequency.values())
    
    entropy = 0
    for count in frequency.values():
        if count > 0:
            probability = count / total_segments
            entropy -= probability * log2(probability)
    
    return entropy


def classic_spectral_test(bit_string):
    bit_array = 2 * np.array([int(bit) for bit in bit_string]) - 1
    dft = fft(bit_array)
    n_half = len(bit_string) // 2 + 1
    mod_dft = np.abs(dft[:n_half])
    threshold = np.sqrt(np.log(1 / 0.05) / len(bit_string))
    peaks_below_threshold = np.sum(mod_dft < threshold)
    expected_peaks = 0.95 * n_half
    d = (peaks_below_threshold - expected_peaks) / np.sqrt(len(bit_string) * 0.95 * 0.05)
    p_value = erfc(np.abs(d) / np.sqrt(2)) / 2
    return d

def frequency_test(bit_string):
    n = len(bit_string)
    count_ones = bit_string.count('1')
    count_zeros = bit_string.count('0')
    
    # The test statistic
    s = (count_ones - count_zeros) / sqrt(n)
    
    # The p-value
    p_value = erfc(abs(s) / sqrt(2))
    
    return p_value

def runs_test(bit_string):
    n = len(bit_string)
    runs = 1  # Start with the first run
    for i in range(1, n):
        if bit_string[i] != bit_string[i - 1]:
            runs += 1
    
    n0 = bit_string.count('0')
    n1 = bit_string.count('1')
    
    # Expected number of runs
    expected_runs = (2 * n0 * n1 / n) + 1
    variance_runs = (2 * n0 * n1 * (2 * n0 * n1 - n)) / (n ** 2 * (n - 1))
    
    # The test statistic
    z = (runs - expected_runs) / sqrt(variance_runs)
    
    # The p-value
    p_value = erfc(abs(z) / sqrt(2))
    
    return p_value

def linear_complexity(bit_string, M=500):
    # Perform linear complexity test with block size M
    n = len(bit_string)
    bit_array = np.array([int(bit) for bit in bit_string])
    lc = 0  # Initialize linear complexity
    
    # Process blocks of size M
    for i in range(0, n, M):
        block = bit_array[i:i+M]
        if len(block) < M:
            continue
        
        lc_block = 0
        for j in range(M):
            if block[j] == 1:
                lc_block = j + 1
        
        lc += lc_block
    
    lc = lc / (n / M)
    return lc

def autocorrelation_test(bit_string, lag=1):
    n = len(bit_string)
    bit_array = np.array([int(bit) for bit in bit_string])
    autocorrelation = np.correlate(bit_array, np.roll(bit_array, lag), mode='valid')[0]
    return autocorrelation / n

def maurer_universal_test(bit_string):
    k = 6
    l = 5
    q = 20
    bit_array = np.array([int(bit) for bit in bit_string])
    max_val = 2**k
    init_subseq = bit_array[:q]
    rest_subseq = bit_array[q:]
    d = {}
    for i in range(len(init_subseq) - k + 1):
        d[tuple(init_subseq[i:i+k])] = i
    t = []
    for i in range(len(rest_subseq) - k + 1):
        subseq = tuple(rest_subseq[i:i+k])
        if subseq in d:
            t.append(i - d[subseq])
            d[subseq] = i
    if not t:
        return 0
    t = np.array(t)
    log_avg = np.mean(np.log2(t))
    return log_avg - np.log2(q)

def binary_matrix_rank_test(bit_string, M=32, Q=32):
    bit_array = np.array([int(bit) for bit in bit_string])
    num_matrices = len(bit_array) // (M * Q)
    ranks = []
    for i in range(num_matrices):
        matrix = bit_array[i*M*Q:(i+1)*M*Q].reshape((M, Q))
        rank = np.linalg.matrix_rank(matrix)
        ranks.append(rank)
    return np.mean(ranks)

def cumulative_sums_test(bit_string):
    bit_array = np.array([int(bit) for bit in bit_string])
    adjusted = 2 * bit_array - 1
    cumulative_sum = np.cumsum(adjusted)
    max_excursion = np.max(np.abs(cumulative_sum))
    return max_excursion

def longest_run_ones_test(bit_string, block_size=100):
    bit_array = np.array([int(bit) for bit in bit_string])
    num_blocks = len(bit_array) // block_size
    max_runs = []
    for i in range(num_blocks):
        block = bit_array[i*block_size:(i+1)*block_size]
        max_run = max([len(list(g)) for k, g in itertools.groupby(block) if k == 1])
        max_runs.append(max_run)
    return np.mean(max_runs)

def random_excursions_test(bit_string):
    bit_array = np.array([int(bit) for bit in bit_string])
    bit_array = 2 * bit_array - 1  # Convert to ±1

    cumulative_sum = np.cumsum(bit_array)
    states = np.unique(cumulative_sum)

    if 0 not in states:
        states = np.append(states, 0)
    state_counts = {state: 0 for state in states}
    for state in cumulative_sum:
        state_counts[state] += 1

    state_counts[0] -= 1  # Adjust for zero state
    pi = [0.5 * (1 - (1 / (2 * state + 1)**2)) for state in states]
    x = np.sum([(state_counts[state] - len(bit_string) * pi[i])**2 / (len(bit_string) * pi[i]) for i, state in enumerate(states)])

    return x


def unique_subsequences(bit_string, length=4):
    bit_array = np.array([int(bit) for bit in bit_string])
    n = len(bit_array)
    subsequences = set()
    
    for i in range(n - length + 1):
        subseq = tuple(bit_array[i:i+length])
        subsequences.add(subseq)
    
    return len(subsequences)

def sample_entropy(bit_string, m=2, r=0.2):
    bit_array = np.array([int(bit) for bit in bit_string])
    N = len(bit_array)
    
    def _phi(m):
        x = np.array([bit_array[i:i+m] for i in range(N - m + 1)])
        C = np.sum(np.all(np.abs(x[:, None] - x) <= r, axis=2), axis=0) / (N - m + 1.0)
        return np.sum(C) / (N - m + 1.0)
    
    return -np.log(_phi(m + 1) / _phi(m))

def permutation_entropy(bit_string, order=3):
    bit_array = np.array([int(bit) for bit in bit_string])
    n = len(bit_array)
    
    permutations = np.array(list(itertools.permutations(range(order))))
    c = np.zeros(len(permutations))
    
    for i in range(n - order + 1):
        sorted_index_array = tuple(np.argsort(bit_array[i:i+order]))
        for j, p in enumerate(permutations):
            if np.array_equal(p, sorted_index_array):
                c[j] += 1
    
    c = c / (n - order + 1)
    pe = -np.sum(c * np.log2(c + np.finfo(float).eps))
    return pe

def lyapunov_exponent(bit_string, m=2, t=1):
    bit_array = np.array([int(bit) for bit in bit_string])
    N = len(bit_array)
    
    def _phi(m):
        x = np.array([bit_array[i:i+m] for i in range(N - m + 1)])
        C = np.sum(np.all(np.abs(x[:, None] - x) <= t, axis=2), axis=0) / (N - m + 1.0)
        return np.sum(np.log(C + np.finfo(float).eps)) / (N - m + 1.0)
    
    return abs(_phi(m) - _phi(m + 1))

def entropy_rate(bit_string, k=2):
    bit_array = np.array([int(bit) for bit in bit_string])
    n = len(bit_array)
    prob = {}
    
    for i in range(n - k + 1):
        subseq = tuple(bit_array[i:i + k])
        if subseq in prob:
            prob[subseq] += 1
        else:
            prob[subseq] = 1
    
    for key in prob:
        prob[key] /= (n - k + 1)
    
    entropy_rate = -sum(p * log2(p) for p in prob.values())
    return entropy_rate

# Apply randomness tests
def apply_randomness_tests(df, tests):
    if not tests:
        raise ValueError("No randomness tests specified.")

    test_functions = {
        'autocorrelation': autocorrelation_test,
        'cumulative_sums': cumulative_sums_test,
        'spectral_test': classic_spectral_test,
        'frequency_test': frequency_test,
        'runs_test': runs_test,
        'shannon_entropy': shannon_entropy
    }

    for test in tests:
        if test not in test_functions:
            raise ValueError(f"Invalid randomness test: {test}")
        df[test] = df['Concatenated_Data'].apply(test_functions[test])

    return df

# Preprocess data
def preprocess_data(df, num_concats, tests):
    df = concatenateData(df, num_concats)
    processed_df = apply_randomness_tests(df, tests)
    
    # Convert concatenated binary strings into separate columns
    df_features = pd.DataFrame(processed_df['Concatenated_Data'].apply(list).tolist()).astype(int).astype(bool)
    processed_df = pd.concat([processed_df.drop(columns='Concatenated_Data'), df_features], axis=1)

    return processed_df

# Calculate min-entropy
def calculate_min_entropy(sequence):
    sequence = np.asarray(sequence, dtype=float)  # Convert sequence to float
    p = np.mean(sequence)  # Proportion of ones
    max_prob = max(p, 1 - p)
    if max_prob == 0:  # Handle the case where all bits are the same
        return 0
    min_entropy = -np.log2(max_prob)
    return min_entropy



In [4]:
# Main
file_path = 'AI_2qubits_training_data.txt'

# Read the data from the file
data = []
with open(file_path, 'r') as file:
    for line in file:
        if line.strip():
            binary_number, label = line.strip().split()
            data.append((binary_number, int(label)))

# Convert the data into a DataFrame
df = pd.DataFrame(data, columns=['binary_number', 'label'])

tests_to_apply = ['spectral_test', 'shannon_entropy', 'frequency_test', 'runs_test', 'autocorrelation']

# Preprocess data and apply randomness tests
preprocessed_df = preprocess_data(df, num_concats=0, tests=tests_to_apply)

In [5]:
preprocessed_df[preprocessed_df.select_dtypes(np.float64).columns] = preprocessed_df.select_dtypes(np.float64).astype(np.float32)

In [6]:
preprocessed_df

Unnamed: 0,label,spectral_test,shannon_entropy,frequency_test,runs_test,autocorrelation,0,1,2,3,...,90,91,92,93,94,95,96,97,98,99
0,1,-21.771553,1.935451,0.423711,0.120217,0.33,False,True,False,False,...,True,True,True,True,True,True,False,False,True,False
1,1,-22.230385,1.963615,0.841481,0.027240,0.31,False,True,True,False,...,False,True,True,False,False,False,True,True,False,True
2,1,-22.230385,1.939471,0.109599,0.498506,0.32,True,True,True,False,...,False,True,True,False,False,False,False,False,True,True
3,1,-22.230385,1.872164,0.071861,0.620874,0.36,True,True,False,True,...,True,True,False,True,True,True,True,True,False,True
4,1,-22.230385,1.976281,0.230139,0.725698,0.18,False,False,False,False,...,False,False,True,True,True,True,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13995,4,-22.230385,1.942653,0.689157,0.556584,0.24,True,True,True,True,...,False,False,True,True,False,False,True,True,False,False
13996,4,-22.230385,1.919479,0.689157,0.987149,0.23,False,True,False,True,...,False,True,False,False,False,False,True,True,False,False
13997,4,-22.230385,1.862236,0.317311,0.011137,0.36,True,True,False,False,...,False,False,False,False,True,True,True,False,False,False
13998,4,-22.230385,1.856367,0.841481,0.548989,0.25,True,True,False,True,...,True,True,False,False,True,True,False,True,False,False


Train/test splits should not be shuffled since the order that the bits were sampled matters. However, we need to take splits from each of the quantum computers, so the splits really should be interleaved. 

Later we'll use k-fold CV to get a better result.

In [7]:
# Split the data into features (X) and labels (y)
X = preprocessed_df.drop(columns='label').values
y = preprocessed_df['label'].values.astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Calculate min-entropy for each sequence in the training and testing datasets
# min-entropy also takes into account the bitstring! is this ok?
min_entropy_train = np.apply_along_axis(calculate_min_entropy, 1, X_train)
min_entropy_test = np.apply_along_axis(calculate_min_entropy, 1, X_test)

# Add min-entropy as a feature
X_train = np.column_stack((X_train, min_entropy_train))
X_test = np.column_stack((X_test, min_entropy_test))

print(preprocessed_df)
print(X_train)
print(X_test)

       label  spectral_test  shannon_entropy  frequency_test  runs_test  \
0          1     -21.771553         1.935451        0.423711   0.120217   
1          1     -22.230385         1.963615        0.841481   0.027240   
2          1     -22.230385         1.939471        0.109599   0.498506   
3          1     -22.230385         1.872164        0.071861   0.620874   
4          1     -22.230385         1.976281        0.230139   0.725698   
...      ...            ...              ...             ...        ...   
13995      4     -22.230385         1.942653        0.689157   0.556584   
13996      4     -22.230385         1.919479        0.689157   0.987149   
13997      4     -22.230385         1.862236        0.317311   0.011137   
13998      4     -22.230385         1.856367        0.841481   0.548989   
13999      4     -22.230385         1.717977        0.071861   0.051254   

       autocorrelation      0      1      2      3  ...     90     91     92  \
0                 0

Scale all input data for fairness. Fit it on the training data only to prevent leakage. Only scale the nonbinary columns. (Later)

With this much data, we need multithreading. Get number of cores now.

In [8]:
import joblib

N_CORES = joblib.cpu_count(only_physical_cores=True)
print(f"Number of physical cores: {N_CORES}")

jobs = N_CORES // 2
print(f"Using {jobs} jobs")

Number of physical cores: 2
Using 1 jobs


# model testing

For now, do a single run of a parallelized model. The goal is to beat 63%, and achieve >95%.

## SVM

Use the SGDClassifier with hinge loss to get an parallelized SVM. Input data should be scaled to avoid divergence.

In [9]:
%%time
from sklearn.linear_model import SGDClassifier
sgd_model = SGDClassifier(n_jobs=jobs, random_state=42, verbose=1)
sgd_model.fit(X_train, y_train)
y_pred = sgd_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


-- Epoch 1
Norm: 167.56, NNZs: 106, Bias: -12.791554, T: 11200, Avg. loss: 142.852770
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 115.68, NNZs: 106, Bias: -12.617004, T: 22400, Avg. loss: 35.906596
Total training time: 0.01 seconds.
-- Epoch 3
Norm: 87.24, NNZs: 106, Bias: -13.269544, T: 33600, Avg. loss: 21.796820
Total training time: 0.01 seconds.
-- Epoch 4
Norm: 76.89, NNZs: 106, Bias: -13.244764, T: 44800, Avg. loss: 15.583747
Total training time: 0.02 seconds.
-- Epoch 5
Norm: 61.37, NNZs: 106, Bias: -13.258531, T: 56000, Avg. loss: 12.281627
Total training time: 0.02 seconds.
-- Epoch 6
Norm: 61.78, NNZs: 106, Bias: -13.597720, T: 67200, Avg. loss: 9.814250
Total training time: 0.02 seconds.
-- Epoch 7
Norm: 49.72, NNZs: 106, Bias: -13.732396, T: 78400, Avg. loss: 8.673080
Total training time: 0.03 seconds.
-- Epoch 8
Norm: 44.51, NNZs: 106, Bias: -13.730732, T: 89600, Avg. loss: 7.555530
Total training time: 0.03 seconds.
-- Epoch 9
Norm: 43.65, NNZs: 106, Bias: -13.940

Nonparallelized SVC that scales quadratically with samples

In [10]:
%%time
from sklearn.svm import LinearSVC
svc_model = LinearSVC(random_state=42, verbose=1)
svc_model.fit(X_train, y_train)
y_pred = svc_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


[LibLinear]iter  1 act 5.717e+03 pre 5.717e+03 delta 3.125e-02 f 1.120e+04 |g| 3.659e+05 CG   1
cg reaches trust region boundary
iter  2 act 7.514e+01 pre 7.514e+01 delta 1.250e-01 f 5.483e+03 |g| 2.533e+03 CG   1
cg reaches trust region boundary
iter  3 act 2.280e+02 pre 2.280e+02 delta 3.015e-01 f 5.407e+03 |g| 3.406e+03 CG   2
cg reaches trust region boundary
iter  4 act 1.890e+02 pre 1.812e+02 delta 3.188e-01 f 5.179e+03 |g| 1.842e+03 CG   4
iter  5 act 3.599e-01 pre 3.596e-01 delta 3.188e-01 f 4.990e+03 |g| 2.782e+03 CG   1
cg reaches trust region boundary
iter  6 act 3.358e+01 pre 3.222e+01 delta 4.016e-01 f 4.990e+03 |g| 2.519e+02 CG   4
cg reaches trust region boundary
iter  7 act 3.143e+01 pre 3.193e+01 delta 4.504e-01 f 4.957e+03 |g| 9.499e+02 CG   4
cg reaches trust region boundary
iter  8 act 3.334e+01 pre 3.115e+01 delta 5.509e-01 f 4.925e+03 |g| 6.184e+02 CG   5
iter  9 act 5.752e+00 pre 5.731e+00 delta 5.509e-01 f 4.892e+03 |g| 1.244e+03 CG   2
cg reaches trust region bo

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

# Perform hyperparameter tuning with GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4],
}

svm_model = SVC(random_state=42)
grid_search = GridSearchCV(svm_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Best Hyperparameters:", best_params)
print("Accuracy:", accuracy)


from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

# Perform hyperparameter tuning with RandomizedSearchCV
param_dist = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto', 0.1, 1],
    'kernel': ['linear', 'rbf'],
}

svm_model = SVC(random_state=42)
random_search = RandomizedSearchCV(svm_model, param_distributions=param_dist, n_iter=5, cv=5)
random_search.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Best Hyperparameters:", best_params)
print("Accuracy:", accuracy)

## Random Forest

Min-entropy was previously calculated, no need to recompute it

In [11]:
%%time
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42, n_jobs=jobs, verbose=1)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.8s


Accuracy: 0.6096428571428572
CPU times: user 3.83 s, sys: 103 ms, total: 3.93 s
Wall time: 4.8 s


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


from sklearn.ensemble import RandomForestClassifier

def calculate_min_entropy(sequence):
    sequence = np.asarray(sequence, dtype=float)  # Convert sequence to float
    p = np.mean(sequence)  # Proportion of ones
    max_prob = max(p, 1 - p)
    if max_prob == 0:  # Handle the case where all bits are the same
        return 0
    min_entropy = -np.log2(max_prob)
    return min_entropy




vectorized_entropy = np.vectorize(calculate_min_entropy, signature='(n)->()')

# Calculate min-entropy for each sequence in the training and testing datasets
min_entropy_train = vectorized_entropy(X_train)
min_entropy_test = vectorized_entropy(X_test)

X_train_with_entropy = np.column_stack((X_train, min_entropy_train))
X_test_with_entropy = np.column_stack((X_test, min_entropy_test))
# Create the Random Forest classifier
rf_model = RandomForestClassifier(random_state=42)

# Train the model
rf_model.fit(X_train_with_entropy, y_train)

# Make predictions on the test set
y_pred_rf = rf_model.predict(X_test_with_entropy)

# Calculate the accuracy of the Random Forest model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Create the Random Forest classifier
rf_model = RandomForestClassifier(random_state=42)

# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [50, 100, 200],          # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],         # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],         # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],           # Minimum number of samples required to be at a leaf node
    'max_features': ['auto', 'sqrt'],       # Number of features to consider when looking for the best split
}

# Perform GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(rf_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred_rf = best_model.predict(X_test)

# Calculate the accuracy of the Random Forest model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)
print("Best Hyperparameters:", best_params)


## Gradient Boosting

In [12]:
%%time
from sklearn.ensemble import GradientBoostingClassifier
gb_model = GradientBoostingClassifier(random_state=42, verbose=1)
gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

      Iter       Train Loss   Remaining Time 
         1           1.1081           54.22s
         2           1.0732           46.32s
         3           1.0457           43.20s
         4           1.0232           40.03s
         5           1.0045           38.48s
         6           0.9889           36.93s
         7           0.9760           35.85s
         8           0.9650           34.81s
         9           0.9558           33.92s
        10           0.9479           33.16s
        20           0.9029           28.39s
        30           0.8810           24.52s
        40           0.8637           20.64s
        50           0.8486           17.10s
        60           0.8348           13.67s
        70           0.8215           10.23s
        80           0.8098            6.82s
        90           0.7992            3.40s
       100           0.7890            0.00s
Accuracy: 0.6303571428571428
CPU times: user 33.5 s, sys: 107 ms, total: 33.6 s
Wall time: 34.1 s


A parallelized version is available as Histogram Gradient Boosting model

In [13]:
%%time
from sklearn.ensemble import HistGradientBoostingClassifier
hgb_model = HistGradientBoostingClassifier(random_state=42, verbose=1)
hgb_model.fit(X_train, y_train)
y_pred = hgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Binning 0.009 GB of training data: 0.093 s
Binning 0.001 GB of validation data: 0.003 s
Fitting gradient boosted rounds:
[1/100] 4 trees, 124 leaves (31 on avg), max depth = 10, train loss: 1.07446, val loss: 1.08688, in 0.093s
[2/100] 4 trees, 124 leaves (31 on avg), max depth = 9, train loss: 1.01973, val loss: 1.04399, in 0.081s
[3/100] 4 trees, 124 leaves (31 on avg), max depth = 10, train loss: 0.97837, val loss: 1.01259, in 0.072s
[4/100] 4 trees, 124 leaves (31 on avg), max depth = 11, train loss: 0.94601, val loss: 0.98833, in 0.079s
[5/100] 4 trees, 124 leaves (31 on avg), max depth = 12, train loss: 0.91969, val loss: 0.97225, in 0.059s
[6/100] 4 trees, 124 leaves (31 on avg), max depth = 10, train loss: 0.89738, val loss: 0.95857, in 0.079s
[7/100] 4 trees, 124 leaves (31 on avg), max depth = 11, train loss: 0.87878, val loss: 0.94832, in 0.069s
[8/100] 4 trees, 124 leaves (31 on avg), max depth = 10, train loss: 0.86165, val loss: 0.93910, in 0.063s
[9/100] 4 trees, 124 lea

from sklearn.ensemble import GradientBoostingClassifier



# Create the Gradient Boosting classifier
gb_model = GradientBoostingClassifier(random_state=42)

# Train the model
# Train the model with the new feature included
gb_model.fit(X_train, y_train)

# Make predictions on the test set with the new feature included
y_pred_gb = gb_model.predict(X_test)

# Calculate the accuracy of the Gradient Boosting model with the new feature
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print("Gradient Boosting Accuracy with Min-Entropy Feature:", accuracy_gb)



GridSearch

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier


def calculate_min_entropy(sequence):
    sequence = np.asarray(sequence, dtype=float)  # Convert sequence to float
    p = np.mean(sequence)  # Proportion of ones
    max_prob = max(p, 1 - p)
    if max_prob == 0:  # Handle the case where all bits are the same
        return 0
    min_entropy = -np.log2(max_prob)
    return min_entropy




vectorized_entropy = np.vectorize(calculate_min_entropy, signature='(n)->()')

# Calculate min-entropy for each sequence in the training and testing datasets
min_entropy_train = vectorized_entropy(X_train)
min_entropy_test = vectorized_entropy(X_test)

X_train_with_entropy = np.column_stack((X_train, min_entropy_train))
X_test_with_entropy = np.column_stack((X_test, min_entropy_test))


# Create the Gradient Boosting classifier
gb_model = GradientBoostingClassifier(random_state=42)

# Define the hyperparameter grid for Grid Search
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
}

# Perform Grid Search with cross-validation (cv=5) to find the best hyperparameters
grid_search = GridSearchCV(gb_model, param_grid, cv=5)
grid_search.fit(X_train_with_entropy, y_train)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred_gb = best_model.predict(X_test_with_entropy)

# Calculate the accuracy of the Gradient Boosting model with the best hyperparameters
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print("Best Hyperparameters:", best_params)
print("Gradient Boosting Accuracy:", accuracy_gb)


Other gradient boosted tree methods may have different runtime/performance

## XGBoost

In [14]:
from xgboost import XGBClassifier
def do_xgb(tree_method):
    xgb_model = XGBClassifier(
        random_state = 42,
        verbosity = 1,
        n_jobs = jobs,
        tree_method = tree_method
    )
    xgb_model.fit(X_train, y_train - 1)
    y_pred = xgb_model.predict(X_test)
    accuracy = accuracy_score(y_test - 1, y_pred)
    return [accuracy, xgb_model, y_pred]

In [15]:
%%time
accuracy, _, _ = do_xgb(tree_method='exact')
print("Accuracy:", accuracy)

Accuracy: 0.6321428571428571
CPU times: user 23.2 s, sys: 292 ms, total: 23.5 s
Wall time: 23.8 s


In [16]:
%%time
accuracy, _, _ = do_xgb(tree_method='approx')
print("Accuracy:", accuracy)

Accuracy: 0.6357142857142857
CPU times: user 15.5 s, sys: 118 ms, total: 15.6 s
Wall time: 16 s


In [17]:
%%time
accuracy, _, _ = do_xgb(tree_method='hist')
print("Accuracy:", accuracy)

Accuracy: 0.6267857142857143
CPU times: user 3.06 s, sys: 40.1 ms, total: 3.1 s
Wall time: 3.17 s


import xgboost as xgb
from sklearn.model_selection import GridSearchCV


# Create the XGBoost classifier
xgb_model = xgb.XGBClassifier(random_state=42)

# Define the hyperparameter grid for Grid Search
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
}

# Map classes to [0, 1, 2]
y_train_mapped = y_train - 1  # This will change classes [1, 2, 3] to [0, 1, 2]

# Continue with the Grid Search
grid_search = GridSearchCV(xgb_model, param_grid, cv=5)
grid_search.fit(X_train, y_train_mapped)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_xgb_model = grid_search.best_estimator_

# Make predictions on the test set using the best XGBoost model
y_pred_xgb = best_xgb_model.predict(X_test)

# Calculate the accuracy of the XGBoost model with the best hyperparameters
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print("Best Hyperparameters for XGBoost:", best_params)
print("XGBoost Accuracy:", accuracy_xgb)


## CatBoost

In [18]:
import lightgbm as lgb
def do_lgb(boosting_type):
    lgb_model = lgb.LGBMClassifier(
        random_state = 42,
        n_jobs = jobs,
        boosting_type = boosting_type,
    )
    lgb_model.fit(X_train, y_train)
    y_pred = lgb_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return [accuracy, lgb_model, y_pred]

In [19]:
%%time
accuracy, _, _ = do_lgb(boosting_type='gbdt')
print("Accuracy:", accuracy)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041503 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 997
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 106
[LightGBM] [Info] Start training from score -1.950295
[LightGBM] [Info] Start training from score -1.944661
[LightGBM] [Info] Start training from score -1.947161
[LightGBM] [Info] Start training from score -0.558523
Accuracy: 0.6410714285714286
CPU times: user 4.41 s, sys: 51.8 ms, total: 4.46 s
Wall time: 4.64 s


In [20]:
%%time
accuracy, _, _ = do_lgb(boosting_type='dart')
print("Accuracy:", accuracy)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004294 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 997
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 106
[LightGBM] [Info] Start training from score -1.950295
[LightGBM] [Info] Start training from score -1.944661
[LightGBM] [Info] Start training from score -1.947161
[LightGBM] [Info] Start training from score -0.558523
Accuracy: 0.6285714285714286
CPU times: user 3.27 s, sys: 31.8 ms, total: 3.3 s
Wall time: 3.32 s


# fails
import lightgbm as lgb
lgb_model = lgb.LGBMClassifier(random_state=42, n_jobs=jobs, boosting_type='rf')
lgb_model.fit(X_train, y_train)
y_pred = lgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

import lightgbm as lgb

# Create the LightGBM classifier
lgb_model = lgb.LGBMClassifier(random_state=42)

# Define the hyperparameter grid for Grid Search
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
}

# Perform Grid Search with cross-validation (cv=5) to find the best hyperparameters
grid_search = GridSearchCV(lgb_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_lgb_model = grid_search.best_estimator_

# Make predictions on the test set using the best LightGBM model
y_pred_lgb = best_lgb_model.predict(X_test)

# Calculate the accuracy of the LightGBM model with the best hyperparameters
accuracy_lgb = accuracy_score(y_test, y_pred_lgb)
print("Best Hyperparameters for LightGBM:", best_params)
print("LightGBM Accuracy:", accuracy_lgb)


## Neural Network

In [21]:
%%time
import numpy as np
import tensorflow.keras as keras
from keras.models import Sequential
from keras.layers import Dense, Input
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

keras.utils.set_random_seed(42)

y_train_integer = y_train-1 #.astype('float32')
y_test_integer = y_test-1 #.astype('float32')

X_train_small, X_val, y_train_integer, y_val_integer = train_test_split(X_train, y_train_integer, test_size=0.2, random_state=42)
X_train_small = X_train_small.astype('float32')
X_val = X_val.astype('float32')

# Create the Neural Network model
nn_model = Sequential()
nn_model.add(Input(shape=(X_train_small.shape[1],)))
nn_model.add(Dense(32, activation='relu'))
nn_model.add(Dense(16, activation='relu'))
nn_model.add(Dense(4, activation='softmax'))

# Compile the model
nn_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
nn_model.fit(X_train_small, y_train_integer, epochs=25, batch_size=64, validation_data=(X_val, y_val_integer)) #, verbose=1)

# Make predictions on the test set
y_pred_probabilities = nn_model.predict(X_test.astype('float32'))
y_pred_nn = np.argmax(y_pred_probabilities, axis=-1)

# Calculate the accuracy of the Neural Network model
accuracy_nn = accuracy_score(y_test_integer, y_pred_nn)
print("Neural Network Accuracy:", accuracy_nn)


2024-06-04 17:35:30.740829: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-04 17:35:34.243935: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Epoch 1/25
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.3990 - loss: 1.7413 - val_accuracy: 0.5652 - val_loss: 1.1505
Epoch 2/25
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5828 - loss: 1.1075 - val_accuracy: 0.5723 - val_loss: 1.0678
Epoch 3/25
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5948 - loss: 1.0341 - val_accuracy: 0.5786 - val_loss: 1.0259
Epoch 4/25
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6043 - loss: 0.9992 - val_accuracy: 0.5763 - val_loss: 1.0097
Epoch 5/25
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6057 - loss: 0.9840 - val_accuracy: 0.5750 - val_loss: 1.0014
Epoch 6/25
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6057 - loss: 0.9742 - val_accuracy: 0.5790 - val_loss: 0.9935
Epoch 7/25
[1m140/140[0m 

import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Assuming you have already defined X_train, X_test, y_train, and y_test

# Convert binary numbers to integer labels
label_encoder = LabelEncoder()
y_train_integer = label_encoder.transform(y_train)
y_test_integer = label_encoder.transform(y_test)

# Check unique values in y_train_integer and y_test_integer
print("Unique values in y_train:", np.unique(y_train_integer))
print("Unique values in y_test:", np.unique(y_test_integer))

print("Shape of y_train_integer:", y_train_integer.shape)
print("Shape of y_test_integer:", y_test_integer.shape)

# Manually split the data into training and validation sets
X_train, X_val, y_train_integer, y_val_integer = train_test_split(X_train, y_train_integer, test_size=0.2, random_state=42)

# Create the Neural Network model
nn_model = Sequential()
nn_model.add(Dense(32, activation='relu', input_shape=(X_train.shape[1],)))
nn_model.add(Dense(16, activation='relu'))
nn_model.add(Dense(3, activation='softmax'))

# Compile the model
nn_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
nn_model.fit(X_train, y_train_integer, epochs=2, batch_size=64, validation_data=(X_val, y_val_integer), verbose=0)

# Make predictions on the test set
y_pred_probabilities = nn_model.predict(X_test)
y_pred_nn = np.argmax(y_pred_probabilities, axis=-1)

# Calculate the accuracy of the Neural Network model
accuracy_nn = accuracy_score(y_test_integer, y_pred_nn)
print("Neural Network Accuracy:", accuracy_nn)


## LSTM

In [22]:
%%time
import numpy as np
import pandas as pd
import tensorflow.keras as keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Input
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

keras.utils.set_random_seed(42)

y_train_integer = y_train-1 #.astype('float32')
y_test_integer = y_test-1 #.astype('float32')

# Reshape the input data for LSTM
# TODO: LSTM with a single time step is not a sequence!
time_steps = 1  # Each sample is treated as a single time step
X_train_lstm = X_train.astype('float32').reshape(X_train.shape[0], time_steps, X_train.shape[1])
X_test_lstm = X_test.astype('float32').reshape(X_test.shape[0], time_steps, X_test.shape[1])
# y_train_lstm = y_train_integer.reshape(y_train_integer.shape[0], time_steps, y_train_integer.shape[1])

# Create the Neural Network model
lstm_model = Sequential()
lstm_model.add(Input(shape=(time_steps, X_train_lstm.shape[2])))
lstm_model.add(LSTM(32))
lstm_model.add(Dense(16, activation='relu'))
lstm_model.add(Dense(4, activation='softmax'))

# Compile the model
lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
lstm_model.fit(X_train_lstm, y_train_integer, epochs=25, batch_size=64)

# Make predictions on the test set
y_pred_probabilities = lstm_model.predict(X_test_lstm)
y_pred_lstm = np.argmax(y_pred_probabilities, axis=-1)

# Calculate the accuracy of the LSTM model
accuracy_lstm = accuracy_score(y_test_integer, y_pred_lstm)
print("LSTM Accuracy:", accuracy_lstm)

Epoch 1/25
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.5296 - loss: 1.1914
Epoch 2/25
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5808 - loss: 1.0523
Epoch 3/25
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6095 - loss: 0.9664
Epoch 4/25
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6198 - loss: 0.9398
Epoch 5/25
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6228 - loss: 0.9285
Epoch 6/25
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6249 - loss: 0.9221
Epoch 7/25
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6304 - loss: 0.9155
Epoch 8/25
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6306 - loss: 0.9086
Epoch 9/25
[1m175/175[0m [32m━━━━━━━━

import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# ... (Previous code for reading and preprocessing the data)

# Convert the data into numerical format
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

# Reshape the input data for LSTM
time_steps = 1  # Each sample is treated as a single time step
X_train_lstm = X_train.reshape(X_train.shape[0], time_steps, X_train.shape[1])
X_test_lstm = X_test.reshape(X_test.shape[0], time_steps, X_test.shape[1])

# Create the LSTM model
lstm_model = Sequential()
lstm_model.add(LSTM(32, input_shape=(time_steps, X_train.shape[1])))
lstm_model.add(Dense(16, activation='relu'))
lstm_model.add(Dense(3, activation='sigmoid'))  # Assuming binary classification

# Compile the model
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
lstm_model.fit(X_train_lstm, y_train, epochs=50, batch_size=32, verbose=1)

# Make predictions on the test set
y_pred_lstm = lstm_model.predict(X_test_lstm)
y_pred_lstm = np.round(y_pred_lstm).astype(int).flatten()  # Convert probabilities to binary predictions

# Calculate the accuracy of the LSTM model
accuracy_lstm = accuracy_score(y_test, y_pred_lstm)
print("LSTM Accuracy:", accuracy_lstm)
