In [1]:
%config Completer.use_jedi = False
# fixes firefox tab completion

In [2]:
# Importing the required libraries
import numpy as np
import pandas as pd
from math import log2, sqrt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from scipy.fft import fft, ifft
from scipy.special import erfc

In [3]:


# Concatenate data
def concatenateData(df, num_concats):
    if num_concats == 0: # do nothing
        return df.rename(columns={'binary_number': 'Concatenated_Data'})
    new_df = pd.DataFrame({
        'Concatenated_Data': [''] * (len(df) // num_concats), 
        'label': [0] * (len(df) // num_concats)
    })

    # Loop through each group of num_concats rows and concatenate their 'binary_number' strings
    for i in range(0, len(df), num_concats):
        new_df.iloc[i // num_concats, 0] = ''.join(df['binary_number'][i:i + num_concats])
        new_df.iloc[i // num_concats, 1] = df['label'][i]

    return new_df

# Calculate Shannon entropy for each concatenated binary sequence
def shannon_entropy(binary_string):
    if len(binary_string) % 2 != 0:
        raise ValueError("Binary string length must be a multiple of 2.")
    
    patterns = ['00', '10', '11', '01']
    frequency = {pattern: 0 for pattern in patterns}
    
    for i in range(0, len(binary_string), 2):
        segment = binary_string[i:i+2]
        if segment in patterns:
            frequency[segment] += 1
    
    total_segments = sum(frequency.values())
    
    entropy = 0
    for count in frequency.values():
        if count > 0:
            probability = count / total_segments
            entropy -= probability * log2(probability)
    
    return entropy


def classic_spectral_test(bit_string):
    bit_array = 2 * np.array([int(bit) for bit in bit_string]) - 1
    dft = fft(bit_array)
    n_half = len(bit_string) // 2 + 1
    mod_dft = np.abs(dft[:n_half])
    threshold = np.sqrt(np.log(1 / 0.05) / len(bit_string))
    peaks_below_threshold = np.sum(mod_dft < threshold)
    expected_peaks = 0.95 * n_half
    d = (peaks_below_threshold - expected_peaks) / np.sqrt(len(bit_string) * 0.95 * 0.05)
    p_value = erfc(np.abs(d) / np.sqrt(2)) / 2
    return d

def frequency_test(bit_string):
    n = len(bit_string)
    count_ones = bit_string.count('1')
    count_zeros = bit_string.count('0')
    
    # The test statistic
    s = (count_ones - count_zeros) / sqrt(n)
    
    # The p-value
    p_value = erfc(abs(s) / sqrt(2))
    
    return p_value

def runs_test(bit_string):
    n = len(bit_string)
    runs = 1  # Start with the first run
    for i in range(1, n):
        if bit_string[i] != bit_string[i - 1]:
            runs += 1
    
    n0 = bit_string.count('0')
    n1 = bit_string.count('1')
    
    # Expected number of runs
    expected_runs = (2 * n0 * n1 / n) + 1
    variance_runs = (2 * n0 * n1 * (2 * n0 * n1 - n)) / (n ** 2 * (n - 1))
    
    # The test statistic
    z = (runs - expected_runs) / sqrt(variance_runs)
    
    # The p-value
    p_value = erfc(abs(z) / sqrt(2))
    
    return p_value

def linear_complexity(bit_string, M=500):
    # Perform linear complexity test with block size M
    n = len(bit_string)
    bit_array = np.array([int(bit) for bit in bit_string])
    lc = 0  # Initialize linear complexity
    
    # Process blocks of size M
    for i in range(0, n, M):
        block = bit_array[i:i+M]
        if len(block) < M:
            continue
        
        lc_block = 0
        for j in range(M):
            if block[j] == 1:
                lc_block = j + 1
        
        lc += lc_block
    
    lc = lc / (n / M)
    return lc

def autocorrelation_test(bit_string, lag=1):
    n = len(bit_string)
    bit_array = np.array([int(bit) for bit in bit_string])
    autocorrelation = np.correlate(bit_array, np.roll(bit_array, lag), mode='valid')[0]
    return autocorrelation / n

def maurer_universal_test(bit_string):
    k = 6
    l = 5
    q = 20
    bit_array = np.array([int(bit) for bit in bit_string])
    max_val = 2**k
    init_subseq = bit_array[:q]
    rest_subseq = bit_array[q:]
    d = {}
    for i in range(len(init_subseq) - k + 1):
        d[tuple(init_subseq[i:i+k])] = i
    t = []
    for i in range(len(rest_subseq) - k + 1):
        subseq = tuple(rest_subseq[i:i+k])
        if subseq in d:
            t.append(i - d[subseq])
            d[subseq] = i
    if not t:
        return 0
    t = np.array(t)
    log_avg = np.mean(np.log2(t))
    return log_avg - np.log2(q)

def binary_matrix_rank_test(bit_string, M=32, Q=32):
    bit_array = np.array([int(bit) for bit in bit_string])
    num_matrices = len(bit_array) // (M * Q)
    ranks = []
    for i in range(num_matrices):
        matrix = bit_array[i*M*Q:(i+1)*M*Q].reshape((M, Q))
        rank = np.linalg.matrix_rank(matrix)
        ranks.append(rank)
    return np.mean(ranks)

def cumulative_sums_test(bit_string):
    bit_array = np.array([int(bit) for bit in bit_string])
    adjusted = 2 * bit_array - 1
    cumulative_sum = np.cumsum(adjusted)
    max_excursion = np.max(np.abs(cumulative_sum))
    return max_excursion

def longest_run_ones_test(bit_string, block_size=100):
    bit_array = np.array([int(bit) for bit in bit_string])
    num_blocks = len(bit_array) // block_size
    max_runs = []
    for i in range(num_blocks):
        block = bit_array[i*block_size:(i+1)*block_size]
        max_run = max([len(list(g)) for k, g in itertools.groupby(block) if k == 1])
        max_runs.append(max_run)
    return np.mean(max_runs)

def random_excursions_test(bit_string):
    bit_array = np.array([int(bit) for bit in bit_string])
    bit_array = 2 * bit_array - 1  # Convert to ±1

    cumulative_sum = np.cumsum(bit_array)
    states = np.unique(cumulative_sum)

    if 0 not in states:
        states = np.append(states, 0)
    state_counts = {state: 0 for state in states}
    for state in cumulative_sum:
        state_counts[state] += 1

    state_counts[0] -= 1  # Adjust for zero state
    pi = [0.5 * (1 - (1 / (2 * state + 1)**2)) for state in states]
    x = np.sum([(state_counts[state] - len(bit_string) * pi[i])**2 / (len(bit_string) * pi[i]) for i, state in enumerate(states)])

    return x


def unique_subsequences(bit_string, length=4):
    bit_array = np.array([int(bit) for bit in bit_string])
    n = len(bit_array)
    subsequences = set()
    
    for i in range(n - length + 1):
        subseq = tuple(bit_array[i:i+length])
        subsequences.add(subseq)
    
    return len(subsequences)

def sample_entropy(bit_string, m=2, r=0.2):
    bit_array = np.array([int(bit) for bit in bit_string])
    N = len(bit_array)
    
    def _phi(m):
        x = np.array([bit_array[i:i+m] for i in range(N - m + 1)])
        C = np.sum(np.all(np.abs(x[:, None] - x) <= r, axis=2), axis=0) / (N - m + 1.0)
        return np.sum(C) / (N - m + 1.0)
    
    return -np.log(_phi(m + 1) / _phi(m))

def permutation_entropy(bit_string, order=3):
    bit_array = np.array([int(bit) for bit in bit_string])
    n = len(bit_array)
    
    permutations = np.array(list(itertools.permutations(range(order))))
    c = np.zeros(len(permutations))
    
    for i in range(n - order + 1):
        sorted_index_array = tuple(np.argsort(bit_array[i:i+order]))
        for j, p in enumerate(permutations):
            if np.array_equal(p, sorted_index_array):
                c[j] += 1
    
    c = c / (n - order + 1)
    pe = -np.sum(c * np.log2(c + np.finfo(float).eps))
    return pe

def lyapunov_exponent(bit_string, m=2, t=1):
    bit_array = np.array([int(bit) for bit in bit_string])
    N = len(bit_array)
    
    def _phi(m):
        x = np.array([bit_array[i:i+m] for i in range(N - m + 1)])
        C = np.sum(np.all(np.abs(x[:, None] - x) <= t, axis=2), axis=0) / (N - m + 1.0)
        return np.sum(np.log(C + np.finfo(float).eps)) / (N - m + 1.0)
    
    return abs(_phi(m) - _phi(m + 1))

def entropy_rate(bit_string, k=2):
    bit_array = np.array([int(bit) for bit in bit_string])
    n = len(bit_array)
    prob = {}
    
    for i in range(n - k + 1):
        subseq = tuple(bit_array[i:i + k])
        if subseq in prob:
            prob[subseq] += 1
        else:
            prob[subseq] = 1
    
    for key in prob:
        prob[key] /= (n - k + 1)
    
    entropy_rate = -sum(p * log2(p) for p in prob.values())
    return entropy_rate

# Apply randomness tests
def apply_randomness_tests(df, tests):
    if not tests:
        raise ValueError("No randomness tests specified.")

    test_functions = {
        'autocorrelation': autocorrelation_test,
        'cumulative_sums': cumulative_sums_test,
        'spectral_test': classic_spectral_test,
        'frequency_test': frequency_test,
        'runs_test': runs_test,
        'shannon_entropy': shannon_entropy
    }

    for test in tests:
        if test not in test_functions:
            raise ValueError(f"Invalid randomness test: {test}")
        df[test] = df['Concatenated_Data'].apply(test_functions[test])

    return df

# Preprocess data
def preprocess_data(df, num_concats, tests):
    df = concatenateData(df, num_concats)
    processed_df = apply_randomness_tests(df, tests)
    
    # Convert concatenated binary strings into separate columns
    df_features = pd.DataFrame(processed_df['Concatenated_Data'].apply(list).tolist()).astype(int).astype(bool)
    processed_df = pd.concat([processed_df.drop(columns='Concatenated_Data'), df_features], axis=1)

    return processed_df

# Calculate min-entropy
def calculate_min_entropy(sequence):
    sequence = np.asarray(sequence, dtype=float)  # Convert sequence to float
    p = np.mean(sequence)  # Proportion of ones
    max_prob = max(p, 1 - p)
    if max_prob == 0:  # Handle the case where all bits are the same
        return 0
    min_entropy = -np.log2(max_prob)
    return min_entropy



In [4]:
# Main
file_path = 'AI_2qubits_training_data.txt'

# Read the data from the file
data = []
with open(file_path, 'r') as file:
    for line in file:
        if line.strip():
            binary_number, label = line.strip().split()
            data.append((binary_number, int(label)))

# Convert the data into a DataFrame
df = pd.DataFrame(data, columns=['binary_number', 'label'])


In [5]:
# try generating more training data - any sequence of random bits from the bit string also came from the quantum computer

def binary_series_to_string(x):
    return ''.join(x['binary_number'].to_list())

# work around deprecation warning
squashed_string_df = df.groupby('label')[['label','binary_number']].apply(binary_series_to_string)

def window(word, size=1, gap=1): return [word[i:i+size] for i in range(0, len(word)-size + 1, gap)]

for i in range(len(squashed_string_df)):
    squashed_string_df.iloc[i] = window(squashed_string_df.iloc[i], 100, 2)  # sample 2 qubits at a time

# TODO: assume bits are processed in a single stream - k-fold cross validation should split into k chunks ahead of time to preserve time dependency information?
df = pd.DataFrame(squashed_string_df.explode(), columns=['Concatenated_Data']).reset_index()

In [6]:
tests_to_apply = ['spectral_test', 'shannon_entropy', 'frequency_test', 'runs_test', 'autocorrelation']

# Preprocess data and apply randomness tests
preprocessed_df = preprocess_data(df, num_concats=0, tests=tests_to_apply)

In [7]:
preprocessed_df[preprocessed_df.select_dtypes(np.float64).columns] = preprocessed_df.select_dtypes(np.float64).astype(np.float32)

In [8]:
preprocessed_df

Unnamed: 0,label,spectral_test,shannon_entropy,frequency_test,runs_test,autocorrelation,0,1,2,3,...,90,91,92,93,94,95,96,97,98,99
0,1,-21.771553,1.935451,0.423711,0.120217,0.33,False,True,False,False,...,True,True,True,True,True,True,False,False,True,False
1,1,-21.771553,1.935451,0.423711,0.079056,0.33,False,False,True,True,...,True,True,True,True,False,False,True,False,False,True
2,1,-22.230385,1.945597,0.317311,0.084331,0.34,True,True,True,True,...,True,True,False,False,True,False,False,True,True,False
3,1,-21.771553,1.963233,0.423711,0.120217,0.33,True,True,True,True,...,False,False,True,False,False,True,True,False,False,True
4,1,-22.230385,1.976281,0.548506,0.168839,0.31,True,True,True,True,...,True,False,False,True,True,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699799,4,-22.230385,1.702839,0.027807,0.109146,0.19,False,False,True,True,...,True,False,True,True,True,True,True,True,False,False
699800,4,-22.230385,1.702839,0.027807,0.069771,0.19,True,True,False,True,...,True,True,True,True,True,True,False,False,False,False
699801,4,-22.230385,1.702839,0.027807,0.069771,0.19,False,True,False,False,...,True,True,True,True,False,False,False,False,True,True
699802,4,-22.230385,1.702839,0.027807,0.069771,0.19,False,False,False,False,...,True,True,False,False,False,False,True,True,False,True


Train/test splits should not be shuffled since the order that the bits were sampled matters. However, we need to take splits from each of the quantum computers, so the splits really should be interleaved. 

Later we'll use k-fold CV to get a better result.

In [9]:
# Split the data into features (X) and labels (y)
X = preprocessed_df.drop(columns='label').values
y = preprocessed_df['label'].values.astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Calculate min-entropy for each sequence in the training and testing datasets
# min-entropy also takes into account the bitstring! is this ok?
min_entropy_train = np.apply_along_axis(calculate_min_entropy, 1, X_train)
min_entropy_test = np.apply_along_axis(calculate_min_entropy, 1, X_test)

# Add min-entropy as a feature
X_train = np.column_stack((X_train, min_entropy_train))
X_test = np.column_stack((X_test, min_entropy_test))

print(preprocessed_df)
print(X_train)
print(X_test)

        label  spectral_test  shannon_entropy  frequency_test  runs_test  \
0           1     -21.771553         1.935451        0.423711   0.120217   
1           1     -21.771553         1.935451        0.423711   0.079056   
2           1     -22.230385         1.945597        0.317311   0.084331   
3           1     -21.771553         1.963233        0.423711   0.120217   
4           1     -22.230385         1.976281        0.548506   0.168839   
...       ...            ...              ...             ...        ...   
699799      4     -22.230385         1.702839        0.027807   0.109146   
699800      4     -22.230385         1.702839        0.027807   0.069771   
699801      4     -22.230385         1.702839        0.027807   0.069771   
699802      4     -22.230385         1.702839        0.027807   0.069771   
699803      4     -22.230385         1.717977        0.071861   0.051254   

        autocorrelation      0      1      2      3  ...     90     91     92  \
0     

Scale all input data for fairness. Fit it on the training data only to prevent leakage. Only scale the nonbinary columns. (Later)

With this much data, we need multithreading. Get number of cores now.

In [10]:
import joblib

N_CORES = joblib.cpu_count(only_physical_cores=True)
print(f"Number of physical cores: {N_CORES}")

jobs = N_CORES // 2
print(f"Using {jobs} jobs")

Number of physical cores: 4
Using 2 jobs


# model testing

For now, do a single run of a parallelized model. The goal is to beat 63%, and achieve >95%.

## SVM

Use the SGDClassifier with hinge loss to get an parallelized SVM. Input data should be scaled to avoid divergence.

In [11]:
%%time
from sklearn.linear_model import SGDClassifier
sgd_model = SGDClassifier(n_jobs=jobs, random_state=42, verbose=1)
sgd_model.fit(X_train, y_train)
y_pred = sgd_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


-- Epoch 1
-- Epoch 1
Norm: 24.87, NNZs: 106, Bias: -15.478192, T: 559843, Avg. loss: 7.349713
Total training time: 0.18 seconds.
-- Epoch 2
Norm: 18.70, NNZs: 106, Bias: -18.909748, T: 559843, Avg. loss: 6.605183
Total training time: 0.19 seconds.
-- Epoch 2
Norm: 18.16, NNZs: 106, Bias: -16.298848, T: 1119686, Avg. loss: 1.056295
Total training time: 0.36 seconds.
-- Epoch 3
Norm: 13.51, NNZs: 106, Bias: -19.523071, T: 1119686, Avg. loss: 0.976348
Total training time: 0.38 seconds.
-- Epoch 3
Norm: 14.89, NNZs: 106, Bias: -16.735968, T: 1679529, Avg. loss: 0.746303
Total training time: 0.55 seconds.
-- Epoch 4
Norm: 11.01, NNZs: 106, Bias: -19.807212, T: 1679529, Avg. loss: 0.698437
Total training time: 0.57 seconds.
-- Epoch 4
Norm: 12.87, NNZs: 106, Bias: -17.018356, T: 2239372, Avg. loss: 0.621026
Total training time: 0.73 seconds.
-- Epoch 5
Norm: 9.44, NNZs: 106, Bias: -19.982463, T: 2239372, Avg. loss: 0.585542
Total training time: 0.76 seconds.
-- Epoch 5
Norm: 11.44, NNZs: 10

Norm: 3.30, NNZs: 106, Bias: -19.235707, T: 19034662, Avg. loss: 0.330122
Total training time: 6.59 seconds.
-- Epoch 35
Norm: 3.23, NNZs: 106, Bias: -16.304192, T: 20714191, Avg. loss: 0.327618
Total training time: 6.75 seconds.
-- Epoch 38
Norm: 3.26, NNZs: 106, Bias: -19.187580, T: 19594505, Avg. loss: 0.328983
Total training time: 6.78 seconds.
-- Epoch 36
Norm: 3.17, NNZs: 106, Bias: -16.239857, T: 21274034, Avg. loss: 0.326646
Total training time: 6.93 seconds.
-- Epoch 39
Norm: 3.20, NNZs: 106, Bias: -19.143339, T: 20154348, Avg. loss: 0.327817
Total training time: 6.97 seconds.
-- Epoch 37
Norm: 3.12, NNZs: 106, Bias: -16.176292, T: 21833877, Avg. loss: 0.325790
Total training time: 7.11 seconds.
-- Epoch 40
Norm: 3.16, NNZs: 106, Bias: -19.096856, T: 20714191, Avg. loss: 0.326790
Total training time: 7.16 seconds.
-- Epoch 38
Norm: 3.05, NNZs: 106, Bias: -16.113901, T: 22393720, Avg. loss: 0.324960
Total training time: 7.28 seconds.
-- Epoch 41
Norm: 3.11, NNZs: 106, Bias: -19

Norm: 18.05, NNZs: 106, Bias: 26.880358, T: 13436232, Avg. loss: 0.670756
Total training time: 5.27 seconds.
-- Epoch 25
Norm: 11.06, NNZs: 106, Bias: -8.503478, T: 19034662, Avg. loss: 0.308395
Total training time: 6.05 seconds.
Convergence after 34 epochs took 6.05 seconds
Norm: 18.01, NNZs: 106, Bias: 26.894263, T: 13996075, Avg. loss: 0.667337
Total training time: 5.48 seconds.
-- Epoch 26
Norm: 17.96, NNZs: 106, Bias: 26.906848, T: 14555918, Avg. loss: 0.665278
Total training time: 5.68 seconds.
-- Epoch 27
Norm: 17.94, NNZs: 106, Bias: 26.915650, T: 15115761, Avg. loss: 0.662719
Total training time: 5.89 seconds.
-- Epoch 28
Norm: 17.91, NNZs: 106, Bias: 26.923469, T: 15675604, Avg. loss: 0.660087
Total training time: 6.09 seconds.
-- Epoch 29
Norm: 17.88, NNZs: 106, Bias: 26.933494, T: 16235447, Avg. loss: 0.658829
Total training time: 6.29 seconds.
-- Epoch 30
Norm: 17.85, NNZs: 106, Bias: 26.940670, T: 16795290, Avg. loss: 0.656329
Total training time: 6.50 seconds.
-- Epoch 3

[Parallel(n_jobs=2)]: Done   4 out of   4 | elapsed:   19.2s finished


Accuracy: 0.6382349368752724
CPU times: user 34.5 s, sys: 1.52 s, total: 36 s
Wall time: 22.3 s


Nonparallelized SVC that scales quadratically with samples

In [12]:
%%time
from sklearn.svm import LinearSVC
svc_model = LinearSVC(random_state=42, verbose=1)
svc_model.fit(X_train, y_train)
y_pred = svc_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


[LibLinear]iter  1 act 2.847e+05 pre 2.847e+05 delta 3.119e-02 f 5.598e+05 |g| 1.826e+07 CG   1
cg reaches trust region boundary
iter  2 act 3.585e+03 pre 3.585e+03 delta 1.247e-01 f 2.751e+05 |g| 1.214e+05 CG   1
cg reaches trust region boundary
iter  3 act 1.068e+04 pre 1.068e+04 delta 2.847e-01 f 2.715e+05 |g| 1.628e+05 CG   2
cg reaches trust region boundary
iter  4 act 8.020e+03 pre 7.753e+03 delta 2.980e-01 f 2.608e+05 |g| 8.419e+04 CG   3
iter  5 act 4.262e+02 pre 4.155e+02 delta 2.980e-01 f 2.528e+05 |g| 1.037e+05 CG   2
cg reaches trust region boundary
iter  6 act 1.436e+03 pre 1.424e+03 delta 4.560e-01 f 2.524e+05 |g| 1.629e+04 CG   3
cg reaches trust region boundary
iter  7 act 1.691e+03 pre 1.705e+03 delta 5.279e-01 f 2.510e+05 |g| 1.714e+04 CG   4
cg reaches trust region boundary
iter  8 act 1.906e+03 pre 1.819e+03 delta 7.346e-01 f 2.493e+05 |g| 3.919e+04 CG   5
cg reaches trust region boundary
iter  9 act 1.269e+03 pre 1.237e+03 delta 9.090e-01 f 2.474e+05 |g| 2.163e+04 

cg reaches trust region boundary
iter  5 act 4.403e+04 pre 4.403e+04 delta 5.796e-01 f 5.241e+05 |g| 2.327e+05 CG   2
cg reaches trust region boundary
iter  6 act 3.146e+04 pre 3.145e+04 delta 8.367e-01 f 4.801e+05 |g| 1.001e+05 CG   2
cg reaches trust region boundary
iter  7 act 1.111e+04 pre 1.109e+04 delta 1.008e+00 f 4.486e+05 |g| 4.690e+04 CG   3
cg reaches trust region boundary
iter  8 act 9.898e+03 pre 9.906e+03 delta 1.156e+00 f 4.375e+05 |g| 3.583e+04 CG   5
cg reaches trust region boundary
iter  9 act 1.024e+04 pre 1.006e+04 delta 1.662e+00 f 4.276e+05 |g| 3.130e+04 CG   5
cg reaches trust region boundary
iter 10 act 6.485e+03 pre 6.401e+03 delta 2.000e+00 f 4.174e+05 |g| 7.912e+04 CG   7
iter 11 act 1.149e+03 pre 1.141e+03 delta 2.000e+00 f 4.109e+05 |g| 5.068e+04 CG   4
cg reaches trust region boundary
iter 12 act 4.063e+03 pre 4.059e+03 delta 2.758e+00 f 4.098e+05 |g| 8.104e+03 CG   5
iter 13 act 1.209e+03 pre 1.205e+03 delta 2.758e+00 f 4.057e+05 |g| 6.625e+04 CG   3
cg r

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

# Perform hyperparameter tuning with GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4],
}

svm_model = SVC(random_state=42)
grid_search = GridSearchCV(svm_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Best Hyperparameters:", best_params)
print("Accuracy:", accuracy)


from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

# Perform hyperparameter tuning with RandomizedSearchCV
param_dist = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto', 0.1, 1],
    'kernel': ['linear', 'rbf'],
}

svm_model = SVC(random_state=42)
random_search = RandomizedSearchCV(svm_model, param_distributions=param_dist, n_iter=5, cv=5)
random_search.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Best Hyperparameters:", best_params)
print("Accuracy:", accuracy)

## Random Forest

Min-entropy was previously calculated, no need to recompute it

In [13]:
%%time
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42, n_jobs=jobs, verbose=1)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   32.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  1.1min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    1.2s


Accuracy: 0.6291609805588699
CPU times: user 2min 17s, sys: 3.55 s, total: 2min 20s
Wall time: 1min 11s


[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    3.1s finished


from sklearn.ensemble import RandomForestClassifier

def calculate_min_entropy(sequence):
    sequence = np.asarray(sequence, dtype=float)  # Convert sequence to float
    p = np.mean(sequence)  # Proportion of ones
    max_prob = max(p, 1 - p)
    if max_prob == 0:  # Handle the case where all bits are the same
        return 0
    min_entropy = -np.log2(max_prob)
    return min_entropy




vectorized_entropy = np.vectorize(calculate_min_entropy, signature='(n)->()')

# Calculate min-entropy for each sequence in the training and testing datasets
min_entropy_train = vectorized_entropy(X_train)
min_entropy_test = vectorized_entropy(X_test)

X_train_with_entropy = np.column_stack((X_train, min_entropy_train))
X_test_with_entropy = np.column_stack((X_test, min_entropy_test))
# Create the Random Forest classifier
rf_model = RandomForestClassifier(random_state=42)

# Train the model
rf_model.fit(X_train_with_entropy, y_train)

# Make predictions on the test set
y_pred_rf = rf_model.predict(X_test_with_entropy)

# Calculate the accuracy of the Random Forest model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Create the Random Forest classifier
rf_model = RandomForestClassifier(random_state=42)

# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [50, 100, 200],          # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],         # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],         # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],           # Minimum number of samples required to be at a leaf node
    'max_features': ['auto', 'sqrt'],       # Number of features to consider when looking for the best split
}

# Perform GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(rf_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred_rf = best_model.predict(X_test)

# Calculate the accuracy of the Random Forest model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)
print("Best Hyperparameters:", best_params)


## Gradient Boosting

In [14]:
%%time
from sklearn.ensemble import GradientBoostingClassifier
gb_model = GradientBoostingClassifier(random_state=42, verbose=1)
gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

      Iter       Train Loss   Remaining Time 
         1           1.1108           24.04m
         2           1.0778           22.78m
         3           1.0512           21.91m
         4           1.0299           21.41m
         5           1.0125           21.07m
         6           0.9977           20.75m
         7           0.9857           20.53m
         8           0.9756           20.37m
         9           0.9667           20.15m
        10           0.9587           19.91m
        20           0.9207           17.41m
        30           0.9086           15.15m
        40           0.9003           12.95m
        50           0.8930           10.74m
        60           0.8868            8.56m
        70           0.8815            6.40m
        80           0.8770            4.26m
        90           0.8723            2.13m
       100           0.8683            0.00s
Accuracy: 0.6461299933552919
CPU times: user 21min 18s, sys: 1.02 s, total: 21min 19s
Wall time: 21

A parallelized version is available as Histogram Gradient Boosting model

In [15]:
%%time
from sklearn.ensemble import HistGradientBoostingClassifier
hgb_model = HistGradientBoostingClassifier(random_state=42, verbose=1)
hgb_model.fit(X_train, y_train)
y_pred = hgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Binning 0.427 GB of training data: 0.978 s
Binning 0.047 GB of validation data: 0.038 s
Fitting gradient boosted rounds:
[1/100] 4 trees, 124 leaves (31 on avg), max depth = 7, train loss: 1.08670, val loss: 1.08687, in 0.420s
[2/100] 4 trees, 124 leaves (31 on avg), max depth = 7, train loss: 1.04099, val loss: 1.04134, in 0.420s
[3/100] 4 trees, 124 leaves (31 on avg), max depth = 7, train loss: 1.00740, val loss: 1.00793, in 0.421s
[4/100] 4 trees, 124 leaves (31 on avg), max depth = 7, train loss: 0.98205, val loss: 0.98271, in 0.427s
[5/100] 4 trees, 124 leaves (31 on avg), max depth = 8, train loss: 0.96280, val loss: 0.96365, in 0.423s
[6/100] 4 trees, 124 leaves (31 on avg), max depth = 8, train loss: 0.94763, val loss: 0.94871, in 0.426s
[7/100] 4 trees, 124 leaves (31 on avg), max depth = 9, train loss: 0.93574, val loss: 0.93690, in 0.460s
[8/100] 4 trees, 124 leaves (31 on avg), max depth = 9, train loss: 0.92626, val loss: 0.92751, in 0.429s
[9/100] 4 trees, 124 leaves (31

[77/100] 4 trees, 124 leaves (31 on avg), max depth = 9, train loss: 0.82255, val loss: 0.83665, in 0.543s
[78/100] 4 trees, 124 leaves (31 on avg), max depth = 10, train loss: 0.82194, val loss: 0.83622, in 0.498s
[79/100] 4 trees, 124 leaves (31 on avg), max depth = 9, train loss: 0.82129, val loss: 0.83574, in 0.543s
[80/100] 4 trees, 124 leaves (31 on avg), max depth = 10, train loss: 0.82068, val loss: 0.83532, in 0.464s
[81/100] 4 trees, 124 leaves (31 on avg), max depth = 10, train loss: 0.82014, val loss: 0.83496, in 0.455s
[82/100] 4 trees, 124 leaves (31 on avg), max depth = 10, train loss: 0.81950, val loss: 0.83451, in 0.528s
[83/100] 4 trees, 124 leaves (31 on avg), max depth = 13, train loss: 0.81895, val loss: 0.83413, in 0.498s
[84/100] 4 trees, 124 leaves (31 on avg), max depth = 9, train loss: 0.81837, val loss: 0.83377, in 0.475s
[85/100] 4 trees, 124 leaves (31 on avg), max depth = 8, train loss: 0.81774, val loss: 0.83328, in 0.539s
[86/100] 4 trees, 124 leaves (31

from sklearn.ensemble import GradientBoostingClassifier



# Create the Gradient Boosting classifier
gb_model = GradientBoostingClassifier(random_state=42)

# Train the model
# Train the model with the new feature included
gb_model.fit(X_train, y_train)

# Make predictions on the test set with the new feature included
y_pred_gb = gb_model.predict(X_test)

# Calculate the accuracy of the Gradient Boosting model with the new feature
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print("Gradient Boosting Accuracy with Min-Entropy Feature:", accuracy_gb)



GridSearch

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier


def calculate_min_entropy(sequence):
    sequence = np.asarray(sequence, dtype=float)  # Convert sequence to float
    p = np.mean(sequence)  # Proportion of ones
    max_prob = max(p, 1 - p)
    if max_prob == 0:  # Handle the case where all bits are the same
        return 0
    min_entropy = -np.log2(max_prob)
    return min_entropy




vectorized_entropy = np.vectorize(calculate_min_entropy, signature='(n)->()')

# Calculate min-entropy for each sequence in the training and testing datasets
min_entropy_train = vectorized_entropy(X_train)
min_entropy_test = vectorized_entropy(X_test)

X_train_with_entropy = np.column_stack((X_train, min_entropy_train))
X_test_with_entropy = np.column_stack((X_test, min_entropy_test))


# Create the Gradient Boosting classifier
gb_model = GradientBoostingClassifier(random_state=42)

# Define the hyperparameter grid for Grid Search
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
}

# Perform Grid Search with cross-validation (cv=5) to find the best hyperparameters
grid_search = GridSearchCV(gb_model, param_grid, cv=5)
grid_search.fit(X_train_with_entropy, y_train)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred_gb = best_model.predict(X_test_with_entropy)

# Calculate the accuracy of the Gradient Boosting model with the best hyperparameters
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print("Best Hyperparameters:", best_params)
print("Gradient Boosting Accuracy:", accuracy_gb)


Other gradient boosted tree methods may have different runtime/performance

## XGBoost

In [16]:
%%time
from xgboost import XGBClassifier
xgb_model = XGBClassifier(random_state=42, verbosity=1, n_jobs=jobs, tree_method='exact')
y_train_mapped = y_train-1
y_test_mapped = y_test-1
xgb_model.fit(X_train, y_train_mapped)
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test_mapped, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6704867784597138
CPU times: user 13min 22s, sys: 464 ms, total: 13min 22s
Wall time: 6min 42s


In [17]:
%%time
from xgboost import XGBClassifier
xgb_model = XGBClassifier(random_state=42, verbosity=1, n_jobs=jobs, tree_method='approx')
y_train_mapped = y_train-1
y_test_mapped = y_test-1
xgb_model.fit(X_train, y_train_mapped)
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test_mapped, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6709726280892534
CPU times: user 10min 8s, sys: 7.74 s, total: 10min 15s
Wall time: 5min 9s


In [18]:
%%time
from xgboost import XGBClassifier
xgb_model = XGBClassifier(random_state=42, verbosity=1, n_jobs=jobs, tree_method='hist')
y_train_mapped = y_train-1
y_test_mapped = y_test-1
xgb_model.fit(X_train, y_train_mapped)
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test_mapped, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6712226977515164
CPU times: user 1min 11s, sys: 152 ms, total: 1min 11s
Wall time: 36.7 s


import xgboost as xgb
from sklearn.model_selection import GridSearchCV


# Create the XGBoost classifier
xgb_model = xgb.XGBClassifier(random_state=42)

# Define the hyperparameter grid for Grid Search
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
}

# Map classes to [0, 1, 2]
y_train_mapped = y_train - 1  # This will change classes [1, 2, 3] to [0, 1, 2]

# Continue with the Grid Search
grid_search = GridSearchCV(xgb_model, param_grid, cv=5)
grid_search.fit(X_train, y_train_mapped)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_xgb_model = grid_search.best_estimator_

# Make predictions on the test set using the best XGBoost model
y_pred_xgb = best_xgb_model.predict(X_test)

# Calculate the accuracy of the XGBoost model with the best hyperparameters
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print("Best Hyperparameters for XGBoost:", best_params)
print("XGBoost Accuracy:", accuracy_xgb)


## CatBoost

In [19]:
%%time
import lightgbm as lgb
lgb_model = lgb.LGBMClassifier(random_state=42, n_jobs=jobs, boosting_type='gbdt')
lgb_model.fit(X_train, y_train)
y_pred = lgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.080542 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1013
[LightGBM] [Info] Number of data points in the train set: 559843, number of used features: 106
[LightGBM] [Info] Start training from score -1.945730
[LightGBM] [Info] Start training from score -1.946455
[LightGBM] [Info] Start training from score -1.945855
[LightGBM] [Info] Start training from score -0.559539
Accuracy: 0.6617486299754932
CPU times: user 1min 5s, sys: 620 ms, total: 1min 6s
Wall time: 34.7 s


In [20]:
%%time
import lightgbm as lgb
lgb_model = lgb.LGBMClassifier(random_state=42, n_jobs=jobs, boosting_type='dart')
lgb_model.fit(X_train, y_train)
y_pred = lgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066279 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1013
[LightGBM] [Info] Number of data points in the train set: 559843, number of used features: 106
[LightGBM] [Info] Start training from score -1.945730
[LightGBM] [Info] Start training from score -1.946455
[LightGBM] [Info] Start training from score -1.945855
[LightGBM] [Info] Start training from score -0.559539
Accuracy: 0.6453154807410636
CPU times: user 1min 59s, sys: 636 ms, total: 2min
Wall time: 1min 1s


# fails
import lightgbm as lgb
lgb_model = lgb.LGBMClassifier(random_state=42, n_jobs=jobs, boosting_type='rf')
lgb_model.fit(X_train, y_train)
y_pred = lgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

import lightgbm as lgb

# Create the LightGBM classifier
lgb_model = lgb.LGBMClassifier(random_state=42)

# Define the hyperparameter grid for Grid Search
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
}

# Perform Grid Search with cross-validation (cv=5) to find the best hyperparameters
grid_search = GridSearchCV(lgb_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_lgb_model = grid_search.best_estimator_

# Make predictions on the test set using the best LightGBM model
y_pred_lgb = best_lgb_model.predict(X_test)

# Calculate the accuracy of the LightGBM model with the best hyperparameters
accuracy_lgb = accuracy_score(y_test, y_pred_lgb)
print("Best Hyperparameters for LightGBM:", best_params)
print("LightGBM Accuracy:", accuracy_lgb)


## Neural Network

In [21]:
%%time
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Input
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

keras.utils.set_random_seed(42)

y_train_integer = y_train-1 #.astype('float32')
y_test_integer = y_test-1 #.astype('float32')

X_train_small, X_val, y_train_integer, y_val_integer = train_test_split(X_train, y_train_integer, test_size=0.2, random_state=42)
X_train_small = X_train_small.astype('float32')
X_val = X_val.astype('float32')

# Create the Neural Network model
nn_model = Sequential()
nn_model.add(Input(shape=(X_train_small.shape[1],)))
nn_model.add(Dense(32, activation='relu'))
nn_model.add(Dense(16, activation='relu'))
nn_model.add(Dense(4, activation='softmax'))

# Compile the model
nn_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
nn_model.fit(X_train_small, y_train_integer, epochs=25, batch_size=64, validation_data=(X_val, y_val_integer)) #, verbose=1)

# Make predictions on the test set
y_pred_probabilities = nn_model.predict(X_test.astype('float32'))
y_pred_nn = np.argmax(y_pred_probabilities, axis=-1)

# Calculate the accuracy of the Neural Network model
accuracy_nn = accuracy_score(y_test_integer, y_pred_nn)
print("Neural Network Accuracy:", accuracy_nn)


2024-06-01 00:19:53.839222: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-01 00:19:53.840532: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-01 00:19:53.897827: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-01 00:19:54.121602: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/25
[1m6999/6999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 651us/step - accuracy: 0.5955 - loss: 1.0133 - val_accuracy: 0.6251 - val_loss: 0.9119
Epoch 2/25
[1m6999/6999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 641us/step - accuracy: 0.6243 - loss: 0.9103 - val_accuracy: 0.6213 - val_loss: 0.9240
Epoch 3/25
[1m6999/6999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 653us/step - accuracy: 0.6372 - loss: 0.8796 - val_accuracy: 0.6212 - val_loss: 0.9333
Epoch 4/25
[1m6999/6999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 652us/step - accuracy: 0.6432 - loss: 0.8672 - val_accuracy: 0.6235 - val_loss: 0.9291
Epoch 5/25
[1m6999/6999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 667us/step - accuracy: 0.6461 - loss: 0.8618 - val_accuracy: 0.6295 - val_loss: 0.9107
Epoch 6/25
[1m6999/6999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24563s[0m 4s/step - accuracy: 0.6482 - loss: 0.8574 - val_accuracy: 0.6433 - val_loss: 0.8685
Epo

import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Assuming you have already defined X_train, X_test, y_train, and y_test

# Convert binary numbers to integer labels
label_encoder = LabelEncoder()
y_train_integer = label_encoder.transform(y_train)
y_test_integer = label_encoder.transform(y_test)

# Check unique values in y_train_integer and y_test_integer
print("Unique values in y_train:", np.unique(y_train_integer))
print("Unique values in y_test:", np.unique(y_test_integer))

print("Shape of y_train_integer:", y_train_integer.shape)
print("Shape of y_test_integer:", y_test_integer.shape)

# Manually split the data into training and validation sets
X_train, X_val, y_train_integer, y_val_integer = train_test_split(X_train, y_train_integer, test_size=0.2, random_state=42)

# Create the Neural Network model
nn_model = Sequential()
nn_model.add(Dense(32, activation='relu', input_shape=(X_train.shape[1],)))
nn_model.add(Dense(16, activation='relu'))
nn_model.add(Dense(3, activation='softmax'))

# Compile the model
nn_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
nn_model.fit(X_train, y_train_integer, epochs=2, batch_size=64, validation_data=(X_val, y_val_integer), verbose=0)

# Make predictions on the test set
y_pred_probabilities = nn_model.predict(X_test)
y_pred_nn = np.argmax(y_pred_probabilities, axis=-1)

# Calculate the accuracy of the Neural Network model
accuracy_nn = accuracy_score(y_test_integer, y_pred_nn)
print("Neural Network Accuracy:", accuracy_nn)


## LSTM

In [22]:
%%time
import numpy as np
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Input
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

keras.utils.set_random_seed(42)

y_train_integer = y_train-1 #.astype('float32')
y_test_integer = y_test-1 #.astype('float32')

# Reshape the input data for LSTM
# TODO: LSTM with a single time step is not a sequence!
time_steps = 1  # Each sample is treated as a single time step
X_train_lstm = X_train.astype('float32').reshape(X_train.shape[0], time_steps, X_train.shape[1])
X_test_lstm = X_test.astype('float32').reshape(X_test.shape[0], time_steps, X_test.shape[1])
# y_train_lstm = y_train_integer.reshape(y_train_integer.shape[0], time_steps, y_train_integer.shape[1])

# Create the Neural Network model
lstm_model = Sequential()
lstm_model.add(Input(shape=(time_steps, X_train_lstm.shape[2])))
lstm_model.add(LSTM(32))
lstm_model.add(Dense(16, activation='relu'))
lstm_model.add(Dense(4, activation='softmax'))

# Compile the model
lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
lstm_model.fit(X_train_lstm, y_train_integer, epochs=25, batch_size=64)

# Make predictions on the test set
y_pred_probabilities = lstm_model.predict(X_test_lstm)
y_pred_lstm = np.argmax(y_pred_probabilities, axis=-1)

# Calculate the accuracy of the LSTM model
accuracy_lstm = accuracy_score(y_test_integer, y_pred_lstm)
print("LSTM Accuracy:", accuracy_lstm)

Epoch 1/25
[1m8748/8748[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 746us/step - accuracy: 0.6123 - loss: 0.9561
Epoch 2/25
[1m8748/8748[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 758us/step - accuracy: 0.6431 - loss: 0.8707
Epoch 3/25
[1m8748/8748[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 770us/step - accuracy: 0.6541 - loss: 0.8472
Epoch 4/25
[1m8748/8748[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 775us/step - accuracy: 0.6590 - loss: 0.8374
Epoch 5/25
[1m8748/8748[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 775us/step - accuracy: 0.6616 - loss: 0.8317
Epoch 6/25
[1m8748/8748[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 781us/step - accuracy: 0.6635 - loss: 0.8279
Epoch 7/25
[1m8748/8748[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 784us/step - accuracy: 0.6646 - loss: 0.8253
Epoch 8/25
[1m8748/8748[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 785us/step - accuracy: 0.6657 - loss: 0.8232
Epoch 9/

import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# ... (Previous code for reading and preprocessing the data)

# Convert the data into numerical format
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

# Reshape the input data for LSTM
time_steps = 1  # Each sample is treated as a single time step
X_train_lstm = X_train.reshape(X_train.shape[0], time_steps, X_train.shape[1])
X_test_lstm = X_test.reshape(X_test.shape[0], time_steps, X_test.shape[1])

# Create the LSTM model
lstm_model = Sequential()
lstm_model.add(LSTM(32, input_shape=(time_steps, X_train.shape[1])))
lstm_model.add(Dense(16, activation='relu'))
lstm_model.add(Dense(3, activation='sigmoid'))  # Assuming binary classification

# Compile the model
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
lstm_model.fit(X_train_lstm, y_train, epochs=50, batch_size=32, verbose=1)

# Make predictions on the test set
y_pred_lstm = lstm_model.predict(X_test_lstm)
y_pred_lstm = np.round(y_pred_lstm).astype(int).flatten()  # Convert probabilities to binary predictions

# Calculate the accuracy of the LSTM model
accuracy_lstm = accuracy_score(y_test, y_pred_lstm)
print("LSTM Accuracy:", accuracy_lstm)
