**Please Note**: As of now, this Jupyter notebook is under active development. Starting from June 5th, 2024, I intend to initiate a series of refinements and expansions. These updates will include additional changes and improvements to enhance the functionality and usability of the notebook. Your patience and understanding during this development phase are greatly appreciated.

In [1]:
# Importing the required libraries
import numpy as np
import pandas as pd
from math import log2
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from scipy.fft import fft, ifft
from scipy.special import erfc


the pre-processing part was based on a previous commit of the public repository of sid-chava [QRNGClassifier Repository](https://github.com/sid-chava/QRNGClassifier)


**QRNG Classifier Preprocessing Functions Improvements**: Starting from June 5th, 2024, I plan to enhance the preprocessing functions of the QRNG Classifier. This could involve:

1. **Refining Feature Extraction**: Improve the methods used to extract features from the raw data. This could involve using more sophisticated techniques or algorithms to better capture the characteristics of the data.


2. **Introducing New Data Transformation Techniques**: Implement new techniques for transforming the data into a format that's more suitable for the classifier. This could involve normalization, scaling, or other transformation methods.

By implementing these improvements, we aim to enhance the effectiveness of the preprocessing functions, which could lead to better performance of the QRNG Classifier.

In [2]:
# File path in Google Drive
file_path = 'AI_2qubits_training_data.txt'

# Read the data from the file
data = []
with open(file_path, 'r') as file:
    for line in file:
        if line.strip():
            binary_number, label = line.strip().split()
            data.append((binary_number, int(label)))

# Convert the data into a DataFrame
df = pd.DataFrame(data, columns=['binary_number', 'label'])

num_concats = 4

new_df = pd.DataFrame({'Concatenated_Data': [''] * (len(df) // num_concats), 'label': [''] * (len(df) // num_concats)})

# Loop through each group of 10 rows and concatenate their 'Data' strings
for i in range(0, len(df), num_concats):
    new_df.iloc[i // num_concats, 0] = ''.join(df['binary_number'][i:i+num_concats])
    new_df.iloc[i // num_concats, 1] = df['label'][i]

# Calculate Shannon entropy for each concatenated binary sequence
def calculate_2bit_shannon_entropy(binary_string):
    # Ensure the string length is a multiple of 2 for exact 2-bit grouping
    if len(binary_string) % 4 != 0:
        raise ValueError("Binary string length must be a multiple of 2.")
    
    # Define possible 2-bit combinations
    #patterns = ['0000', '1000', '1100', '1110', '1111', '0100', '0110', '0111', '0010', '0011', '0001', '1001', '1101', '0110', '0101', '1010']
    patterns = ['00', '10', '11', '01']
    frequency = {pattern: 0 for pattern in patterns}
    
    # Count frequency of each pattern
    for i in range(0, len(binary_string), 2):
        segment = binary_string[i:i+2]
        if segment in patterns:
            frequency[segment] += 1
    
    # Calculate total segments counted
    total_segments = sum(frequency.values())
    
    # Calculate probabilities and entropy
    entropy = 0
    for count in frequency.values():
        if count > 0:
            probability = count / total_segments
            entropy -= probability * log2(probability)
    
    return entropy

def classic_spectral_test(bit_string):
    bit_array = 2 * np.array([int(bit) for bit in bit_string]) - 1
    dft = fft(bit_array)
    n_half = len(bit_string) // 2 + 1
    mod_dft = np.abs(dft[:n_half])
    threshold = np.sqrt(np.log(1 / 0.05) / len(bit_string))
    peaks_below_threshold = np.sum(mod_dft < threshold)
    expected_peaks = 0.95 * n_half
    d = (peaks_below_threshold - expected_peaks) / np.sqrt(len(bit_string) * 0.95 * 0.05)
    p_value = erfc(np.abs(d) / np.sqrt(2)) / 2
    return p_value

# Apply the entropy calculationnew_df['shannon_entropy'] = new_df['Concatenated_Data'].apply(calculate_2bit_shannon_entropy)

new_df['shannon_entropy'] = new_df['Concatenated_Data'].apply(calculate_2bit_shannon_entropy)
new_df['spectral_test'] = new_df['Concatenated_Data'].apply(classic_spectral_test)

df_features = pd.DataFrame(new_df['Concatenated_Data'].apply(list).tolist())
new_df = pd.concat([new_df.drop(columns='Concatenated_Data'), df_features], axis=1)

#print(df)

#print(df.head(10))


# Split the data into features (X) and labels (y)
X = new_df.drop(columns='label').values
#print(X)
y = new_df['label'].values
y=y.astype('int')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

in this notebbok, we tried the approach of joining 4 sequences.

In [3]:
new_df["label"].value_counts()

label
4    2000
1     500
2     500
3     500
Name: count, dtype: int64

In [4]:
new_df

Unnamed: 0,label,shannon_entropy,spectral_test,0,1,2,3,4,5,6,...,390,391,392,393,394,395,396,397,398,399
0,1,1.953850,0.0,0,1,0,0,1,1,1,...,1,1,0,1,1,1,1,1,0,1
1,1,1.982907,0.0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,1,1,1,0
2,1,1.957665,0.0,1,0,1,0,0,0,0,...,1,1,1,1,0,0,1,0,1,1
3,1,1.981947,0.0,1,0,1,1,0,0,1,...,1,1,1,0,0,1,0,0,0,0
4,1,1.957641,0.0,1,0,0,1,0,0,1,...,1,0,1,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3495,4,1.862790,0.0,1,1,1,0,0,1,1,...,1,0,0,0,1,0,1,1,1,1
3496,4,1.911613,0.0,0,0,1,1,1,0,0,...,0,1,0,1,0,1,1,1,0,0
3497,4,1.940077,0.0,1,1,0,0,0,1,0,...,1,0,0,1,0,0,0,0,0,1
3498,4,1.930763,0.0,1,1,1,1,0,0,1,...,0,0,1,1,0,0,1,1,0,0


In [5]:
import numpy as np

# Assuming y_train is a numpy array
values, counts = np.unique(y_train, return_counts=True)
for value, count in zip(values, counts):
    print(f"Value: {value}, Count: {count}")

Value: 1, Count: 374
Value: 2, Count: 395
Value: 3, Count: 405
Value: 4, Count: 1626


In [6]:
from imblearn.over_sampling import RandomOverSampler

# define oversampling strategy
oversample = RandomOverSampler(sampling_strategy='minority')

# fit and apply the transform
X_over, y_over = oversample.fit_resample(X_train, y_train)

# Now, you can use X_over and y_over to train your model

## Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier

def calculate_min_entropy(sequence):
    sequence = np.asarray(sequence, dtype=float)  # Convert sequence to float
    p = np.mean(sequence)  # Proportion of ones
    max_prob = max(p, 1 - p)
    if max_prob == 0:  # Handle the case where all bits are the same
        return 0
    min_entropy = -np.log2(max_prob)
    return min_entropy




vectorized_entropy = np.vectorize(calculate_min_entropy, signature='(n)->()')

# Calculate min-entropy for each sequence in the training and testing datasets
min_entropy_train = vectorized_entropy(X_over)
min_entropy_test = vectorized_entropy(X_test)

X_over_with_entropy = np.column_stack((X_over, min_entropy_train))
X_test_with_entropy = np.column_stack((X_test, min_entropy_test))
# Create the Random Forest classifier
rf_model = RandomForestClassifier(random_state=42)

# Train the model
rf_model.fit(X_over_with_entropy, y_over)

# Make predictions on the test set
y_pred_rf = rf_model.predict(X_test_with_entropy)

# Calculate the accuracy of the Random Forest model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)


Random Forest Accuracy: 0.5585714285714286


In [8]:
# most important features
importances = rf_model.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(X_over_with_entropy.shape[1]):
    print(f"{f + 1}. feature {indices[f]} ({importances[indices[f]]})")

# Select the top 10 features

top_10_features = indices[:10]

# Train the model with the top 10 features
class_weights = {1: 0.5, 2: 0.2, 3: 0.2, 4: 0.1}


rf_model_top_10 = RandomForestClassifier(random_state=42, class_weight=class_weights)

# Train the model

rf_model_top_10.fit(X_over_with_entropy[:, top_10_features], y_over)

# Make predictions on the test set


y_pred_rf_top_10 = rf_model_top_10.predict(X_test_with_entropy[:, top_10_features])

# Calculate the accuracy of the Random Forest model with the top 10 features

accuracy_rf_top_10 = accuracy_score(y_test, y_pred_rf_top_10)

print("Random Forest Accuracy with Top 10 Features:", accuracy_rf_top_10)


# cross validation score

from sklearn.model_selection import cross_val_score

# Perform 5-fold cross-validation on the Random Forest model with the top 10 features

cv_scores = cross_val_score(rf_model_top_10, X_test_with_entropy[:, top_10_features], y_test, cv=5)

print("Cross-Validation Scores:", cv_scores)

# Calculate the mean cross-validation score

mean_cv_score = np.mean(cv_scores)

print("Mean Cross-Validation Score:", mean_cv_score)





Feature ranking:
1. feature 0 (0.08417361516844546)
2. feature 402 (0.06276002662013498)
3. feature 327 (0.00421214990667557)
4. feature 251 (0.004192491751805198)
5. feature 165 (0.0038018692064021907)
6. feature 35 (0.0034800235293270675)
7. feature 111 (0.003391084922748429)
8. feature 383 (0.0033394795208258722)
9. feature 287 (0.003258439440553982)
10. feature 357 (0.00323974001799086)
11. feature 393 (0.003211066668065588)
12. feature 317 (0.003187722666689175)
13. feature 55 (0.002971570905016892)
14. feature 325 (0.002897786685324477)
15. feature 265 (0.0028422458685201193)
16. feature 61 (0.0027955545968050247)
17. feature 71 (0.0027911505477893317)
18. feature 257 (0.002789803790923472)
19. feature 263 (0.0027618253788747873)
20. feature 315 (0.002756535973231819)
21. feature 229 (0.0027285326177937824)
22. feature 11 (0.002728019225822083)
23. feature 163 (0.0027243812152641946)
24. feature 299 (0.0027187501226598556)
25. feature 305 (0.0026883443764413794)
26. feature 377 (

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Define your class weights
# class_weights = {1: 0.5, 2: 0.2, 3: 0.2, 4: 0.1}

# Initialize a list to store the results
results = []
vectorized_entropy = np.vectorize(calculate_min_entropy, signature='(n)->()')

# Calculate min-entropy for each sequence in the training and testing datasets
min_entropy_train = vectorized_entropy(X_train)
min_entropy_test = vectorized_entropy(X_test)

X_train_with_entropy = np.column_stack((X_train, min_entropy_train))
X_test_with_entropy = np.column_stack((X_test, min_entropy_test))


# Loop over the desired range of feature counts
for num_features in range(5, 51):
    # Select the top features
    top_features = indices[:num_features]

    # Train the model with the top features and class weights
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train_with_entropy[:, top_features], y_train)

    # Make predictions on the test set
    y_pred_rf = rf_model.predict(X_test_with_entropy[:, top_features])

    # Calculate the accuracy of the model
    accuracy_rf = accuracy_score(y_test, y_pred_rf)

    # Store the number of features and the accuracy in the results list
    results.append((num_features, accuracy_rf))
    # print(num_features)
# Sort the results by accuracy in descending order
results.sort(key=lambda x: x[1], reverse=True)

# Print the top 10 results
for i in range(10):
    print(f"Number of features: {results[i][0]}, Accuracy: {results[i][1]}")

Number of features: 9, Accuracy: 0.7757142857142857
Number of features: 18, Accuracy: 0.7642857142857142
Number of features: 8, Accuracy: 0.7614285714285715
Number of features: 10, Accuracy: 0.76
Number of features: 16, Accuracy: 0.76
Number of features: 20, Accuracy: 0.76
Number of features: 43, Accuracy: 0.76
Number of features: 14, Accuracy: 0.7585714285714286
Number of features: 7, Accuracy: 0.7571428571428571
Number of features: 17, Accuracy: 0.7571428571428571


In [10]:
# class_weights = {1: 20, 2: 20, 3: 20, 4: 1}

from sklearn.utils import compute_class_weight


top_10_features = indices[:16]


rf_model_top_10 = RandomForestClassifier(random_state=42, class_weight=class_weights)

# Train the model

rf_model_top_10.fit(X_train_with_entropy[:, top_10_features], y_train)

# Make predictions on the test set


y_pred_rf_top_10 = rf_model_top_10.predict(X_test_with_entropy[:, top_10_features])

# Calculate the accuracy of the Random Forest model with the top 10 features

accuracy_rf_top_10 = accuracy_score(y_test, y_pred_rf_top_10)

print("Random Forest Accuracy with Top 10 Features:", accuracy_rf_top_10)


# cross validation score

from sklearn.model_selection import cross_val_score

# Perform 5-fold cross-validation on the Random Forest model with the top 10 features

cv_scores = cross_val_score(rf_model_top_10, X_train_with_entropy[:, top_10_features], y_train, cv=10)

print("Cross-Validation Scores:", cv_scores)

# Calculate the mean cross-validation score

mean_cv_score = np.mean(cv_scores)

print("Mean Cross-Validation Score:", mean_cv_score)

# confusion matrix

from sklearn.metrics import confusion_matrix

# Calculate the confusion matrix

conf_matrix = confusion_matrix(y_test, y_pred_rf_top_10)


print("Confusion Matrix:")
print(conf_matrix)

Random Forest Accuracy with Top 10 Features: 0.7514285714285714
Cross-Validation Scores: [0.76785714 0.79642857 0.78214286 0.78571429 0.76785714 0.75714286
 0.78571429 0.73571429 0.8        0.82857143]
Mean Cross-Validation Score: 0.7807142857142857
Confusion Matrix:
[[ 30  39  20  37]
 [ 27  56   2  20]
 [  4   0  83   8]
 [  5   6   6 357]]


In [11]:
from imblearn.over_sampling import RandomOverSampler

# define oversampling strategy
oversample = RandomOverSampler(sampling_strategy='minority')

# fit and apply the transform
X_over, y_over = oversample.fit_resample(X_train, y_train)

# Now, you can use X_over and y_over to train your model

In [12]:
import numpy as np

# Assuming y_train is a numpy array
values, counts = np.unique(y_over, return_counts=True)
for value, count in zip(values, counts):
    print(f"Value: {value}, Count: {count}")

Value: 1, Count: 1626
Value: 2, Count: 395
Value: 3, Count: 405
Value: 4, Count: 1626


In [13]:
from sklearn.ensemble import RandomForestClassifier

def calculate_min_entropy(sequence):
    sequence = np.asarray(sequence, dtype=float)  # Convert sequence to float
    p = np.mean(sequence)  # Proportion of ones
    max_prob = max(p, 1 - p)
    if max_prob == 0:  # Handle the case where all bits are the same
        return 0
    min_entropy = -np.log2(max_prob)
    return min_entropy

X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, random_state=42)


vectorized_entropy = np.vectorize(calculate_min_entropy, signature='(n)->()')

# Calculate min-entropy for each sequence in the training and testing datasets
min_entropy_train = vectorized_entropy(X_train)
min_entropy_test = vectorized_entropy(X_test)

X_train_with_entropy = np.column_stack((X_train, min_entropy_train))
X_test_with_entropy = np.column_stack((X_test, min_entropy_test))

from sklearn.utils import compute_class_weight


top_10_features = indices[:13]


rf_model_top_10 = RandomForestClassifier(random_state=42, class_weight=class_weights)

# Train the model

rf_model_top_10.fit(X_train_with_entropy[:, top_10_features], y_train)

# Make predictions on the test set


y_pred_rf_top_10 = rf_model_top_10.predict(X_test_with_entropy[:, top_10_features])

# Calculate the accuracy of the Random Forest model with the top 10 features

accuracy_rf_top_10 = accuracy_score(y_test, y_pred_rf_top_10)

print("Random Forest Accuracy with Top 10 Features:", accuracy_rf_top_10)


conf_matrix = confusion_matrix(y_test, y_pred_rf_top_10)


print("Confusion Matrix:")
print(conf_matrix)

Random Forest Accuracy with Top 10 Features: 0.8594327990135635
Confusion Matrix:
[[312   0   1   2]
 [ 37  29   3  20]
 [ 12   0  58   6]
 [ 17   6  10 298]]


In [14]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation
cv_scores = cross_val_score(rf_model_top_10, X_train_with_entropy[:, top_10_features], y_train, cv=10)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", np.mean(cv_scores))

# Check accuracy on the training dataset
train_accuracy = rf_model_top_10.score(X_train_with_entropy[:, top_10_features], y_train)
print("Training accuracy:", train_accuracy)

# Check accuracy on the test dataset
test_accuracy = rf_model_top_10.score(X_test_with_entropy[:, top_10_features], y_test)
print("Test accuracy:", test_accuracy)

Cross-validation scores: [0.83384615 0.85493827 0.88888889 0.88888889 0.87037037 0.84567901
 0.85802469 0.84876543 0.85185185 0.86728395]
Mean cross-validation score: 0.8608537511870844
Training accuracy: 1.0
Test accuracy: 0.8594327990135635


In [15]:
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score

# Gradient Boosting
gb_model = GradientBoostingClassifier(random_state=12)
gb_model.fit(X_train_with_entropy[:, top_10_features], y_train)
y_pred_gb = gb_model.predict(X_test_with_entropy[:, top_10_features])
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print("Gradient Boosting Accuracy:", accuracy_gb)

# AdaBoost
ab_model = AdaBoostClassifier(random_state=12)
ab_model.fit(X_train_with_entropy[:, top_10_features], y_train)
y_pred_ab = ab_model.predict(X_test_with_entropy[:, top_10_features])
accuracy_ab = accuracy_score(y_test, y_pred_ab)
print("AdaBoost Accuracy:", accuracy_ab)

Gradient Boosting Accuracy: 0.7977805178791615




AdaBoost Accuracy: 0.6831072749691739


In [16]:
ab_model = RandomForestClassifier(random_state=12)
ab_model.fit(X_train_with_entropy[:, top_10_features], y_train)
y_pred_ab = ab_model.predict(X_test_with_entropy[:, top_10_features])
accuracy_ab = accuracy_score(y_test, y_pred_ab)
print("RNF Accuracy:", accuracy_ab)

RNF Accuracy: 0.8631319358816276


In [17]:
from imblearn.over_sampling import RandomOverSampler

# define oversampling strategy
oversample = RandomOverSampler(sampling_strategy='auto')

# fit and apply the transform
X_over, y_over = oversample.fit_resample(X_train, y_train)

# Now, you can use X_over and y_over to train your model

In [18]:
import numpy as np

# Assuming y_train is a numpy array
values, counts = np.unique(y_over, return_counts=True)
for value, count in zip(values, counts):
    print(f"Value: {value}, Count: {count}")

Value: 1, Count: 1311
Value: 2, Count: 1311
Value: 3, Count: 1311
Value: 4, Count: 1311


In [19]:
from sklearn.ensemble import RandomForestClassifier

def calculate_min_entropy(sequence):
    sequence = np.asarray(sequence, dtype=float)  # Convert sequence to float
    p = np.mean(sequence)  # Proportion of ones
    max_prob = max(p, 1 - p)
    if max_prob == 0:  # Handle the case where all bits are the same
        return 0
    min_entropy = -np.log2(max_prob)
    return min_entropy

X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, random_state=42)


vectorized_entropy = np.vectorize(calculate_min_entropy, signature='(n)->()')

# Calculate min-entropy for each sequence in the training and testing datasets
min_entropy_train = vectorized_entropy(X_train)
min_entropy_test = vectorized_entropy(X_test)

X_train_with_entropy = np.column_stack((X_train, min_entropy_train))
X_test_with_entropy = np.column_stack((X_test, min_entropy_test))

from sklearn.utils import compute_class_weight


top_10_features = indices[:13]


rf_model_top_10 = RandomForestClassifier(random_state=42, class_weight=class_weights)

# Train the model

rf_model_top_10.fit(X_train_with_entropy[:, top_10_features], y_train)

# Make predictions on the test set


y_pred_rf_top_10 = rf_model_top_10.predict(X_test_with_entropy[:, top_10_features])

# Calculate the accuracy of the Random Forest model with the top 10 features

accuracy_rf_top_10 = accuracy_score(y_test, y_pred_rf_top_10)

print("Random Forest Accuracy with Top 10 Features:", accuracy_rf_top_10)


conf_matrix = confusion_matrix(y_test, y_pred_rf_top_10)


print("Confusion Matrix:")
print(conf_matrix)

Random Forest Accuracy with Top 10 Features: 0.9618684461391802
Confusion Matrix:
[[257   2   1   4]
 [  3 270   0   0]
 [  2   3 261   0]
 [  8  11   6 221]]


In [20]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation
cv_scores = cross_val_score(rf_model_top_10, X_train_with_entropy[:, top_10_features], y_train, cv=10)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", np.mean(cv_scores))

# Check accuracy on the training dataset
train_accuracy = rf_model_top_10.score(X_train_with_entropy[:, top_10_features], y_train)
print("Training accuracy:", train_accuracy)

# Check accuracy on the test dataset
test_accuracy = rf_model_top_10.score(X_test_with_entropy[:, top_10_features], y_test)
print("Test accuracy:", test_accuracy)

Cross-validation scores: [0.9547619  0.9452381  0.94285714 0.95238095 0.95714286 0.93794749
 0.94988067 0.96420048 0.95465394 0.95942721]
Mean cross-validation score: 0.9518490737583816
Training accuracy: 1.0
Test accuracy: 0.9618684461391802


test accuracy 95~96%