In [5]:
import pandas as pd
import numpy as np

In [2]:

# Read the dataset from the Excel workbook into a DataFrame
data = pd.read_excel('open_crosion.xlsx')

# Sum the 'Label' column within each 'Specimen' group.
# NOTE(review): this equals the number of positive samples per specimen only
# if 'Label' is coded 0/1 — confirm against the workbook.
labels_by_specimen = data.groupby('Specimen')['Label']
specimen_counts = labels_by_specimen.sum()

# Display the per-specimen totals as the cell output
specimen_counts


Specimen
1    1210
2     744
Name: Label, dtype: int64

In [3]:
# Number of all-negative specimens grouped with each positive specimen
NEGATIVES_PER_FOLD = 2

# Specimens whose summed Label is zero (no positive samples at all)
negative_class_specimens = specimen_counts[specimen_counts == 0].index

# Specimens with a non-zero Label sum. Derived from the data instead of being
# hard-coded, so the list stays in sync with the workbook contents.
positive_class_specimens = specimen_counts[specimen_counts != 0].index.tolist()

# Build one fold per positive specimen: the positive specimen itself followed
# by the next NEGATIVES_PER_FOLD unused negative specimens (fewer, or none,
# once the negative specimens run out).
folds = []
for i, pos_specimen in enumerate(positive_class_specimens):
    start = i * NEGATIVES_PER_FOLD
    negatives = negative_class_specimens[start:start + NEGATIVES_PER_FOLD].tolist()
    folds.append([pos_specimen] + negatives)

# Specimens present in the data that were not placed in any fold,
# listed in order of first appearance.
assigned_specimens = {specimen for fold in folds for specimen in fold}
leftovers = [
    specimen
    for specimen in data['Specimen'].unique().tolist()
    if specimen not in assigned_specimens
]

folds, leftovers


([[1], [2]], [])

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Feature columns used for both the training and the test split
FEATURE_COLUMNS = [
    'Feature 1', 'Feature 2', 'Feature 3', 'Feature 4',
    'Feature 5', 'Feature 6', 'Feature 7',
]

# Initialize Random Forest classifier (the same instance is refit for every fold)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Per-fold results: test accuracy and mean disagreement between trees
accuracies = []
uncertainties = []

for fold in folds:
    # NOTE: the specimens listed in `fold` form the TRAINING set; every other
    # specimen in `data` forms the test set.
    in_fold = data['Specimen'].isin(fold)
    train_data = data[in_fold]
    test_data = data[~in_fold]

    # Extract features and labels
    X_train, y_train = train_data[FEATURE_COLUMNS], train_data['Label']
    X_test, y_test = test_data[FEATURE_COLUMNS], test_data['Label']

    # Train the forest on this fold's training specimens
    rf.fit(X_train, y_train)

    # Accuracy of the full ensemble on the held-out specimens
    y_pred = rf.predict(X_test)
    accuracies.append(accuracy_score(y_test, y_pred))

    # Estimate uncertainty as the standard deviation of the individual trees'
    # predictions, averaged over all test samples.
    # The forest's sub-estimators are fitted on plain arrays, so they are given
    # a NumPy array here; passing the DataFrame makes every tree emit a
    # "X has feature names" warning.
    # The trees return encoded class indices, which match the labels only when
    # the labels are 0..n_classes-1.
    X_test_array = X_test.to_numpy()
    tree_predictions = np.array([tree.predict(X_test_array) for tree in rf.estimators_])
    uncertainty = tree_predictions.std(axis=0).mean()
    uncertainties.append(uncertainty)

accuracies, uncertainties




([0.9839181286549707, 0.9565576673356156],
 [0.05050862563633422, 0.13002766019327647])

In [8]:
from lolopy.learners import RandomForestRegressor
# Fit lolo's random forest regressor on plain NumPy arrays.
# NOTE: X_train / y_train are whatever the last iteration of the fold loop
# left in scope, so this model sees only that final training split.
train_features = X_train.values
train_labels = y_train.values
lolo_rf = RandomForestRegressor()
lolo_rf.fit(train_features, train_labels)


In [None]:
# Ask the lolo model for predictions together with their standard deviations
# on the test features left in scope by the fold loop.
# NOTE: rebinding `uncertainties` here replaces the per-fold list built earlier.
lolo_output = lolo_rf.predict(X_test.values, return_std=True)
predictions, uncertainties = lolo_output


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier



# Extract features, labels, and the spatial coordinates of every sample
X = data[['Feature 1', 'Feature 2', 'Feature 3', 'Feature 4', 'Feature 5', 'Feature 6', 'Feature 7']]
y = data['Label']
x_coords = data['X-Coordinate'].values
y_coords = data['Y-Coordinate'].values

# Initialize and train the Random Forest model using the entire dataset
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Estimate uncertainty for each data point as the standard deviation of the
# individual trees' predictions.
# The sub-estimators are fitted on plain arrays, so they are given a NumPy
# array; passing the DataFrame makes every tree emit a feature-name warning.
# NOTE(review): these are in-sample predictions (the same rows were used for
# training), so the disagreement between trees is likely optimistic.
X_array = X.to_numpy()
tree_predictions = np.array([tree.predict(X_array) for tree in rf.estimators_])
point_uncertainties = tree_predictions.std(axis=0)

# Plot the uncertainty map: one marker per sample, coloured by uncertainty
fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(x_coords, y_coords, c=point_uncertainties, cmap='viridis', s=50, alpha=0.6)
fig.colorbar(scatter, ax=ax, label='Uncertainty')
ax.set_xlabel('X-Coordinate')
ax.set_ylabel('Y-Coordinate')
ax.set_title('Uncertainty Map based on X and Y Coordinates')
ax.grid(True)
plt.show()
