In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pymongo import MongoClient
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Open a client; no arguments are given, so PyMongo's default connection settings apply
client = MongoClient()

# Select the BERTHA database (dictionary-style access)
db = client["BERTHA"]

# Select the `experiments` collection inside that database
collection = db["experiments"]

In [3]:
# Column names used as model inputs and as the classification target
params = ['Power_Ax1_setpoint_[W]', 'Power_Ax2_setpoint_[W]', 'Power_Ax3_setpoint_[W]']
target = ['Settled']

# Pull every document of the collection into a DataFrame
df = pd.DataFrame(list(collection.find()))

# Input matrix and label array (y stays 2-D because `target` is a one-element list)
X = df.loc[:, params].to_numpy()
y = df.loc[:, target].to_numpy()

In [4]:
'''
TO CHECK WHEN IMPLEMENTING IN LAB
---------------------------------

- Make sure that ObjectID field type isn't an issue when reading from database. 
In our datafile, the prefix "$" before oid (objectid) caused a parsing issue, so I had to
manually remove it. However, the data came from a JSON dump of MongoDB. I suspect this won't
be an issue if we read from the database via PyMongo directly.
'''

class ActiveLearnerBERTHA:
    '''
    The active learning class made specifically for the BERTHA setup.
    This class augments the active learner from modAL. It learns from
    data taken from the MongoDB collection it was constructed with, and
    it keeps the most recently loaded training data so that parameter
    sets which were already tried can be excluded when new queries are
    generated in a pool-based setting.
    '''

    def __init__(self,
                 mongodb_collection,
                 input_args : list,
                 input_ranges : list,
                 target : list
                ):
        '''
        Params
        ------
        mongodb_collection: the MongoDB collection from the database to train from
        input_args: list of parameters from database to train on, example: [pressure, power1, power2]
        input_ranges: list of exclusive upper bounds, one per input parameter,
        example: [50, 70, 120]. Candidate values are drawn as integers from
        0 (inclusive) up to the bound (exclusive).
        Note that input_args and input_ranges must map 1:1. This is checked by
        the constructor.
        target: list with the classifier target. example: ["Settled"]

        Raises
        ------
        AssertionError: if input_args and input_ranges differ in length.
        '''

        # Validate first so that a bad call never leaves a half-initialised object
        assert len(input_args) == len(input_ranges), \
            'input_args and input_ranges must have the same length'

        self.collection = mongodb_collection
        self.input_args = input_args
        self.input_ranges = input_ranges
        self.target = target

        # Empty placeholders until learn_from_db() has loaded data
        self.X = np.empty((0, len(input_args)), int)
        self.y = np.empty((0, len(target)), int)

        self.learner = ActiveLearner(
            estimator=RandomForestClassifier(),
            query_strategy=uncertainty_sampling
        )


    def learn_from_db(self, mongodb_query : dict = None, discard_old_data : bool = False) -> None:
        '''
        Trains the active learner from the available data in the collection
        that was passed to the constructor.

        Params
        ------
        mongodb_query : filters data according to the MongoDB query language
        (https://www.mongodb.com/docs/manual/tutorial/query-documents/).
        Defaults to no query (every document is used).
        discard_old_data : retrains the active learner on only the newly selected
        data from the database (forwarded to the learner as `only_new`).

        Raises
        ------
        KeyError: if the returned documents lack any of the input or target
        columns (this includes a query that matches no documents).
        '''

        # A fresh dict per call instead of a shared mutable default argument
        if mongodb_query is None:
            mongodb_query = {}

        # Read from the instance's own collection, not from a notebook-level global
        df = pd.DataFrame(list(self.collection.find(mongodb_query)))

        # Train the active learner on the data from the database
        self.X = df[self.input_args].to_numpy()
        # ravel() turns the single-column label frame into the 1-D vector the learner is given
        self.y = df[self.target].to_numpy().ravel()
        self.learner.teach(X=self.X, y=self.y, only_new=discard_old_data)


    def get_process_params(self, pool_size=500) -> np.ndarray:
        '''
        Query the active learner for the next set of most informative
        experiment parameters.

        Params
        ------
        pool_size: amount of randomly generated datapoints to evaluate
        in the pool-based setting. The higher the number, the greater the
        information gain; however, this comes at the cost of higher
        computation.

        Returns
        -------
        A numpy array of the most informative set of process parameters
        from the generated pool for the next experiment.
        '''

        # Generate pool of data: integers in [0, range) for every input column
        pool = np.random.randint(self.input_ranges, size=(pool_size, len(self.input_ranges)))

        # Remove elements from pool already in the training data (set difference: pool - self.X).
        # Broadcasting compares every pool row with every training row; a pool row is
        # dropped when all of its columns match some training row.
        pool = pool[~((pool[:, None, :] == self.X).all(-1)).any(1)]

        # Query the active learner; only the selected parameters are needed, not their index
        _, query_parameters = self.learner.query(pool)

        return query_parameters

    

In [5]:
# Build the learner for the three power setpoints; each gets 80 as its input range
alb = ActiveLearnerBERTHA(collection, params, [80,80,80], target)

In [6]:
# Train with the default (empty) query, i.e. without filtering the documents
alb.learn_from_db()

In [7]:
# Ask the learner for the next parameter set, using the default pool of 500 random candidates
alb.get_process_params()

array([[28, 36, 71]])

## Example query: filter for experiments where W_2 is activated (>0)

In [8]:
# Preview the documents matched by the filter: Ax2 power setpoint strictly greater than 0
pd.DataFrame(list(collection.find({'Power_Ax2_setpoint_[W]': {"$gt": 0}})))

Unnamed: 0,_id,Date,Campaign code,Campaign description,Material Ax1,Material Ax2,Material Ax3,Magnetron Ax1,Magnetron Ax2,Magnetron Ax3,...,Voltage_Ax1_[V] Mean,Voltage_Ax1_[V] STD,Actual_Power_Ax2_[W] Mean,Actual_Power_Ax2_[W] STD,Voltage_Ax2_[V] Mean,Voltage_Ax2_[V] STD,Actual_Power_Ax3_[W] Mean,Actual_Power_Ax3_[W] STD,Voltage_Ax3_[V] Mean,Voltage_Ax3_[V] STD
0,{'_oid': '65cf6db238a7b0a510a578fc'},02/16/2024,BaZrSn_001,"Sputter setup with Ba, Zr and Sn for developme...",Ba,Zr,Sn,5,1,3,...,0,0,20.817,0.563,211.104,0.603,36.736,1.843,283.175,0.657
1,{'_oid': '65cf6de938a7b0a510a578fd'},02/16/2024,BaZrSn_001,"Sputter setup with Ba, Zr and Sn for developme...",Ba,Zr,Sn,5,1,3,...,0,0,67.924,0.211,217.777,0.111,44.213,2.47,285.515,0.419


In [18]:
# Retrain on only the documents with Ax2 power setpoint > 0;
# discard_old_data=True is passed to the learner's teach() as only_new=True
alb.learn_from_db({'Power_Ax2_setpoint_[W]': {"$gt": 0}}, discard_old_data = True)

# Try BERTHA-AL on synthetic data

In [24]:
# Switch to the `synthetic` collection.
# NOTE: this rebinds the notebook-level name `collection` that earlier cells also use,
# so re-running those cells after this one will read from `synthetic` instead.
collection = db.synthetic

In [25]:
# Load every document of the current collection into a DataFrame
testdf = pd.DataFrame(list(collection.find()))

In [26]:
# Display the loaded frame (columns: _id, x, y, class)
testdf

Unnamed: 0,_id,x,y,class
0,65f71127c0dc1f4ed5c3ad92,425.297302,484.231998,1
1,65f71127c0dc1f4ed5c3ad93,485.613467,188.208158,1
2,65f71127c0dc1f4ed5c3ad94,416.568741,341.844643,1
3,65f71127c0dc1f4ed5c3ad95,430.802431,92.321342,1
4,65f71127c0dc1f4ed5c3ad96,454.115796,213.044412,1
...,...,...,...,...
995,65f71127c0dc1f4ed5c3b175,409.276855,304.342965,1
996,65f71127c0dc1f4ed5c3b176,324.098892,356.365318,1
997,65f71127c0dc1f4ed5c3b177,410.244711,219.213596,1
998,65f71127c0dc1f4ed5c3b178,322.777218,270.782272,1


In [27]:
# Rebind params/target/alb for the synthetic data: two inputs (x, y), each with range 500,
# and `class` as the classification target
params = ['x', 'y']
target = ['class']
alb = ActiveLearnerBERTHA(collection, params, [500,500], target)

In [28]:
# Train with the default (empty) query, i.e. without filtering the documents
alb.learn_from_db()

In [29]:
# Ask the learner for the next (x, y) point, using the default pool of 500 random candidates
alb.get_process_params()

array([[ 21, 286]])

# Misc

In [10]:
class Test:
    '''
    Minimal scratch class: it defines no constructor, and `set_smth`
    is the only place where the instance attribute `arg` is created.
    '''
    
    def set_smth(self, arg):
        '''Store `arg` on the instance as the attribute `arg`.'''
        self.arg = arg

In [11]:
# Create an instance; it has no `arg` attribute yet
t = Test()

In [12]:
# Calling the setter creates the `arg` attribute on the instance
t.set_smth('hi')

In [13]:
# Read back the attribute created by set_smth
t.arg

'hi'

## Generate pool

In [14]:
# Exclusive upper bound of each column; values are drawn starting from 0
bounds = [5, 120, 2400]
# Number of rows to draw
n = 10
# One row per draw, one column per bound
np.random.randint(0, bounds, size=(n, len(bounds)))

array([[   2,  118,  425],
       [   4,   31, 1581],
       [   0,   49, 2044],
       [   0,   39, 1366],
       [   1,  113,   39],
       [   0,  107, 1405],
       [   3,   14,  605],
       [   1,    6, 2059],
       [   2,   35,  818],
       [   3,   97,  843]])

In [15]:
# Two small integer matrices whose first and last rows coincide
A = np.array([[1, 4, 3], [2, 5, 1], [7, 8, 9]])
B = np.array([[1, 4, 3], [3, 6, 1], [7, 8, 9]])

# Row-wise set difference A - B: compare every row of A with every row of B
# via broadcasting, flag the rows of A that fully match some row of B, and
# keep only the unflagged rows.
A = A[np.logical_not((A[:, np.newaxis, :] == B).all(axis=-1).any(axis=1))]

In [16]:
# Check that the boolean-mask filtering still yields a NumPy array
type(A)

numpy.ndarray