In [1]:
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.compose import ColumnTransformer
from sklearn.exceptions import NotFittedError
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder

from lester.context import datasource, prepare, split, encode_features, encode_target, model_training, run

@prepare(
    books=datasource('books', track_provenance=True),
    categories=datasource('categories', track_provenance=True),
    tags=datasource('tags'),
    book_tags=datasource('book_tags'))
def labeled_books(books, categories, tags, book_tags):
    """Join highly rated English books with their popular category tags.

    Steps, in order:
      1. Drop incomplete book rows and keep those with language code 'eng'.
      2. Keep categories whose popularity is at least 90 and attach the
         tag names, retaining only the 'tag_id' and 'tag_name' columns.
      3. Link those categories to books through ``book_tags`` and keep
         books with an average rating above 4.3.

    Returns the resulting frame, one row per (book, category) match.
    """
    # Books: complete rows only, English only.
    complete_books = books.dropna()
    books_en = complete_books.query("language_code == 'eng'")

    # Categories: popular ones, reduced to the id/name pair.
    top_categories = categories.query("popularity >= 90")
    named_categories = top_categories.merge(tags, on='tag_id')[['tag_id', 'tag_name']]

    # Map each named category onto the books tagged with it.
    category_book_links = named_categories.merge(book_tags, on='tag_id')

    books_with_category = books_en.merge(category_book_links, on='goodreads_book_id')
    result = books_with_category.query("average_rating > 4.3")

    return result


@split()
def stratified_split(data, random_seed):
    """Split ``data`` into train and test parts, holding out 20% for testing.

    ``random_seed`` is forwarded as ``random_state`` so the split is
    reproducible.

    NOTE(review): despite the function name, no ``stratify`` argument is
    passed to ``train_test_split``, so class proportions are not enforced.
    """
    holdout_fraction = 0.2
    return train_test_split(
        data,
        test_size=holdout_fraction,
        random_state=random_seed,
    )




@encode_features()
def title_embeddings():
    """Build a ColumnTransformer that embeds the 'title' column.

    The transformer applies the "all-MiniLM-L6-v2" SentenceTransformer model
    to the selected column and returns whatever ``encode`` produces.
    """
    sentence_encoder = SentenceTransformer("all-MiniLM-L6-v2")

    def embed(column_slice):
        # Iterating ``.values`` yields one entry per row holding that row's
        # selected column values; they are joined into one string per row.
        sentences = list(map(' '.join, column_slice.values))
        return sentence_encoder.encode(sentences)

    title_step = ('embeddings', FunctionTransformer(embed), ['title'])
    return ColumnTransformer(transformers=[title_step])


@encode_target(target_column='tag_name')
def encode_book_category():
    """Return a fresh, unfitted LabelEncoder for the 'tag_name' target column."""
    return LabelEncoder()


@model_training()
def create_rag_model(features, labels, training_data):
    """Store the training titles and their embeddings in a Chroma collection.

    Args:
        features: Embedding matrix for the training records; assumed to hold
            one row per row of ``training_data`` (TODO confirm with caller).
        labels: Encoded targets. Unused here: the category stored with each
            record is read from ``training_data['tag_name']`` instead.
        training_data: Frame providing the 'title' and 'tag_name' columns.

    Returns:
        None. The collection is only populated as a side effect.
        NOTE(review): the collection is not handed back to the caller;
        confirm whether ``model_training`` expects a return value.
    """
    import chromadb

    client = chromadb.Client()
    collection = client.create_collection("labeled-books")

    print(features.shape)

    record_count = len(training_data)
    documents = list(training_data['title'])
    metadatas = [{'category': tag} for tag in training_data['tag_name']]
    # Positional string ids; the name avoids shadowing the builtin ``id``.
    ids = [str(record_index) for record_index in range(record_count)]

    # The feature matrix is passed as-is. The previous per-row copy of it
    # was never used and has been removed.
    collection.add(
        documents=documents,
        embeddings=features,
        metadatas=metadatas,
        ids=ids,
    )

    


# Start the pipeline. Presumably lester executes the stages registered by the
# decorators above; confirm against lester.context.run.
run()

(2452, 384)


In [None]:
class RAGClassifier(BaseEstimator, ClassifierMixin):
    """Baseline classifier that predicts the most frequent training label.

    NOTE(review): despite the name, no retrieval is performed here; the
    class currently acts as a majority-class baseline.

    Attributes set by ``fit``:
        classes_: Sorted unique labels seen during fitting.
        majority_class_: The most frequent label. Ties resolve to the
            smallest label, because ``np.unique`` sorts and ``np.argmax``
            returns the first maximum.
    """

    def __init__(self):
        # False until fit() has completed; predict() checks this flag.
        self.is_fitted_ = False

    def fit(self, X, y):
        """Learn the majority class from ``y``.

        Args:
            X: Training features; not used by this baseline.
            y: Training labels (any array-like accepted by ``np.asarray``).

        Returns:
            self, to allow call chaining.

        Raises:
            ValueError: If ``y`` is empty.
        """
        labels = np.asarray(y)
        if labels.size == 0:
            raise ValueError("Cannot fit RAGClassifier on an empty label set.")

        classes, counts = np.unique(labels, return_counts=True)
        self.classes_ = classes
        self.majority_class_ = classes[np.argmax(counts)]
        self.is_fitted_ = True
        return self

    def predict(self, X):
        """Predict the majority class for each instance in ``X``.

        Args:
            X: Any sized collection; only ``len(X)`` is used.

        Returns:
            A 1-D array of length ``len(X)`` filled with the majority class,
            using the dtype of the training labels.

        Raises:
            NotFittedError: If ``fit`` has not been called.
        """
        if not getattr(self, "is_fitted_", False):
            raise NotFittedError("This RAGClassifier instance is not fitted yet.")
        # dtype follows the stored label; the removed ``np.int`` alias is
        # no longer referenced.
        return np.full(shape=(len(X),), fill_value=self.majority_class_)