In [5]:
from qdrant_client import QdrantClient


def add_svm_documents(client):
    with open('svm.txt', 'r', encoding='utf-8') as file:
        documents = file.readlines()
    documents = [doc.strip() for doc in documents]
    client.add(collection_name="knowledge-base", documents=documents)

# def add_decision_tree_documents(client):
#     with open('decision_tree.txt', 'r', encoding='utf-8') as file:
#         documents = file.readlines()
#     documents = [doc.strip() for doc in documents]
#     client.add(collection_name="knowledge-base", documents=documents)

def add_decision_tree_documents(client):
    with open('decision_tree.txt', 'r', encoding='utf-8') as file:
        documents = file.readlines()
    documents = [doc.strip() for doc in documents]
    client.add(collection_name="knowledge-base", documents=documents)

def add_sklearn_tree_documents(client):
    with open('sklearn.tree.txt', 'r', encoding='utf-8') as file:
        documents = file.readlines()
    documents = [doc.strip() for doc in documents]
    client.add(collection_name="knowledge-base", documents=documents)

client = QdrantClient(":memory:")

user_input = input("Enter the model you want to add documents from (decision_tree, sklearn_tree, etc.): ")

if user_input.lower() == 'decision_tree':
    add_decision_tree_documents(client)
elif user_input.lower() == 'sklearn_tree':
    add_sklearn_tree_documents(client)
else:
    add_svm_documents(client)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
import os
os.environ["OPENAI_API_KEY"] = "####"
from openai import OpenAI
ai = OpenAI()
def rag(chat_history: list[str], question: str, n_points: int = 3) -> str:
    results = client.query(
        collection_name="knowledge-base",
        query_text=question,
        limit=n_points,
    )
    
    context = "\n".join(r.document for r in results)
    metaprompt = f"""
    You are a helpful machine learning bot.
    Answer the following question using the provided context.
    If the user asks you to build a model : If the parameters are not defined by the user, ask them to specify, a sample json file that is to be to made to run
    the model looks like this:
    [
    
    "filename" : "breast-cancer-wisconsin.csv",
    "model_name" : "decision_tree",
    "param": [
        "kernel": "linear"
    ],
    "target_variable": "Class",
    "split" : 0.2
    ] 

    if everything is mentioned return the json file
    if user says done return -1
    Always make sure you are checking this first before giving any response. 
    Also refer to the chat history while answering a question. consider info given by the assisstant only as truth.
    The context provided is only documentation for referring information. When asked direct questions about model building remember answers in the chat history and when asked factual question about the docs explicitly, refer to the context documentation. 
    If you can't find the answer, do not pretend you know it, but answer "I don't know".
    If you have limited information on something, state is and then answer "this is all I know." irrespective of how much their word count expectation is.
    
    Question: {question}
    CHAT HISTORY : {chat_history}
   Context:
    {context.strip()}

    Answer:
    """

    results = ai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful machine learning bot."},
            {"role": "user", "content": metaprompt},
        ],
    )

    return results

In [7]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
import json
def to_numeric(F6):
    if F6 == "?":
        return np.nan
    else:
        return int(F6)

def file_preprocess(filename):
  df = pd.read_csv(filename)
#   df['F6'] = df['F6'].apply(to_numeric)
#   mean_F6 = df['F6'].mean()
#   df['F6'] = df['F6'].fillna(mean_F6)
  return df
def runner(json_file, model_file):
    # Load parameters from JSON
    with open(json_file, 'r') as file:
        parameters = json.load(file)
    with open(model_file, 'r') as file:
        model_param = json.load(file)

    model_name = parameters['model_name']
    df = file_preprocess(parameters['filename'])

    #Check for target_variable is present or not
    target_variable = parameters.get("target_variable", None)
    if target_variable is None:
      raise ValueError("Target variable not specified in the parameters.")

    X = df.drop(columns=[target_variable])
    y = df[target_variable]

  

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=parameters['split'], random_state=42)

    def_param = {
      "decision_tree" : {"param_dict" : "default_decision_tree_parameters", "lib_name" : "DecisionTreeClassifier"},
      "svm" : {"param_dict" : "default_svm_parameters", "lib_name" : "SVC"},
      "lr" : {"param_dict" : "default_lr_parameters", "lib_name" : "LogisticRegression"}
    }

    params= def_param[model_name]["param_dict"]
    param = model_param[params]
    lib_name = def_param[model_name]["lib_name"]
  # print(param, lib_name)

  # Merge default and user-provided parameters
    merged_parameters = {**eval(str(param)), **parameters.get("param", {})}
  # print(merged_parameters)

  # Initialize the Decision Tree model with the merged parameters
    model = eval(lib_name)(**merged_parameters)

  # Train the Decision Tree model
    model.fit(X_train, y_train)
    para= model.get_params()
  # Make predictions on the test set
    y_pred = model.predict(X_test)

  # Evaluate the model
    #accuracy = accuracy_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
#     print(f"Accuracy: {acc}")
    cr = classification_report(y_test,y_pred)
#     print(cr)
    cf_matrix = confusion_matrix(y_test, y_pred)
#     print(cf_matrix)
    
    mmy_dict = {"accuracy" : str(acc), "class_report" : str(cr), "conf_mat" : str(cf_matrix),"paramters" : str(para)}
    print(mmy_dict)
    return mmy_dict
    #print(f"Accuracy: {accuracy}")

    #return accuracy
    #print(merged_parameters)
  # print(param)
  # print(eval(param))
  # print(parameters["param"])
def get_completion(prompt, model="gpt-3.5-turbo"):
    messages = [{"role": "user", "content": prompt}]
    response = ai.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0, # this is the degree of randomness of the model's output
    )
    return response
def make_json(chat_history1):
    prompt = f"""
    read the chat history between the user and the chatbot and create a dictionary of the model parameters finalized by them. include filename and append the dataset filename with .csv extension. create  a dictionary named param (which would be the parameters of the model) and write all the parameters asked by the user in the datatype of what the function requires. 
    include 'target_variable' as mentioned in the prompt. and 'split' should be 0.2 unless some other value is specified. an example for svm model might look like this (with curly brackets instead of sqauare brackets) also make sure you write the model name compatible to the sklearn libraries:
    [
    
    "filename" : "<file_name>.csv",
    "model_name" : "decision_tree",
    "param": [
        "max_depth": 3
    ],
    "target_variable": "Class",
    "split" : <0.2 or 0.3>
    ]
    text = ```{chat_history1}```
    """
    json_objects = get_completion(prompt)
    # json_objects = rag(chat_history1, 'return the json object')
    json_object = json_objects.choices[0].message.content

    print(json_object)
    data_dict = json.loads(json_object)
    json_file_path = "sample.json"
    with open(json_file_path, 'w') as json_file:
        json.dump(data_dict, json_file,indent=2)
    result = runner("sample.json","model_parameters.json")
    return result


In [9]:
from IPython.display import display
import ipywidgets as widgets
chat_history = []

def on_submit(_):
    query = input_box.value
    

    if query.lower() == 'exit':
        print("Thank you for using the State of the Union chatbot!")
        return
    # if query.lower() == 'done':

    #     human_tex = chat_history
    #     eval_metrics= make_json(human_tex)
    #     chat_history.append(('what is the accuracy?', f'The resulting accuracy is {eval_metrics}'))
    #     # chat_history.append(('what is the resulting accuracy?', 'the resulting accuracy is '+str(eval_metrics)))
    #     return
    
    response = rag(chat_history, query)
    result = response.choices[0].message.content
    if result == '-1':
        chat_history.append((query, 'building model'))
        # chat_history.append(('what is the resulting accuracy?', 'the resulting accuracy is '+str(eval_metrics)))
        human_tex = chat_history
        eval_metrics= make_json(human_tex)
        # chat_history.append(('what is the accuracy?', f'The resulting accuracy is {eval_metrics}'))
        result = 'The following are the model params and evaluation metrics ' + str(eval_metrics)
        # chat_history.append(('what is the resulting accuracy?', 'the resulting accuracy is '+str(eval_metrics)))

    chat_history.append((query, result))

    display(widgets.HTML(f'<b>User:</b> {query}'))
    display(widgets.HTML(f'<b><font color="blue">Chatbot:</font></b> {result}'))
    input_box.value = ""

print("Welcome to the Transformers chatbot! Type done when you want run the model. Type 'exit' to stop.")

input_box = widgets.Text(placeholder='Please enter your question:')
input_box.on_submit(on_submit)

display(input_box)

Welcome to the Transformers chatbot! Type done when you want run the model. Type 'exit' to stop.


  input_box.on_submit(on_submit)


Text(value='', placeholder='Please enter your question:')

HTML(value='<b>User:</b> what is gini index?')

HTML(value='<b><font color="blue">Chatbot:</font></b> The Gini index is a metric used in decision tree models …

HTML(value='<b>User:</b> build a decision_tree model for iris.csv dataset where target variable is Species')

HTML(value='<b><font color="blue">Chatbot:</font></b> To build a decision_tree model for the iris.csv dataset …

{
    "filename": "iris.csv",
    "model_name": "decision_tree",
    "param": {
        "criterion": "gini"
    },
    "target_variable": "Species",
    "split": 0.2
}
{'accuracy': '1.0', 'class_report': '                 precision    recall  f1-score   support\n\n    Iris-setosa       1.00      1.00      1.00        10\nIris-versicolor       1.00      1.00      1.00         9\n Iris-virginica       1.00      1.00      1.00        11\n\n       accuracy                           1.00        30\n      macro avg       1.00      1.00      1.00        30\n   weighted avg       1.00      1.00      1.00        30\n', 'conf_mat': '[[10  0  0]\n [ 0  9  0]\n [ 0  0 11]]', 'paramters': "{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}"}


HTML(value='<b>User:</b> please return -1')

HTML(value='<b><font color="blue">Chatbot:</font></b> The following are the model params and evaluation metric…

In [6]:
model = LogisticRegression()
model.get_params

<bound method BaseEstimator.get_params of LogisticRegression()>

In [7]:
chat_history

[('what is logisticregression',
  'Logistic regression is a type of supervised learning algorithm used for binary classification. It is commonly used when the dependent variable is categorical. The logistic regression model predicts the probability of the occurrence of the event, given the independent variables. The output of logistic regression is a binary outcome, meaning it predicts the probability of the event belonging to one of the two classes. It uses the logistic function to model the relationship between the independent variables and the dependent variable. The logistic regression model can be fitted on a training dataset, and then used to predict the probabilities of the classes for new data points.'),
 ('what are its parameters',
  'The parameters of the given model are as follows:\n\n- "filename": Specifies the name of the CSV file to be used for training the model.\n- "model_name": Specifies the type of model to be built, which in this case is "decision_tree".\n- "param": 

In [17]:
chat_history

[('what is entropy?',
  'Entropy is a measure of impurity in a set of data. In the context of machine learning, entropy is used as a criterion to measure the quality of a split when constructing decision trees. It quantifies the amount of uncertainty or randomness in the data.'),
 ('build a decision tree model with that for breast-cancer-wisconsin dataset, the target variable is Class',
  'building model'),
 ('build a decision tree model with that for breast-cancer-wisconsin dataset, the target variable is Class',
  'The resulting accuracy is 0.9571428571428572'),
 ('maybe build the model again but this time with log loss criterion',
  'building model'),
 ('maybe build the model again but this time with log loss criterion',
  'The resulting accuracy is 0.9642857142857143'),
 ('now again build that model with gini index', 'building model'),
 ('now again build that model with gini index',
  'The resulting accuracy is 0.9642857142857143'),
 ('can you enlist all the criterea with their cor