In [1]:
import json
import os

import chromadb

import autogen
from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent

from autogen.retrieve_utils import TEXT_FORMATS

# LLM endpoint configurations handed to the agents below. Each entry names the
# model and the base URL that serves it (here a local server on port 11434;
# the api_key value looks like a placeholder — confirm against the server setup).
config_list = [
    {
        "model": "llama3",
        "base_url": "http://localhost:11434/v1",
        "api_key": "ollama",
    },
]

# Fail fast when no endpoint has been configured.
assert config_list
print("models to use: ", [entry["model"] for entry in config_list])

models to use:  ['llama3']


In [2]:
# Show the file extensions listed in TEXT_FORMATS (header line, then the list).
print("Accepted file formats for `docs_path`:", TEXT_FORMATS, sep="\n")

Accepted file formats for `docs_path`:
['txt', 'json', 'csv', 'tsv', 'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml', 'pdf']


In [3]:
# Assistant agent: replies are generated through the endpoints in `config_list`.
assistant = RetrieveAssistantAgent(
    name="assistant",
    system_message="You are a helpful assistant.",
    llm_config=dict(timeout=600, cache_seed=42, config_list=config_list),
)

# Proxy agent that stands in for the human user and drives the conversation
# with the assistant. Code execution is switched off via
# `code_execution_config=False`.
ragproxyagent = RetrieveUserProxyAgent(
    name="ragproxyagent",
    # Never wait for input from a person; the chat proceeds automatically.
    human_input_mode="NEVER",
    # NOTE(review): presumably caps the number of consecutive automatic
    # replies this agent sends — confirm against the AutoGen documentation.
    max_consecutive_auto_reply=3,
    retrieve_config=dict(
        task="code",
        # Two remote markdown documents plus a local docs directory resolved
        # relative to the current working directory.
        docs_path=[
            "https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Examples/Integrate%20-%20Spark.md",
            "https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Research.md",
            os.path.join(os.path.abspath(""), "..", "website", "docs"),
        ],
        custom_text_types=["non-existent-type"],
        chunk_token_size=2000,
        model=config_list[0]["model"],
        vector_db="chroma",
        overwrite=False,
    ),
    code_execution_config=False,
)

  from tqdm.autonotebook import tqdm, trange


In [4]:
# Reset the assistant before starting a new conversation.
assistant.reset()

# The coding question put to the assistant.
code_problem = (
    "How can I use FLAML to perform a classification task and use spark to do parallel training. "
    "Train 30 seconds and force cancel jobs if time limit is reached."
)

# Start the chat from the proxy agent; the opening message is produced by the
# proxy's `message_generator` from `problem`, and `search_string` is passed
# along with it (presumably to filter retrieved documents — confirm in the
# AutoGen documentation).
chat_result = ragproxyagent.initiate_chat(
    assistant,
    message=ragproxyagent.message_generator,
    problem=code_problem,
    search_string="spark",
)

Trying to create collection.


File /Users/gizemkaryagdi/Desktop/Masaüstü /Autogen/../website/docs does not exist. Skipping.
2024-07-12 15:42:42,086 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.[0m
2024-07-12 15:42:42,089 - autogen.agentchat.contrib.vectordb.chromadb - INFO - No content embedding is provided. Will use the VectorDB's embedding function to generate the content embedding.[0m
Number of requested results 20 is greater than number of elements in index 2, updating n_results = 2
Model llama3 not found. Using cl100k_base encoding.


VectorDB returns doc_ids:  [['bdfbc921']]
[32mAdding content of doc bdfbc921 to context.[0m
[33mragproxyagent[0m (to assistant):

You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the
context provided by the user.
If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.
For code generation, you must obey the following rules:
Rule 1. You MUST NOT install any packages because all the packages needed are already installed.
Rule 2. You must follow the formats below to write your code:
```language
# your code
```

User's question is: How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.

Context is: # Integrate - Spark

FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark:

- Use Spark ML estimators for AutoML.
- Use Spark 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[33massistant[0m (to ragproxyagent):

Here is the code to use FLAML to perform a classification task and use Spark to do parallel training. Train for 30 seconds and force cancel jobs if time limit is reached:

```
# Import necessary libraries
import flaml as auto

# Prepare your data in pandas-on-spark format
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("FLAML").getOrCreate()

data = {
    "Square_Feet": [800, 1200, 1800, 1500, 850],
    "Age_Years": [20, 15, 10, 7, 25],
    "Price": [100000, 200000, 300000, 240000, 120000],
}

dataframe = spark.createDataFrame(data)
label = "Price"

# Convert to pandas-on-spark dataframe
from flaml.automl.spark.utils import to_pandas_on_spark
psdf = to_pandas_on_spark(dataframe)

# Use VectorAssembler to merge all feature columns into a single vector column
from pyspark.ml.feature import VectorAssembler
feature_cols = [col for col in psdf.columns if col != label]
featurizer = VectorAssembler(inputCols=feature_cols, outpu