# Generate Data Preprocessing file

In [57]:
from context_builder import ContextBuilder
from data_preprocessor import DataPreprocessor

builder = ContextBuilder(
    csv_path="gesture_recognition.csv",
    task_description="Classify between two gestures: punch and flex, based on IMU readings from Arduino Nano 33 BLE Sense.",
    feature_description={
        "aX": "Accelerometer X-axis",
        "aY": "Accelerometer Y-axis",
        "aZ": "Accelerometer Z-axis",
        "gX": "Gyroscope X-axis",
        "gY": "Gyroscope Y-axis",
        "gZ": "Gyroscope Z-axis"
    },
    label_description="The 'label' column indicates the gesture performed. Possible values: 'punch', 'flex'.",
    notes="The IMU readings were captured from the LSM9DS1 sensor at ~100Hz. Each row represents a single gesture frame."
)

context = builder.get_context()

In [58]:
from retriever_instance import RetrieverBuilder
builder = RetrieverBuilder()
#builder.ingest_json_chunks(["converted_data.json"])


[RetrieverBuilder] Loading Chroma from: chroma_db
[RetrieverBuilder] Chroma loaded. Ready to retrieve top-4 documents using cosine similarity.


In [59]:
query = f"{context['task']} {context['notes']}"
retriever = builder.get_retriever()
chunks = retriever.get_relevant_documents(query)


In [60]:
chunks

[Document(metadata={'source': 'examples//lite//examples\\gesture_classification\\ios\\README.md', 'chunk_id': '1-3'}, page_content='gestures you trained in step 4 and the app identifies them in realtime! ## Model Used This app uses [MobileNet](https://ai.googleblog.com/2017/06/mobilenets-open-source-models-for.html) model that is trained on 0.25 alpha and at an image size'),
 Document(metadata={'chunk_id': '1-3', 'source': 'examples//lite//examples\\gesture_classification\\ios\\README.md'}, page_content='gestures you trained in step 4 and the app identifies them in realtime! ## Model Used This app uses [MobileNet](https://ai.googleblog.com/2017/06/mobilenets-open-source-models-for.html) model that is trained on 0.25 alpha and at an image size'),
 Document(metadata={'source': 'examples//lite//examples\\gesture_classification\\ios\\README.md', 'chunk_id': '1-3'}, page_content='gestures you trained in step 4 and the app identifies them in realtime! ## Model Used This app uses [MobileNet](

In [61]:
preprocessor = DataPreprocessor(context)
task_suggestions = preprocessor.suggest_tasks()

[LLMWrapper] Initialized Groq LLM: llama3-70b-8192 (temp=0.3)
[RetrieverBuilder] Loading Chroma from: chroma_db
[RetrieverBuilder] Chroma loaded. Ready to retrieve top-5 documents using cosine similarity.


In [62]:
from data_preprocessor import PreprocessingCodeGenerator
codegen = PreprocessingCodeGenerator(context, task_suggestions, chunks)

[LLMWrapper] Initialized Groq LLM: llama3-70b-8192 (temp=0.3)


In [63]:
for task, description in task_suggestions.items():
    print(f"\n# === {task.upper()} ===")
    print(codegen.generate_code(task, description))


# === HANDLE_UNNAMED_COLUMN ===
content="df.drop('Unnamed: 0', axis=1, inplace=True)" additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 16, 'prompt_tokens': 833, 'total_tokens': 849, 'completion_time': 0.07041316, 'prompt_time': 0.034952327, 'queue_time': 0.093363054, 'total_time': 0.105365487}, 'model_name': 'llama3-70b-8192', 'system_fingerprint': 'fp_dd4ae1c591', 'finish_reason': 'stop', 'logprobs': None} id='run-a5011a5b-442f-455a-8c5d-440054ffe7a7-0' usage_metadata={'input_tokens': 833, 'output_tokens': 16, 'total_tokens': 849}

# === DROP_UNNECESSARY_FEATURES ===
content="def drop_unnecessary_features(df):\n    df = df.drop(['Unnamed: 0'], axis=1)\n    return df" additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 26, 'prompt_tokens': 834, 'total_tokens': 860, 'completion_time': 0.080690938, 'prompt_time': 0.034916978, 'queue_time': 0.09315692799999999, 'total_time': 0.115607916}, 'model_name': 'llama3-70b-8192', 'system_finger

In [64]:
import importlib


In [65]:
import merger
from merger import CodeMergerAndFixer

In [66]:
# Step 1: Collect generated codes
task_code_blocks = {}
for task, description in task_suggestions.items():
    code = codegen.generate_code(task, description)
    if hasattr(code, "content"):
        code = code.content
    task_code_blocks[task] = code

# Step 2: Merge and fix using LLM
merger = CodeMergerAndFixer(context , task_code_blocks)

[LLMWrapper] Initialized Groq LLM: llama3-70b-8192 (temp=0.2)


In [67]:
full_code = merger.merge_with_llm()

In [68]:
print(full_code)

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline

df = pd.read_csv('gesture_recognition.csv')

df = df.rename(columns={'Unnamed: 0': 'frame_id'})
df = df.drop_duplicates(subset='frame_id')

df = df.drop(['Unnamed: 0'], axis=1)

scaler = StandardScaler()
df[['aX', 'aY', 'aZ', 'gX', 'gY', 'gZ']] = scaler.fit_transform(df[['aX', 'aY', 'aZ', 'gX', 'gY', 'gZ']])

df = df.drop('Unnamed: 0', axis=1)
y = df['label']

oversample = RandomOverSampler(sampling_strategy='auto', random_state=42)
pipeline = Pipeline([('oversample', oversample)])
pipeline.fit_resample(X, y)

df['aMagnitude'] = (df['aX']**2 + df['aY']**2 + df['aZ']**2)**0.5
df['gMagnitude'] = (df['gX']**2 + df['gY']**2 + df['gZ']**2)**0.5

df['aMeanX'] = df['aX'].rolling(window=10).mean()
df['aMeanY'] = df['aY'].rolling(window=10).mean()
df['aMeanZ'] = df['aZ'].rolling(wind

In [69]:
import importlib
import auto_code_executor
importlib.reload(auto_code_executor)
from auto_code_executor import AutoCodeExecutor


In [70]:
executor = AutoCodeExecutor(full_code)

[LLMWrapper] Initialized Groq LLM: llama3-70b-8192 (temp=0.2)


In [71]:
success = executor.try_execute()

🛠 Attempt 1...
🔎 Current Code Preview:
 import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline

df = pd.read_csv('gesture_recognition.csv')

df = df.rename(columns={'Unnamed: 0': 'f
❌ Error detected: "['Unnamed: 0'] not found in axis"
🔧 LLM generated a fixed version!
🛠 Attempt 2...
🔎 Current Code Preview:
 ```
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline

df = pd.read_csv('gesture_recognition.csv')

df = df.rename(columns={'Unnamed: 0'
❌ Error detected: "['Unnamed: 0'] not found in axis"
🔧 LLM generated a fixed version!
🛠 Attempt 3...
🔎 Current Code Preview:
 ```
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection impo

## Data Preprocesing Pipeline

In [72]:
import importlib
import tools
importlib.reload(tools)

<module 'tools' from 'c:\\Users\\Moham\\OneDrive\\Bureau\\Eurecom\\Semester_Project\\src\\tools.py'>

In [73]:
from tools import (
    load_context,          # 1️⃣
    suggest_preprocessing, # 2️⃣
    generate_code,         # 3️⃣
    merge_snippets,        # 4️⃣
    run_pipeline_from_file       # 5️⃣
)

In [74]:
context_summary = load_context.invoke({
    "csv_path":          "gesture_recognition.csv",
    "task_description":  "Classify IMU frames into 'punch' vs 'flex'",
    "feature_desc_json": '{"aX":"acc-X","aY":"acc-Y","aZ":"acc-Z","gX":"gyro-X","gY":"gyro-Y","gZ":"gyro-Z"}',
    "label_desc":        "gesture: 0 = punch, 1 = flex",
    "notes":             "LSM9DS1 sensor (~100 Hz). One row = one frame."
})
print(context_summary)


Task: Classify IMU frames into 'punch' vs 'flex'
Features:
  - aX: acc-X
  - aY: acc-Y
  - aZ: acc-Z
  - gX: gyro-X
  - gY: gyro-Y
  - gZ: gyro-Z
Label Info: gesture: 0 = punch, 1 = flex
Notes: LSM9DS1 sensor (~100 Hz). One row = one frame.
Sample Data:
  - {'Unnamed: 0': 0, 'aX': 0.067, 'aY': 0.904, 'aZ': 2.018, 'gX': 70.435, 'gY': -10.315, 'gZ': 1.892, 'label': 'punch'}
  - {'Unnamed: 0': 1, 'aX': -0.045, 'aY': 1.269, 'aZ': 2.24, 'gX': 83.191, 'gY': -11.292, 'gZ': 9.583, 'label': 'punch'}
  - {'Unnamed: 0': 2, 'aX': -0.276, 'aY': 1.609, 'aZ': 2.364, 'gX': 94.849, 'gY': -5.249, 'gZ': 12.451, 'label': 'punch'}
  - {'Unnamed: 0': 3, 'aX': -0.564, 'aY': 2.0, 'aZ': 2.359, 'gX': 111.023, 'gY': 2.625, 'gZ': 7.874, 'label': 'punch'}
  - {'Unnamed: 0': 4, 'aX': -0.845, 'aY': 2.311, 'aZ': 2.335, 'gX': 137.878, 'gY': 10.681, 'gZ': -2.197, 'label': 'punch'}


In [75]:
suggest_json = suggest_preprocessing.invoke({"top_k":4})

[LLMWrapper] Initialized Groq LLM: llama3-70b-8192 (temp=0.3)
[RetrieverBuilder] Loading Chroma from: chroma_db
[RetrieverBuilder] Chroma loaded. Ready to retrieve top-4 documents using cosine similarity.


In [26]:
print(suggest_json)

{
  "Drop Unnamed Column": "",
  "Convert Label to Numeric": "",
  "Handle Missing Values": "",
  "Feature Scaling": "",
  "Data Split": "",
  "Data Augmentation": ""
}


In [76]:
import json, ast
task_order = list(json.loads(suggest_json))   # keep original order
success    = True
for task in task_order:
    print(f"\n### Generating snippet for →  {task}")
    snippet_preview = generate_code(task)            # (tool stores snippet)
    print(snippet_preview)                           # first 220 chars

    # merge *all* snippets collected so far
    merge_msg = merge_snippets("")                   # dummy arg required
    print(merge_msg)



### Generating snippet for →  Drop Unnamed Column
[LLMWrapper] Initialized Groq LLM: llama3-70b-8192 (temp=0.3)
[Drop Unnamed Column] snippet stored (533 chars):
content="df.drop('Unnamed: 0', axis=1, inplace=True)" additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 16, 'prompt_tokens': 667, 'total_tokens': 683, 'completion_time': 0.071793043, 'prompt_ti …
[LLMWrapper] Initialized Groq LLM: llama3-70b-8192 (temp=0.2)
Merged 1 snippets → generated_code\merged_pipeline.py

### Generating snippet for →  Convert Label to Numerical
[LLMWrapper] Initialized Groq LLM: llama3-70b-8192 (temp=0.3)
[Convert Label to Numerical] snippet stored (623 chars):
content="Here is the Python code to convert the label to numerical:\n```\ndf['label'] = df['label'].map({'punch': 0, 'flex': 1})\n```" additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 39, 'pro …
[LLMWrapper] Initialized Groq LLM: llama3-70b-8192 (temp=0.2)
Merged 2 snippets → generated_code\m

### Run the merged_pipeline.py to be viewed by the preprocess module

## Generate Preprocess function

In [89]:
import os
os.chdir("src")
print(os.getcwd())


c:\Users\Moham\OneDrive\Bureau\Eurecom\Semester_Project\src


In [98]:
!python generated_code/merged_pipeline.py


In [101]:
from generated_code.merged_pipeline import add_noise


ImportError: cannot import name 'add_noise' from 'generated_code.merged_pipeline' (c:\Users\Moham\OneDrive\Bureau\Eurecom\Semester_Project\src\generated_code\merged_pipeline.py)

In [91]:

import sys
# Add src/ to sys.path so 'generated_code' is importable
sys.path.insert(0, os.getcwd())

In [None]:
import importlib
import tools
importlib.reload(tools)
import auto_code_executor
importlib.reload(auto_code_executor)
importlib.reload(PreprocessFunctionBuilder)

import generated_code.merged_pipeline
from data_preprocessor import PreprocessFunctionBuilder

from tools import build_preprocess_module

build_preprocess_module.invoke({
    "pipeline_path": "generated_code/merged_pipeline.py",
    "csv_sample": "gesture_recognition.csv"  # optional
})

[LLMWrapper] Initialized Groq LLM: llama3-70b-8192 (temp=0.1)
[LLMWrapper] Initialized Groq LLM: llama3-70b-8192 (temp=0.2)
🛠 Attempt 1...
🔎 Current Code Preview:
 import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

```
from generated_code.merged_pipeline import add_noise

def preprocess(df):
    df.drop('Unnamed: 0, axis=1, inplace=True)
    df['label'] = d
❌ Error detected: unterminated string literal (detected at line 10) (<string>, line 10)
🔁 LLM fixed preprocess function using external imports.
🛠 Attempt 2...
🔎 Current Code Preview:
 Here is the corrected Python code:

```
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from generated_code.merged_pipeline import add_noise

def preprocess(df):
    df.drop(columns='Unnamed: 0
❌ Error detected: invalid syntax (<string>, line 2

'❌ Code generation failed after auto-fixing attempts. Final version saved.'

In [94]:
import inspect
print(inspect.getsource(AutoCodeExecutor))


class AutoCodeExecutor:
    def __init__(self, code: str, max_attempts: int = 5):
        self.original_code = code
        self.code = code
        self.max_attempts = max_attempts
        self.llm = LLMWrapper(temperature=0.2).get_llm()

    def try_execute(self):
        attempt = 0

        while attempt < self.max_attempts:
            try:
                print(f"🛠 Attempt {attempt + 1}...")
                print("🔎 Current Code Preview:\n", self.code[:300])  # Preview first 300 chars

                self.code = self.clean_code(self.code)  # clean each time
                compiled_code = compile(self.code, "<string>", "exec")
                exec(compiled_code, globals())
                
                print("✅ Code executed successfully!")
                return True  # Success
            except Exception as e:
                print(f"❌ Error detected: {e}")
                error_message = traceback.format_exc()

                # Ask LLM to fix the code
                sel

## Generate Training code

In [80]:
from model_trainer_generator import ModelTrainerCodeGenerator
import importlib
import tools
importlib.reload(tools)


<module 'tools' from 'c:\\Users\\Moham\\OneDrive\\Bureau\\Eurecom\\Semester_Project\\src\\tools.py'>

In [81]:
from tools import load_context

load_context.invoke({
    "csv_path": "gesture_recognition.csv",
    "task_description": "Classify between punch and flex using sensor data",
    "feature_desc_json": '{"aX": "acceleration X", "aY": "acceleration Y", "aZ": "acceleration Z", "gX": "gyroscope X", "gY": "gyroscope Y", "gZ": "gyroscope Z"}',
    "label_desc": "0 for punch, 1 for flex",
    "notes": "Gesture recognition with TinyML and sensor fusion"
})


"Task: Classify between punch and flex using sensor data\nFeatures:\n  - aX: acceleration X\n  - aY: acceleration Y\n  - aZ: acceleration Z\n  - gX: gyroscope X\n  - gY: gyroscope Y\n  - gZ: gyroscope Z\nLabel Info: 0 for punch, 1 for flex\nNotes: Gesture recognition with TinyML and sensor fusion\nSample Data:\n  - {'Unnamed: 0': 0, 'aX': 0.067, 'aY': 0.904, 'aZ': 2.018, 'gX': 70.435, 'gY': -10.315, 'gZ': 1.892, 'label': 'punch'}\n  - {'Unnamed: 0': 1, 'aX': -0.045, 'aY': 1.269, 'aZ': 2.24, 'gX': 83.191, 'gY': -11.292, 'gZ': 9.583, 'label': 'punch'}\n  - {'Unnamed: 0': 2, 'aX': -0.276, 'aY': 1.609, 'aZ': 2.364, 'gX': 94.849, 'gY': -5.249, 'gZ': 12.451, 'label': 'punch'}\n  - {'Unnamed: 0': 3, 'aX': -0.564, 'aY': 2.0, 'aZ': 2.359, 'gX': 111.023, 'gY': 2.625, 'gZ': 7.874, 'label': 'punch'}\n  - {'Unnamed: 0': 4, 'aX': -0.845, 'aY': 2.311, 'aZ': 2.335, 'gX': 137.878, 'gY': 10.681, 'gZ': -2.197, 'label': 'punch'}"

In [82]:
from tools import generate_training_code

generate_training_code.invoke({
    "preprocess_module_path": "generated_code/preprocess_module.py"
})


'❌ Failed to import preprocess function: invalid syntax (generated_code/preprocess_module.py, line 1)'

In [4]:
from tools import convert_to_tflite_model
convert_to_tflite_model.invoke({
    "original_model_path": "models/final_model.h5",
    "converted_model_path": "models/final_model.tflite",
    "input_datatype": "float32",
    "output_datatype": "float32",
    "quantization": False
})


[LLMWrapper] Initialized Groq LLM: llama3-70b-8192 (temp=0.3)
[LLMWrapper] Initialized Groq LLM: llama3-70b-8192 (temp=0.2)
🛠 Attempt 1...
🔎 Current Code Preview:
 ```
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import load_model
from tensorflow.lite import TFLiteConverter

try:
    model = load_model('models/final_model.h5', compile=False)
    model.save('models/temp_model', save_format='tf')
    converter = TFLiteConvert
❌ Error detected: cannot import name 'TFLiteConverter' from 'tensorflow.lite' (c:\Users\Moham\OneDrive\Bureau\Eurecom\Semester_Project\tiny_ml\Lib\site-packages\tensorflow\lite\__init__.py)




🔧 LLM generated a fixed version!
🛠 Attempt 2...
🔎 Current Code Preview:
 import tensorflow as tf

try:
    model = tf.keras.models.load_model('models/final_model.h5', compile=False)
    model.save('models/temp_model', save_format='tf')
    converter = tf.lite.TFLiteConverter.from_saved_model('models/temp_model')
except Exception as e:
    print(f"Error occurred: {e}")
  
Error occurred: The `save_format` argument is deprecated in Keras 3. Please remove this argument and pass a file path with either `.keras` or `.h5` extension.Received: save_format=tf
✅ Code executed successfully!


'✅ Model converted to TFLite and saved at models/final_model.tflite.'