## Load and Exploration of Data

In [25]:
import pandas as pd
def load_and_explore_csv(file_path, text_column, label_column, num_rows_preview=5):
    try:
        # Load the dataset
        data = pd.read_csv(file_path)

        # Validate required columns
        if text_column not in data.columns or label_column not in data.columns:
            return {"error": f"Columns '{text_column}' and '{label_column}' must exist in the dataset."}

        # General information
        num_rows = len(data)
        num_columns = len(data.columns)
        column_names = data.columns.tolist()

        # Null values
        null_values = data.isnull().sum().to_dict()
        null_percent = {col: f"{(val / num_rows) * 100:.2f}%" for col, val in null_values.items()}

        # Label distribution
        label_counts = data[label_column].value_counts().to_dict()
        total_labels = sum(label_counts.values())
        label_distribution = {k: f"{(v / total_labels) * 100:.2f}%" for k, v in label_counts.items()}

        # Text statistics
        text_lengths = data[text_column].dropna().apply(lambda x: len(str(x).split()))
        text_stats = {
            "average_length": text_lengths.mean(),
            "min_length": text_lengths.min(),
            "max_length": text_lengths.max(),
        }

        # Preview the dataset
        preview = data[[text_column, label_column]].head(num_rows_preview).to_dict(orient="records")

        return {
            "status": "success",
            "preview": preview,
            "general_info": {
                "num_rows": num_rows,
                "num_columns": num_columns,
                "column_names": column_names,
            },
            "null_values": {
                "counts": null_values,
                "percentages": null_percent,
            },
            "label_distribution": {
                "counts": label_counts,
                "percentages": label_distribution,
            },
            "text_statistics": text_stats,
        }
    except Exception as e:
        return {"error": str(e)}

In [28]:
arguments = {
  "file_path": "data/archive (1)/IMDB Dataset.csv",
  "text_column": "review",
  "label_column": "sentiment",
  "num_rows_preview": 5
}
result = load_and_explore_csv(**arguments)

In [29]:
result

{'status': 'success',
 'preview': [{'review': "One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would sa

## Preprocess Data

In [30]:
import re
import pandas as pd

def preprocess_text_data_with_options(
    file_path,
    text_column,
    label_column,
    handle_missing="drop",  # Options: 'drop', 'fill'
    fill_value="unknown",  # Used if handle_missing='fill'
    clean_text_options=None,  # Dict for cleaning options
    output_file="preprocessed_data.csv"
):
    """
    Preprocess text data with customizable options for missing values and text cleaning.
    """
    try:
        # Load the dataset
        data = pd.read_csv(file_path)
        
        # Validate required columns
        if text_column not in data.columns or label_column not in data.columns:
            return {"error": f"Columns '{text_column}' and '{label_column}' must exist in the dataset."}
        
        # Handle missing values
        if handle_missing == "drop":
            data = data.dropna(subset=[text_column, label_column])
        elif handle_missing == "fill":
            data[text_column] = data[text_column].fillna(fill_value)
            data[label_column] = data[label_column].fillna(fill_value)
        else:
            return {"error": f"Invalid option for handle_missing: {handle_missing}"}
        
        # Default text cleaning options
        if clean_text_options is None:
            clean_text_options = {
                "lowercase": True,
                "remove_punctuation": True,
                "remove_numbers": True,
                "remove_extra_spaces": True
            }
        
        # Text cleaning function
        def clean_text(text):
            if clean_text_options.get("lowercase", False):
                text = text.lower()
            if clean_text_options.get("remove_punctuation", False):
                text = re.sub(r"[^\w\s]", "", text)
            if clean_text_options.get("remove_numbers", False):
                text = re.sub(r"\d+", "", text)
            if clean_text_options.get("remove_extra_spaces", False):
                text = re.sub(r"\s+", " ", text)
            return text.strip()
        
        # Apply text cleaning
        data[text_column] = data[text_column].apply(clean_text)
        
        # Save the cleaned dataset
        data.to_csv(output_file, index=False)
        
        return {
            "status": "success",
            "message": "Data preprocessing complete.",
            "output_file": output_file,
            "num_rows": len(data),
            "cleaning_options_used": clean_text_options,
            "missing_value_handling": handle_missing
        }
    except Exception as e:
        return {"error": str(e)}

In [32]:
arguments = {
  "file_path": "data/archive (1)/IMDB Dataset.csv",
  "text_column": "review",
  "label_column": "sentiment",
  "handle_missing": "drop",
  "fill_value": "",
  "clean_text_options": {
    "lowercase": True,
    "remove_punctuation": True,
    "remove_numbers": True,
    "remove_extra_spaces": True
  },
  "output_file": "data/preprocessed_imdb_dataset.csv"
}

preprocess_text_data_with_options(**arguments)

{'status': 'success',
 'message': 'Data preprocessing complete.',
 'output_file': 'data/preprocessed_imdb_dataset.csv',
 'num_rows': 50000,
 'cleaning_options_used': {'lowercase': True,
  'remove_punctuation': True,
  'remove_numbers': True,
  'remove_extra_spaces': True},
 'missing_value_handling': 'drop'}

## Data Preparation

In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd

# Create a dictionary to store prepared datasets by variable name
prepared_data_storage = {}

def prepare_data_for_cnn(
    file_path,
    text_column,
    label_column,
    variable_name,
    num_words=10000,
    max_length=100,
    padding_type="post",
    truncating_type="post",
    test_size=0.2,
    validation_split=None,  # Option to split training into train/validation
    label_encoding="one-hot",  # Options: "one-hot", "integer"
    oov_token="<OOV>"
):
    """
    Prepare text data for CNN training with customizable options and store it with a variable name.
    """
    global prepared_data_storage

    try:
        # Load the dataset
        data = pd.read_csv(file_path)
        
        # Tokenize text
        tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
        tokenizer.fit_on_texts(data[text_column])
        sequences = tokenizer.texts_to_sequences(data[text_column])
        word_index = tokenizer.word_index
        
        # Pad sequences
        padded_sequences = pad_sequences(
            sequences, maxlen=max_length, padding=padding_type, truncating=truncating_type
        )
        
        # Encode labels
        if label_encoding == "one-hot":
            labels = pd.get_dummies(data[label_column]).values
        elif label_encoding == "integer":
            labels = data[label_column].astype("category").cat.codes.values
        else:
            return {"error": f"Invalid label_encoding: {label_encoding}"}
        
        # Split data
        if validation_split:
            X_train, X_test, y_train, y_test = train_test_split(
                padded_sequences, labels, test_size=test_size + validation_split, random_state=42
            )
            val_size = validation_split / (test_size + validation_split)
            X_train, X_val, y_train, y_val = train_test_split(
                X_train, y_train, test_size=val_size, random_state=42
            )
            prepared_data_storage[variable_name] = {
                "X_train": X_train,
                "X_val": X_val,
                "X_test": X_test,
                "y_train": y_train,
                "y_val": y_val,
                "y_test": y_test,
                "tokenizer": tokenizer,
                "word_index": word_index,
            }
        else:
            X_train, X_test, y_train, y_test = train_test_split(
                padded_sequences, labels, test_size=test_size, random_state=42
            )
            prepared_data_storage[variable_name] = {
                "X_train": X_train,
                "X_test": X_test,
                "y_train": y_train,
                "y_test": y_test,
                "tokenizer": tokenizer,
                "word_index": word_index,
            }
        
        return {"status": "success", "variable_name": variable_name}
    except Exception as e:
        return {"error": str(e)}


{'status': 'success', 'variable_name': 'prepared_data'}

In [34]:
arguments = {
  "file_path": "data/preprocessed_imdb_dataset.csv",
  "text_column": "review",
  "label_column": "sentiment",
  "variable_name": "data",
  "num_words": 5000,
  "max_length": 200,
  "padding_type": "post",
  "truncating_type": "post",
  "test_size": 0.2,
  "validation_split": 0.1,
  "label_encoding": "one-hot",
  "oov_token": "<OOV>"
}

prepare_data_for_cnn(**arguments)

{'status': 'success', 'variable_name': 'data'}

## Train Model

In [18]:
def train_cnn_model_chatgpt(
    variable_name: str,
    embedding_dim: int = 100,
    model_layers: list = None,
    optimizer: str = "adam",
    learning_rate: float = 0.001,
    epochs: int = 10,
    batch_size: int = 32,
    validation_data: bool = True,
    early_stopping: bool = False,
    patience: int = 3
) -> dict:
    """
    Train a CNN model dynamically with user-defined architecture for OpenAI function calling.

    Args:
        variable_name (str): Name of the variable containing preprocessed data.
        embedding_dim (int): Dimension of embedding layer.
        model_layers (list): List of dictionaries defining layer configurations.
        optimizer (str): Optimizer to use ('adam', 'sgd', 'rmsprop').
        learning_rate (float): Learning rate for the optimizer.
        epochs (int): Number of training epochs.
        batch_size (int): Batch size for training.
        validation_data (bool): Whether to use validation data during training.
        early_stopping (bool): Whether to enable early stopping.
        patience (int): Number of epochs to wait before early stopping.

    Returns:
        dict: A structured response with training results or errors.
    """
    try:
        # Access the data
        if variable_name not in prepared_data_storage:
            return {"status": "error", "message": f"Variable '{variable_name}' not found in storage."}
        
        data = prepared_data_storage[variable_name]
        X_train, y_train = data["X_train"], data["y_train"]
        X_val, y_val = data.get("X_val"), data.get("y_val")
        vocab_size = len(data["word_index"]) + 1

        # Default model layers
        if model_layers is None or model_layers is []:
            model_layers = [
                {"type": "Conv1D", "filters": 128, "kernel_size": 5, "activation": "relu"},
                {"type": "MaxPooling1D", "pool_size": 2},
                {"type": "Flatten"},
                {"type": "Dense", "units": 128, "activation": "relu"},
                {"type": "Dropout", "rate": 0.5},
                {"type": "Dense", "units": 1, "activation": "sigmoid"}
            ]
        
        # Build the model
        from tensorflow.keras.models import Sequential
        from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
        from tensorflow.keras.optimizers import Adam, SGD, RMSprop
        from tensorflow.keras.callbacks import EarlyStopping

        model = Sequential()
        model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=X_train.shape[1]))
        
        for layer in model_layers:
            if layer["type"] == "Conv1D":
                model.add(Conv1D(filters=layer["filters"], kernel_size=layer["kernel_size"], activation=layer["activation"]))
            elif layer["type"] == "MaxPooling1D":
                model.add(MaxPooling1D(pool_size=layer["pool_size"]))
            elif layer["type"] == "Flatten":
                model.add(Flatten())
            elif layer["type"] == "Dense":
                model.add(Dense(units=layer["units"], activation=layer["activation"]))
            elif layer["type"] == "Dropout":
                model.add(Dropout(rate=layer["rate"]))
            else:
                return {"status": "error", "message": f"Unsupported layer type: {layer['type']}"}
        
        # Optimizer selection
        if optimizer.lower() == "adam":
            opt = Adam(learning_rate=learning_rate)
        elif optimizer.lower() == "sgd":
            opt = SGD(learning_rate=learning_rate)
        elif optimizer.lower() == "rmsprop":
            opt = RMSprop(learning_rate=learning_rate)
        else:
            return {"status": "error", "message": f"Unsupported optimizer: {optimizer}"}

        # Compile the model
        model.compile(optimizer=opt, loss="binary_crossentropy", metrics=["accuracy"])

        # Configure callbacks
        callbacks = []
        if early_stopping:
            callbacks.append(EarlyStopping(monitor="val_loss", patience=patience, restore_best_weights=True))
        
        # Train the model
        if validation_data and X_val is not None:
            history = model.fit(
                X_train, y_train,
                validation_data=(X_val, y_val),
                epochs=epochs,
                batch_size=batch_size,
                callbacks=callbacks
            )
        else:
            history = model.fit(
                X_train, y_train,
                epochs=epochs,
                batch_size=batch_size,
                callbacks=callbacks
            )

        # Save the trained model
        model_variable_name = f"{variable_name}_model"
        prepared_data_storage[model_variable_name] = model

        return {
            "status": "success",
            "message": "Model training complete.",
            "model_variable_name": model_variable_name,
            "history": history.history
        }
    except Exception as e:
        return {"status": "error", "message": str(e)}


In [21]:
# Output of Chatgpt unction calling
arguments = {
    "variable_name": "prepared_data",
    "embedding_dim": 128,
    "model_layers": [
        {"type": "Conv1D", "filters": 64, "kernel_size": 5, "activation": "relu", "units": 0, "rate": 0, "pool_size": 0},
        {"type": "MaxPooling1D", "filters": 0, "kernel_size": 0, "activation": "", "units": 0, "rate": 0.5, "pool_size": 2},
        {"type": "Conv1D", "filters": 32, "kernel_size": 3, "activation": "relu", "units": 0, "rate": 0, "pool_size": 0},
        {"type": "MaxPooling1D", "filters": 0, "kernel_size": 0, "activation": "", "units": 0, "rate": 0.5, "pool_size": 2},
        {"type": "Flatten", "filters": 0, "kernel_size": 0, "activation": "", "units": 0, "rate": 0, "pool_size": 0},
        {"type": "Dense", "filters": 0, "kernel_size": 0, "activation": "relu", "units": 64, "rate": 0.5, "pool_size": 0},
        {"type": "Dense", "filters": 0, "kernel_size": 0, "activation": "softmax", "units": 1, "rate": 0, "pool_size": 0}
    ],
    "optimizer": "adam",
    "learning_rate": 0.001,
    "epochs": 10,
    "batch_size": 32,
    "validation_data": True,
    "early_stopping": True,
    "patience": 3
}

# Prepare the input for your function
model_layers_cleaned = [
    {k: v for k, v in layer.items() if k in {"type", "filters", "kernel_size", "activation", "units", "rate", "pool_size"}}
    for layer in arguments["model_layers"]
]

# Call the function
result = train_cnn_model_chatgpt(
    variable_name=arguments["variable_name"],
    embedding_dim=arguments["embedding_dim"],
    model_layers=model_layers_cleaned,
    optimizer=arguments["optimizer"],
    learning_rate=arguments["learning_rate"],
    epochs=arguments["epochs"],
    batch_size=arguments["batch_size"],
    validation_data=arguments["validation_data"],
    early_stopping=arguments["early_stopping"],
    patience=arguments["patience"]
)

# Output the result
print(result)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
{'status': 'success', 'message': 'Model training complete.', 'model_variable_name': 'prepared_data_model', 'history': {'loss': [0.4690515100955963, 0.26562759280204773, 0.13569337129592896, 0.0425708033144474, 0.02668195776641369], 'accuracy': [0.49841418862342834, 0.49841418862342834, 0.49841418862342834, 0.49841418862342834, 0.49841418862342834], 'val_loss': [0.39600610733032227, 0.3940882086753845, 0.4806673228740692, 0.8704310655593872, 0.8719384074211121], 'val_accuracy': [0.4955858290195465, 0.4955858290195465, 0.4955858290195465, 0.4955858290195465, 0.4955858290195465]}}


In [24]:
arguments = {
  "variable_name": "prepared_data",
  "embedding_dim": 128,
  "model_layers": [
    
    {
      "type": "Conv1D",
      "filters": 128,
      "kernel_size": 5,
      "activation": "relu",
      "units": 0,
      "rate": 0,
      "pool_size": 0
    },
    {
      "type": "MaxPooling1D",
      "filters": 0,
      "kernel_size": 0,
      "activation": "",
      "units": 0,
      "rate": 0,
      "pool_size": 2
    },
    {
      "type": "Dropout",
      "filters": 0,
      "kernel_size": 0,
      "activation": "",
      "units": 0,
      "rate": 0.5,
      "pool_size": 0
    },
    {
      "type": "Dense",
      "filters": 0,
      "kernel_size": 0,
      "activation": "relu",
      "units": 10,
      "rate": 0,
      "pool_size": 0
    },
    {
      "type": "Dense",
      "filters": 0,
      "kernel_size": 0,
      "activation": "sigmoid",
      "units": 1,
      "rate": 0,
      "pool_size": 0
    }
  ],
  "optimizer": "adam",
  "learning_rate": 0.001,
  "epochs": 5,
  "batch_size": 64,
  "validation_data": True,
  "early_stopping": False,
  "patience": 0
}

# Prepare the input for your function
model_layers_cleaned = [
    {k: v for k, v in layer.items() if k in {"type", "filters", "kernel_size", "activation", "units", "rate", "pool_size"}}
    for layer in arguments["model_layers"]
]

# Call the function
result = train_cnn_model_chatgpt(
    variable_name=arguments["variable_name"],
    embedding_dim=arguments["embedding_dim"],
    model_layers=model_layers_cleaned,
    optimizer=arguments["optimizer"],
    learning_rate=arguments["learning_rate"],
    epochs=arguments["epochs"],
    batch_size=arguments["batch_size"],
    validation_data=arguments["validation_data"],
    early_stopping=arguments["early_stopping"],
    patience=arguments["patience"]
)

# Output the result
print(result)


Epoch 1/5
{'status': 'error', 'message': 'in user code:\n\n    File "c:\\Users\\ashis\\anaconda3\\envs\\new_env\\lib\\site-packages\\keras\\src\\engine\\training.py", line 1338, in train_function  *\n        return step_function(self, iterator)\n    File "c:\\Users\\ashis\\anaconda3\\envs\\new_env\\lib\\site-packages\\keras\\src\\engine\\training.py", line 1322, in step_function  **\n        outputs = model.distribute_strategy.run(run_step, args=(data,))\n    File "c:\\Users\\ashis\\anaconda3\\envs\\new_env\\lib\\site-packages\\keras\\src\\engine\\training.py", line 1303, in run_step  **\n        outputs = model.train_step(data)\n    File "c:\\Users\\ashis\\anaconda3\\envs\\new_env\\lib\\site-packages\\keras\\src\\engine\\training.py", line 1081, in train_step\n        loss = self.compute_loss(x, y, y_pred, sample_weight)\n    File "c:\\Users\\ashis\\anaconda3\\envs\\new_env\\lib\\site-packages\\keras\\src\\engine\\training.py", line 1139, in compute_loss\n        return self.compiled_