In [1]:
# Cell 1
import re
import warnings
from io import StringIO

import chromadb
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

# optional splitter
try:
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    LANGCHAIN_AVAILABLE = True
except Exception:
    LANGCHAIN_AVAILABLE = False

# NLTK for stemming/lemmatization/stopwords (used only if requested)
import nltk
nltk.download("punkt", quiet=True)
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)
nltk.download("stopwords", quiet=True)
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords


In [2]:
# Cell 2
def load_csv(path_or_buffer):
    """Read a CSV into a DataFrame.

    Accepts anything ``pandas.read_csv`` accepts (a filesystem path or a
    file-like object). Parsing errors from pandas propagate to the caller.
    """
    return pd.read_csv(path_or_buffer)

def preview_random(df, n=5):
    """Return up to ``n`` randomly chosen rows with a fresh 0..k-1 index.

    The sample is seeded (``random_state=42``) so repeated calls on the same
    frame return the same rows. When the frame has fewer than ``n`` rows,
    all of them are returned.
    """
    row_count = len(df)
    sample_size = row_count if row_count < n else n
    picked = df.sample(n=sample_size, random_state=42)
    return picked.reset_index(drop=True)

def show_dtypes(df):
    """Return a two-column frame listing each column label and its dtype.

    Dtypes are read from ``df.dtypes`` rather than ``df[label].dtype`` so the
    function also works when the frame contains duplicate column labels
    (``df[label]`` would then be a DataFrame, which has no ``dtype``).

    Returns:
        DataFrame with columns ``"column"`` and ``"dtype"`` (dtype as str),
        one row per column of ``df`` in original order.
    """
    return pd.DataFrame({"column": df.columns, "dtype": [str(t) for t in df.dtypes]})


In [3]:
# Cell 3
def detect_html_columns(df, max_rows=500):
    """List columns whose first ``max_rows`` values contain something tag-like.

    Every cell is converted to ``str`` and tested against the pattern
    ``<.*?>``; a column is reported as soon as any sampled cell matches.
    """
    head_as_text = df.head(max_rows).astype(str)
    return [
        name
        for name in head_as_text.columns
        if head_as_text[name].str.contains(r"<.*?>", regex=True).any()
    ]

def detect_missing(df):
    """Summarize missing values in ``df``.

    Returns:
        dict with
        - ``"per_column_pct"``: column label -> fraction of missing cells.
          Despite the key name this is a 0..1 fraction (the mean of the
          ``isna`` mask), not a value multiplied by 100.
        - ``"total_missing"``: total number of missing cells (Python int).
        - ``"has_missing"``: Python bool, True when at least one cell is
          missing.
    """
    na_mask = df.isna()
    # Cast once so both the count and the derived flag are plain Python
    # values (comparing a numpy integer would yield numpy.bool_).
    total_missing = int(na_mask.sum().sum())
    return {
        "per_column_pct": na_mask.mean().to_dict(),
        "total_missing": total_missing,
        "has_missing": total_missing > 0,
    }

def detect_duplicates(df):
    """Count fully duplicated rows in ``df``.

    A row counts as a duplicate when an identical row appears earlier in the
    frame (pandas ``duplicated()`` default).

    Returns:
        dict with ``"dup_count"`` (Python int) and ``"has_duplicates"``
        (Python bool).
    """
    # Cast first so the derived flag is a plain bool, not numpy.bool_.
    dup_count = int(df.duplicated().sum())
    return {"dup_count": dup_count, "has_duplicates": dup_count > 0}

def detect_multiline(df, max_rows=500):
    """List columns whose first ``max_rows`` values contain a line break.

    Cells are stringified first; a column is flagged when any sampled cell
    contains a newline or a carriage return.
    """
    as_text = df.head(max_rows).astype(str)
    flagged = []
    for name in as_text.columns:
        has_break = as_text[name].str.contains(r"\n|\r", regex=True)
        if has_break.any():
            flagged.append(name)
    return flagged

def detect_delimiter_problem(df, max_rows=200, threshold=0.6):
    """Heuristically flag a frame that may have been parsed with the wrong delimiter.

    The first ``max_rows`` rows are stringified and, per column, the number
    of cells containing a literal comma is counted. The frame is considered
    suspicious when, for any column, that count is strictly greater than
    ``threshold * number_of_sampled_rows``.

    Args:
        df: frame to inspect.
        max_rows: how many leading rows to sample (default 200, the value
            previously hard-coded).
        threshold: fraction of sampled rows that must contain a comma
            (default 0.6, the value previously hard-coded).

    Returns:
        Python bool. Frames with no rows or no columns yield False.
    """
    sample = df.head(max_rows).astype(str)
    limit = threshold * len(sample)
    # Iterate with .items() so duplicate column labels are handled and an
    # empty frame simply produces no iterations.
    return any(
        int(col.str.contains(",", regex=False).sum()) > limit
        for _, col in sample.items()
    )


In [4]:
# Cell 4
def to_lowercase(df):
    """Return a copy of ``df`` with string cells in text columns lower-cased.

    Only columns selected by ``select_dtypes(include=["object", "string"])``
    are touched. Within those columns only actual ``str`` values are
    lower-cased; missing values and non-string objects are left as they are.
    (Casting the whole column with ``astype(str)`` would turn NaN/None into
    the literal strings "nan"/"none" and hide them from later missing-value
    handling.)
    """
    df2 = df.copy()
    for c in df2.select_dtypes(include=["object", "string"]).columns:
        df2[c] = df2[c].map(lambda v: v.lower() if isinstance(v, str) else v)
    return df2

def remove_html_tags(df, columns=None):
    """Return a copy of ``df`` with HTML markup stripped from the given columns.

    Args:
        df: input frame (not modified).
        columns: column labels to clean. When None, the columns are chosen
            by ``detect_html_columns`` on the copy.

    Each string cell is parsed with BeautifulSoup's ``html.parser`` and
    replaced by its text content. Non-string cells (including NaN/None) are
    passed through untouched so missing values are not converted into the
    text "nan".
    """
    df2 = df.copy()
    if columns is None:
        columns = detect_html_columns(df2)

    def _strip_markup(value):
        if not isinstance(value, str):
            return value
        return BeautifulSoup(value, "html.parser").get_text()

    for c in columns:
        df2[c] = df2[c].map(_strip_markup)
    return df2


In [5]:
# Cell 5
def handle_missing(df, action="none", fill_value="unknown"):
    """Return a copy of ``df`` with missing values handled per ``action``.

    ``"fill"`` replaces missing cells with ``fill_value``; ``"drop"`` removes
    every row containing a missing cell; any other value returns the copy
    unchanged.
    """
    working = df.copy()
    if action == "drop":
        return working.dropna()
    if action == "fill":
        return working.fillna(fill_value)
    return working

def drop_duplicates(df, keep="first"):
    """Return ``df`` without fully duplicated rows.

    ``keep`` is forwarded to ``DataFrame.drop_duplicates`` and decides which
    occurrence of a duplicated row survives.
    """
    deduplicated = df.drop_duplicates(keep=keep)
    return deduplicated

def fix_delimiters(df):
    """Round-trip ``df`` through CSV text and parse it again.

    The frame is serialized with ``to_csv(index=False)`` and re-read with a
    comma separator, double-quote quoting and the Python parsing engine.
    Note this re-parses the already-loaded frame, not the original file text.
    """
    serialized = StringIO(df.to_csv(index=False))
    return pd.read_csv(serialized, sep=",", engine="python", quotechar='"')

def fix_multiline_cells(df):
    """Return ``df`` with every run of CR/LF characters in string cells
    collapsed into a single space."""
    return df.replace(to_replace=r"[\r\n]+", value=" ", regex=True)


In [6]:
# Cell 6
# Shared NLP helpers, created once at cell execution and used by
# normalize_text (stop-word filtering, lemmatization, stemming).
ps = PorterStemmer()  # stemmer applied when stem=True
wnl = WordNetLemmatizer()  # lemmatizer applied when lemmatize=True
# English stop words held in a set; normalize_text tests lower-cased tokens against it.
stop_words = set(stopwords.words("english"))

def normalize_text(s, remove_stopwords=False, stem=False, lemmatize=False):
    """Tokenize ``s`` and optionally filter and normalize its tokens.

    The input is stringified and split with ``word_tokenize``. Enabled steps
    run in a fixed order: stop-word removal (case-insensitive match against
    ``stop_words``), then lemmatization, then stemming. The surviving tokens
    are joined back together with single spaces.
    """
    words = word_tokenize(str(s))
    if remove_stopwords:
        words = list(filter(lambda w: w.lower() not in stop_words, words))
    if lemmatize:
        words = list(map(wnl.lemmatize, words))
    if stem:
        words = list(map(ps.stem, words))
    return " ".join(words)

def apply_text_normalization(df, text_columns, remove_stopwords=False, stem=False, lemmatize=False):
    """Return a copy of ``df`` with ``normalize_text`` applied to the given columns.

    Args:
        df: input frame (not modified).
        text_columns: labels of the columns to normalize.
        remove_stopwords, stem, lemmatize: forwarded to ``normalize_text``.

    Missing cells (None / NaN / pd.NA) are left untouched. Stringifying them
    first would feed the literal text "nan" to the tokenizer and store it as
    real content.
    """
    def _is_missing(value):
        # pd.isna returns an array for list-like cells; treat those as present.
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            return False

    def _normalize_cell(value):
        if _is_missing(value):
            return value
        return normalize_text(value, remove_stopwords, stem, lemmatize)

    df2 = df.copy()
    for c in text_columns:
        df2[c] = df2[c].map(_normalize_cell)
    return df2


In [7]:
# Cell 7
def compute_quality_metrics(df, text_cols=None):
    """Collect basic size, missing-value, duplicate and token statistics.

    When ``text_cols`` is None the first object-dtype column is used; if
    there is none, every cell of the row is joined instead. Token counts are
    whitespace-split word counts of the joined row text.

    Returns a dict with ``num_rows``, ``num_columns``, ``total_missing``,
    ``dup_count`` and ``avg_tokens_per_row`` (0.0 for a frame without rows).
    """
    total_missing = detect_missing(df)["total_missing"]
    dup_count = detect_duplicates(df)["dup_count"]

    if text_cols is None:
        object_columns = [name for name in df.columns if df[name].dtype == object]
        text_cols = object_columns[:1]  # only the first text-like column

    def _row_text(record):
        if text_cols:
            return " ".join([str(record[name]) for name in text_cols])
        return " ".join(map(str, record.to_list()))

    lengths = [len(_row_text(record).split()) for _, record in df.iterrows()]
    average = float(np.mean(lengths)) if lengths else 0.0

    return {
        "num_rows": len(df),
        "num_columns": len(df.columns),
        "total_missing": total_missing,
        "dup_count": dup_count,
        "avg_tokens_per_row": average,
    }

def quality_gate_pass(metrics, thresholds=None):
    """Decide whether a metrics dict satisfies the quality thresholds.

    Args:
        metrics: dict with ``total_missing``, ``num_rows``, ``num_columns``,
            ``dup_count`` and ``avg_tokens_per_row``.
        thresholds: optional overrides for ``max_missing`` (fraction of
            missing cells), ``max_dup_pct`` (fraction of duplicate rows) and
            ``min_avg_tokens``. Keys that are not supplied fall back to the
            defaults (0.2, 0.05, 5), so a partial dict is accepted.

    Returns:
        ``(passed, details)`` where ``passed`` is a Python bool and
        ``details`` holds the computed ``missing_frac``, ``dup_frac`` and
        ``avg_tokens``.
    """
    defaults = {"max_missing": 0.2, "max_dup_pct": 0.05, "min_avg_tokens": 5}
    limits = {**defaults, **(thresholds or {})}

    # max(1, ...) keeps the division defined for an empty frame.
    total_cells = max(1, metrics["num_rows"] * metrics["num_columns"])
    missing_frac = metrics["total_missing"] / total_cells
    dup_frac = metrics["dup_count"] / max(1, metrics["num_rows"])
    avg_tokens = metrics["avg_tokens_per_row"]

    passed = bool(
        missing_frac <= limits["max_missing"]
        and dup_frac <= limits["max_dup_pct"]
        and avg_tokens >= limits["min_avg_tokens"]
    )
    return passed, {"missing_frac": missing_frac, "dup_frac": dup_frac, "avg_tokens": avg_tokens}


In [8]:
# Cell 8
def document_chunks(df):
    """Turn every row into one ``"col: value | col: value"`` document.

    Returns ``(docs, metas)``: the rendered row strings and, in parallel,
    one metadata dict per row whose values have been passed through
    ``sanitize_meta``.
    """
    column_names = df.columns
    docs, metas = [], []
    for _, record in df.iterrows():
        fields = [f"{name}: {record[name]}" for name in column_names]
        docs.append(" | ".join(fields))
        metas.append({key: sanitize_meta(value) for key, value in record.to_dict().items()})
    return docs, metas

def fixed_row_chunks(df):
    """Produce one chunk per row, rendered as ``"col: value"`` pairs joined by ``" | "``.

    Returns ``(texts, metadata)``; ``metadata[i]`` maps each column label of
    row ``i`` to its ``sanitize_meta``-cleaned value.
    """
    texts = []
    metadata = []
    for _, row in df.iterrows():
        parts = []
        for col in df.columns:
            parts.append(f"{col}: {row[col]}")
        texts.append(" | ".join(parts))

        cleaned = {}
        for col, raw in row.to_dict().items():
            cleaned[col] = sanitize_meta(raw)
        metadata.append(cleaned)
    return texts, metadata

def semantic_chunks(df, text_col):
    """Use the stringified ``text_col`` value of each row as its document.

    Returns ``(docs, metas)`` where ``metas[i]`` carries every column of row
    ``i`` after ``sanitize_meta``.
    """
    docs = []
    metas = []
    for _, row in df.iterrows():
        docs.append(str(row[text_col]))
        metas.append(dict((key, sanitize_meta(value)) for key, value in row.to_dict().items()))
    return docs, metas

def recursive_chunks(df, chunk_size=400, overlap=50):
    """Split each row's text into chunks of roughly ``chunk_size`` characters.

    Every row is flattened to one string (cells stringified and joined by
    spaces). If LangChain is available its ``RecursiveCharacterTextSplitter``
    does the splitting and honours ``overlap``. Otherwise a regex fallback is
    used: the text is split on blank lines; paragraphs longer than
    ``chunk_size`` are split into sentences, which are greedily packed into
    chunks. The fallback ignores ``overlap``, and a single sentence longer
    than ``chunk_size`` is emitted as one oversized chunk.

    Returns:
        ``(docs, metas)``; each meta is
        ``{"parent_row": <row position>, "child_index": <chunk position within that row>}``.
        ``child_index`` runs 0, 1, 2, ... per row in both code paths.
    """
    docs = []
    metas = []
    texts = [" ".join(map(str, row.to_list())) for _, row in df.iterrows()]

    if LANGCHAIN_AVAILABLE:
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=overlap,
            separators=["\n\n", "\n", ". ", " ", ""],
        )
        for row_idx, text in enumerate(texts):
            for child_idx, part in enumerate(splitter.split_text(text)):
                docs.append(part)
                metas.append({"parent_row": row_idx, "child_index": child_idx})
        return docs, metas

    for row_idx, text in enumerate(texts):
        # Collect all pieces of the row first, then number them once, so that
        # short paragraphs and sentence-packed chunks never share an index.
        pieces = []
        for para in re.split(r"\n\n+", text):
            if len(para) <= chunk_size:
                pieces.append(para)
                continue
            buf = ""
            for sent in re.split(r'(?<=[.!?])\s+', para):
                # Measure the joined candidate so the separating space counts
                # toward the size limit.
                candidate = (buf + " " + sent).strip()
                if len(candidate) <= chunk_size:
                    buf = candidate
                else:
                    if buf:
                        pieces.append(buf)
                    buf = sent
            if buf:
                pieces.append(buf)
        for child_idx, piece in enumerate(pieces):
            docs.append(piece)
            metas.append({"parent_row": row_idx, "child_index": child_idx})
    return docs, metas

def sanitize_meta(v):
    """Coerce a cell value into something usable as a metadata value.

    - ``None`` is returned as ``None``.
    - numpy scalars (e.g. ``numpy.int64``, ``numpy.bool_``) are unboxed to the
      matching Python value via ``.item()`` so numbers stay numbers instead
      of being stringified.
    - ``str``, ``int``, ``float`` and ``bool`` are returned unchanged.
    - everything else (timestamps, lists, ...) is converted with ``str()``.
    """
    if v is None:
        return None
    if isinstance(v, np.generic):
        v = v.item()
    if isinstance(v, (str, int, float, bool)):
        return v
    return str(v)


In [9]:
# Cell 9
def embed_texts(model_name, texts, batch_size=128):
    """Encode ``texts`` with the named SentenceTransformer model.

    A new ``SentenceTransformer(model_name)`` is constructed on every call
    and ``encode`` is run with a progress bar; its result is returned as-is.
    """
    encoder = SentenceTransformer(model_name)
    return encoder.encode(texts, batch_size=batch_size, show_progress_bar=True)

def store_to_chroma(collection_prefix, model_name, ids, documents, embeddings, metadatas, chroma_path="chromadb_store", batch_size=500):
    """Replace the contents of a persistent Chroma collection with the given records.

    The collection is named ``"<collection_prefix>_<model_name>"`` with ``/``
    and ``:`` in the model name replaced by ``_``. It is created if absent.
    Existing entries are removed before the new ones are added in batches.

    Args:
        collection_prefix: prefix of the collection name.
        model_name: embedding model name, used only to build the collection name.
        ids, documents, embeddings, metadatas: parallel sequences, one entry per record.
        chroma_path: directory of the persistent Chroma store.
        batch_size: number of records per ``add`` call (default 500).

    Returns:
        The Chroma collection object.

    Raises:
        ValueError: if the parallel sequences differ in length or
            ``batch_size`` is not positive.
    """
    total = len(ids)
    for label, seq in (("documents", documents), ("embeddings", embeddings), ("metadatas", metadatas)):
        if len(seq) != total:
            raise ValueError(f"{label} has {len(seq)} entries but {total} ids were given")
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    client = chromadb.PersistentClient(path=chroma_path)
    cname = f"{collection_prefix}_{model_name.replace('/','_').replace(':','_')}"
    col = client.get_or_create_collection(cname)

    # Clearing old entries stays best-effort, but a failure is reported
    # instead of being silently discarded.
    if col.count() > 0:
        try:
            existing = col.get()
            if "ids" in existing and existing["ids"]:
                col.delete(ids=existing["ids"])
        except Exception as exc:
            warnings.warn(f"could not clear existing entries in collection {cname!r}: {exc}", RuntimeWarning)

    for start in range(0, total, batch_size):
        stop = min(start + batch_size, total)
        col.add(
            ids=ids[start:stop],
            documents=documents[start:stop],
            embeddings=embeddings[start:stop],
            metadatas=metadatas[start:stop],
        )
    return col

def retrieve_from_chroma(collection_prefix, model_name, query, k=5, chroma_path="chromadb_store", distance_threshold=0.6):
    """Query a persistent Chroma collection with an embedded text query.

    The collection name is built from ``collection_prefix`` and
    ``model_name`` (``/`` and ``:`` replaced by ``_``). The query is encoded
    with ``SentenceTransformer(model_name)`` and the ``k`` nearest entries
    are requested.

    Returns a dict with ``"found"``:
      - False with ``"reason"`` ``"collection_not_found"``, ``"no_results"``
        or ``"too_far"`` (the smallest distance exceeds
        ``distance_threshold``; ``"min_distance"`` is included), and an
        empty ``"results"`` list;
      - True with ``"results"`` as a list of ``{"doc", "meta", "dist"}``
        dicts for all returned hits.
    """
    store = chromadb.PersistentClient(path=chroma_path)
    collection_name = f"{collection_prefix}_{model_name.replace('/','_').replace(':','_')}"
    try:
        collection = store.get_collection(collection_name)
    except Exception:
        return {"found": False, "reason": "collection_not_found", "results": []}

    encoder = SentenceTransformer(model_name)
    query_vector = encoder.encode([query])
    response = collection.query(
        query_embeddings=query_vector,
        n_results=k,
        include=["documents","metadatas","distances"],
    )
    # Results come back nested per query; only one query was sent.
    documents = response.get("documents", [[]])[0]
    metadatas = response.get("metadatas", [[]])[0]
    distances = response.get("distances", [[]])[0]

    if not documents:
        return {"found": False, "reason": "no_results", "results": []}
    if distances:
        nearest = min(distances)
        if nearest > distance_threshold:
            return {"found": False, "reason": "too_far", "min_distance": nearest, "results": []}

    hits = []
    for text, meta, distance in zip(documents, metadatas, distances):
        hits.append({"doc": text, "meta": meta, "dist": float(distance)})
    return {"found": True, "results": hits}


In [10]:

import pandas as pd
import re
from typing import Tuple, Dict, Any

def preview_random(df, n=5):
    """Return up to ``n`` randomly chosen rows with a fresh 0..k-1 index.

    This definition shadows the ``preview_random`` defined earlier in the
    notebook, so it is kept behaviourally identical to it: the sample is
    seeded (``random_state=42``) so re-running the cell shows the same rows,
    and the index is reset. Frames with fewer than ``n`` rows are returned
    in full (as a shuffled copy).
    """
    sample_size = min(n, len(df))
    return df.sample(n=sample_size, random_state=42).reset_index(drop=True)

def to_lowercase(df):
    """Return a new frame with every ``str`` cell lower-cased.

    Non-string cells (numbers, missing values, ...) are passed through
    unchanged. ``DataFrame.map`` is used where pandas provides it, since
    ``DataFrame.applymap`` is deprecated in its favour; older pandas versions
    fall back to ``applymap``.
    """
    def _lower(value):
        return value.lower() if isinstance(value, str) else value

    # Check the class, not the instance: on old pandas ``df.map`` could
    # resolve to a column that happens to be named "map".
    if hasattr(pd.DataFrame, "map"):
        return df.map(_lower)
    return df.applymap(_lower)

def remove_html_tags(df, columns=None):
    """Return a copy of ``df`` with ``<...>`` tags removed from the given columns.

    Args:
        df: input frame. It is not modified; the result is a copy.
        columns: labels of columns to clean. Labels not present in ``df`` are
            skipped. With None (or an empty list) nothing is cleaned.

    Only ``str`` cells are rewritten (every non-greedy ``<.*?>`` match is
    deleted). Other cells, including missing values, are kept as they are
    rather than being turned into text such as "nan".
    """
    html_pattern = re.compile(r'<.*?>')
    cleaned = df.copy()
    for col in columns or []:
        if col in cleaned.columns:
            cleaned[col] = cleaned[col].map(
                lambda x: html_pattern.sub('', x) if isinstance(x, str) else x
            )
    return cleaned

def handle_missing_values(df: pd.DataFrame, strategy: str, custom_value: Any = None) -> pd.DataFrame:
    """Handle missing values according to ``strategy``.

    Args:
        df: input frame.
        strategy: ``'drop'`` removes rows containing missing cells,
            ``'fill'`` replaces missing cells, anything else returns ``df``
            unchanged.
        custom_value: replacement used by ``'fill'``. When left as None the
            placeholder ``"unknown"`` is used (the same default as
            ``handle_missing``), because ``fillna(None)`` raises in pandas.

    Returns:
        The resulting DataFrame.
    """
    if strategy == 'drop':
        return df.dropna()
    if strategy == 'fill':
        replacement = "unknown" if custom_value is None else custom_value
        return df.fillna(replacement)
    return df

def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` with repeated rows removed, keeping each first occurrence."""
    unique_rows = df.drop_duplicates()
    return unique_rows

def process_text(df: pd.DataFrame, text_columns: list, stemming: bool, lemmatization: bool, stopword_removal: bool) -> pd.DataFrame:
    """Apply the requested text normalization steps to ``text_columns``.

    The work is delegated to ``apply_text_normalization`` (defined earlier in
    this notebook). Columns named in ``text_columns`` that do not exist in
    ``df`` are skipped.

    Args:
        df: input frame.
        text_columns: labels of the text columns to process.
        stemming: apply stemming.
        lemmatization: apply lemmatization.
        stopword_removal: drop English stop words.

    Returns:
        ``df`` itself when no step is enabled or no listed column exists,
        otherwise a processed copy.
    """
    if not (stemming or lemmatization or stopword_removal):
        return df
    present = [col for col in (text_columns or []) if col in df.columns]
    if not present:
        return df
    return apply_text_normalization(
        df,
        present,
        remove_stopwords=stopword_removal,
        stem=stemming,
        lemmatize=lemmatization,
    )

def quality_gate(df: pd.DataFrame) -> bool:
    """Return True when ``df`` contains no missing values at all.

    The result is a plain Python ``bool`` (as the annotation promises) rather
    than the ``numpy.bool_`` produced by comparing a numpy integer.
    """
    return not bool(df.isnull().to_numpy().any())

def apply_chunking(df: pd.DataFrame, strategy: str, chunk_size: int = None, overlap: int = 0) -> list:
    """Split a frame into chunks according to ``strategy``.

    Strategies:
        ``'fixed'``     - the frame rendered with ``to_string(index=False)``
                          is cut into consecutive ``chunk_size``-character
                          slices.
        ``'recursive'`` - same text, but windows of ``chunk_size`` characters
                          that advance by ``chunk_size - overlap``.
        ``'document'``  - one dict per row (``to_dict(orient="records")``);
                          works for empty frames and keeps each column's own
                          value type.
        anything else   - an empty list.

    Args:
        df: input frame.
        strategy: one of the names above.
        chunk_size: window length in characters; required and positive for
            ``'fixed'`` and ``'recursive'``.
        overlap: characters shared by consecutive ``'recursive'`` windows;
            must satisfy ``0 <= overlap < chunk_size`` (otherwise the window
            would never advance).

    Raises:
        ValueError: on a missing/non-positive ``chunk_size`` or an invalid
            ``overlap`` for the text-based strategies.
    """
    if strategy == 'document':
        return df.to_dict(orient="records")
    if strategy not in ('fixed', 'recursive'):
        return []

    if chunk_size is None or chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer for 'fixed' and 'recursive' chunking")

    step = chunk_size
    if strategy == 'recursive':
        if overlap < 0 or overlap >= chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
        step = chunk_size - overlap

    text_data = df.to_string(index=False)
    return [text_data[start:start + chunk_size] for start in range(0, len(text_data), step)]

def apply_embedding(chunks: list, strategy: str) -> list:
    """Placeholder embedding step: returns a new list holding the same chunks.

    ``strategy`` is accepted but not used; no embedding is computed.
    """
    return [chunk for chunk in chunks]

def store_in_chromadb(embedded_chunks: list) -> None:
    """Placeholder storage step: does nothing with ``embedded_chunks`` and returns None."""
    return None
