In [1]:
%%capture
!pip install pandas

# Dataset normalization

The goal of this notebook is to clean up the data by post-processing the raw output received from the LLMs.


In [5]:
from os import listdir
from os.path import exists, isdir, join
from typing import List, Optional

# Root folder that holds one sub-directory per model.
resources_path: str = join("..", "resources")

# Every sub-directory of the resources folder is treated as a model name.
MODELS: List[str] = [d for d in listdir(resources_path) if isdir(join(resources_path, d))]

# Two result files are expected for each model: the "multicall" one and the plain one.
DATASET_PATHS: List[str] = []
for model in MODELS:
    DATASET_PATHS.append(f"{resources_path}/{model}/sampled_reviews_with_output_multicall_{model}.csv")
    DATASET_PATHS.append(f"{resources_path}/{model}/sampled_reviews_with_output_{model}.csv")

# Fail early, and name the offending files so the problem can be fixed without guessing.
missing_paths: List[str] = [path for path in DATASET_PATHS if not exists(path)]
assert not missing_paths, f"All datasets must exist! Please, check: {missing_paths}"

DATASET_PATHS

['../resources/gemma2_9b/sampled_reviews_with_output_multicall_gemma2_9b.csv',
 '../resources/gemma2_9b/sampled_reviews_with_output_gemma2_9b.csv',
 '../resources/qwen2_7b/sampled_reviews_with_output_multicall_qwen2_7b.csv',
 '../resources/qwen2_7b/sampled_reviews_with_output_qwen2_7b.csv',
 '../resources/llama3.1/sampled_reviews_with_output_multicall_llama3.1.csv',
 '../resources/llama3.1/sampled_reviews_with_output_llama3.1.csv',
 '../resources/phi3_medium/sampled_reviews_with_output_multicall_phi3_medium.csv',
 '../resources/phi3_medium/sampled_reviews_with_output_phi3_medium.csv',
 '../resources/mistral_7b/sampled_reviews_with_output_multicall_mistral_7b.csv',
 '../resources/mistral_7b/sampled_reviews_with_output_mistral_7b.csv']

In [6]:
import pandas as pd

# Load the sampled dataset; only its "progressive_index" column is used in this cell.
sampled: pd.DataFrame = pd.read_csv("../resources/IMDB Dataset Sampled.csv")
# Reference list of progressive indexes that check_for_no_case_missing compares each dataset against.
p_indexes: List[int] = sampled["progressive_index"].tolist()


def check_for_no_case_missing(
    dataframe_: pd.DataFrame,
    expected_count: int = 1000,
    expected_indexes: Optional[List[int]] = None,
    placeholder: str = "$$$",
) -> None:
    """Validate that a model-output dataset is complete.

    Three checks are performed, each raising ``AssertionError`` on failure:

    1. the dataset holds exactly ``expected_count`` rows;
    2. every ``progressive_index`` value of the dataset belongs to
       ``expected_indexes``;
    3. no row has ``output`` equal to ``placeholder``.

    :param dataframe_: dataset to validate; must expose the ``progressive_index``
        and ``output`` columns.
    :param expected_count: number of rows the dataset must contain.
    :param expected_indexes: allowed progressive indexes; when ``None`` the
        notebook-level ``p_indexes`` list is used.
    :param placeholder: value of ``output`` that is rejected by the third check
        (presumably the marker for a not-yet-processed row; confirm against the producer).
    :raises AssertionError: when any of the checks above fails.
    """
    if expected_indexes is None:
        # Fall back to the indexes loaded from the sampled dataset at notebook level.
        expected_indexes = p_indexes

    total_cases_count = len(dataframe_)
    assert total_cases_count == expected_count, (
        f"There must be {expected_count} cases, but {total_cases_count} found"
    )

    # Rows whose index is NOT part of the reference set.
    # NOTE(review): duplicated indexes are not detected here, so a reference index
    # could still be absent while both this check and the row count pass.
    unexpected_rows: pd.DataFrame = dataframe_[~dataframe_.progressive_index.isin(expected_indexes)]
    unexpected_rows_count = len(unexpected_rows)
    assert unexpected_rows_count == 0, (
        f"There are rows with unexpected progressive indexes: {unexpected_rows_count}"
    )

    not_processed: pd.DataFrame = dataframe_[dataframe_.output == placeholder]
    not_processed_count = len(not_processed)
    assert not_processed_count == 0, f"There are missing outputs: {not_processed_count}"

In [7]:
ALLOWED_COLUMNS: List[str] = [
    "review",
    "sentiment",
    "entities",
    "json",
    "progressive_index",
    "output",
]


def drop_unnecessary_columns(dataframe_: pd.DataFrame) -> pd.DataFrame:
    """Remove, in place, every column that is not listed in ``ALLOWED_COLUMNS``.

    :param dataframe_: frame to prune; it is modified in place.
    :return: the very same frame object, to allow call chaining.
    """
    keep = frozenset(ALLOWED_COLUMNS)
    extra_columns: List[str] = []
    for column_name in dataframe_.columns:
        if column_name not in keep:
            extra_columns.append(column_name)

    # In-place on purpose: callers may rely on the mutation and ignore the return value.
    dataframe_.drop(columns=extra_columns, inplace=True)
    return dataframe_

In [12]:
import re


def extract_json_output(row: pd.Series) -> pd.Series:
    """Extract the JSON object embedded in the ``output`` field of a row.

    ``//`` comments that start a line or follow whitespace are stripped first;
    then the span going from the first ``{`` to the last ``}`` is stored in the
    new ``json_output`` field. When no such span exists a message is printed and
    ``json_output`` is set to ``None``.

    :param row: a single dataset row (as passed by ``DataFrame.apply(axis=1)``)
        exposing the ``output`` field; it is modified in place.
    :return: the same row, enriched with the ``json_output`` field.
    """
    output = row["output"]
    if not isinstance(output, str):
        # An empty CSV cell is read back as NaN (a float) and would make ``re.sub``
        # raise TypeError: treat it as an empty answer so it is reported below
        # like any other output without JSON.
        output = "" if pd.isna(output) else str(output)

    # Removing comments in form of //
    output = re.sub(pattern=r"(?<!\S)//.*?$", flags=re.MULTILINE, string=output, repl="")
    # Extracting JSON-only (the greedy pattern spans from the first "{" to the last "}")
    groups = re.findall(pattern=r"(\{.*})", flags=re.DOTALL, string=output)
    groups_count = len(groups)
    if groups_count != 1:
        print(f"There must be exactly one group, but {groups_count} found: {output[:20]} [...]")
        row["json_output"] = None
        return row

    row["json_output"] = groups[0]
    return row


In [13]:
# Validate, prune and enrich every dataset.
# NOTE: each CSV file is overwritten with its normalized version.
for dataset_path in DATASET_PATHS:
    dataframe_: pd.DataFrame = pd.read_csv(dataset_path)
    check_for_no_case_missing(dataframe_)
    # The helper prunes the frame in place and hands the same object back.
    dataframe_ = drop_unnecessary_columns(dataframe_)
    # Row-wise extraction: each row gains a "json_output" field.
    dataframe: pd.DataFrame = dataframe_.apply(extract_json_output, axis=1)
    dataframe.to_csv(dataset_path, index=False)


There must be exactly one group, but 0 found: This is a great anal [...]
There must be exactly one group, but 0 found: This text appears to [...]
There must be exactly one group, but 0 found: Based on the descrip [...]
There must be exactly one group, but 0 found: Output: Case of diss [...]
There must be exactly one group, but 0 found: Based on your provid [...]
There must be exactly one group, but 0 found: Here's the analysis  [...]
There must be exactly one group, but 0 found:  Based on the review [...]
There must be exactly one group, but 0 found:  ```json
{
  "sentim [...]
There must be exactly one group, but 0 found:  ```json
{
  "sentim [...]
There must be exactly one group, but 0 found:  ```json
{
  "sentim [...]
There must be exactly one group, but 0 found:  ```json
{
  "sentim [...]
There must be exactly one group, but 0 found:  ```json
{
  "sentim [...]
There must be exactly one group, but 0 found:  Given that there ar [...]
There must be exactly one group, but 0 found:  ```j