In [4]:
import pandas as pd

# Read the three raw CSV exports in one pass; the order of the paths below
# matches the order of the names on the left-hand side.
df_dolly, df_opus, df_squad = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../datasets/raw/dolly15k.csv",
        "../datasets/raw/opus100-en-es.csv",
        "../datasets/raw/squad_validation.csv",
    )
)

In [5]:
import textclean

In [6]:
# One entry per dataset: the loaded frame, a short name used as the key in
# `results`, and the text columns handed to the cleaner.
data_configs = [
    # took 4 min 2.5 sec to run
    dict(
        data=df_opus,
        name="opus",
        text_columns=["en", "es"],
    ),
    # below together took 177 min!
    dict(
        data=df_dolly,
        name="dolly",
        text_columns=["instruction", "context", "response"],
    ),
    dict(
        data=df_squad,
        name="squad",
        text_columns=["context", "question", "answers[0]"],
    ),
]

# Embedding models passed to every TextCleaner run.
model_names = [
    "all-mpnet-base-v2",  # best performing on leaderboard
    "all-MiniLM-L6-v2",  # smaller and faster
    "distiluse-base-multilingual-cased-v1",  # for multilingual use
]

# Maps dataset name -> whatever TextCleaner.process() returns
# (presumably a DataFrame, since later cells call .to_parquet on it -- confirm).
results = {}

for config in data_configs:
    dataset_name = config["name"]
    print("#######", dataset_name)
    cleaner = textclean.TextCleaner(
        config["data"], config["text_columns"], model_names
    )
    results[dataset_name] = cleaner.process()

####### dolly
extracting metadata for instruction
calculating embeddings for instruction with all-mpnet-base-v2
Created embedding of shape (15011, 768) with all-mpnet-base-v2
calculating embeddings for instruction with all-MiniLM-L6-v2
Created embedding of shape (15011, 384) with all-MiniLM-L6-v2
calculating embeddings for instruction with distiluse-base-multilingual-cased-v1
Created embedding of shape (15011, 512) with distiluse-base-multilingual-cased-v1
extracting metadata for context
calculating embeddings for context with all-mpnet-base-v2
Created embedding of shape (4466, 768) with all-mpnet-base-v2
calculating embeddings for context with all-MiniLM-L6-v2
Created embedding of shape (4466, 384) with all-MiniLM-L6-v2
calculating embeddings for context with distiluse-base-multilingual-cased-v1
Created embedding of shape (4466, 512) with distiluse-base-multilingual-cased-v1
extracting metadata for response
calculating embeddings for response with all-mpnet-base-v2
Created embedding o

In [7]:
# Pull each processed dataset out of `results` under a consistently spelled name.
processed_opus = results["opus"]
processed_dolly = results["dolly"]
processed_squad = results["squad"]

# Backward-compatible alias: the original variable was misspelled
# ("procesed_opus") and other cells still refer to it by that name.
procesed_opus = processed_opus

In [8]:
# Write each processed frame to its parquet file, leaving the index out.
parquet_targets = [
    (procesed_opus, "../datasets/processed/opus100-en-es.parquet"),
    (processed_dolly, "../datasets/processed/dolly15k.parquet"),
    (processed_squad, "../datasets/processed/squad_validation.parquet"),
]
for frame, destination in parquet_targets:
    frame.to_parquet(destination, index=False)

In [19]:
# Show every column of the processed SQuAD frame, one name per line.
for column_name in processed_squad.columns.tolist():
    print(column_name)

title
context
question
answers[0]
context_text_length
context_num_words
context_max_word_length
context_avg_word_length
context_perc_special_chars
context_dist_from_mean_embed_all-mpnet-base-v2
context_outlier_score_ECOD_all-mpnet-base-v2
context_outlier_score_IForest_all-mpnet-base-v2
context_dist_from_mean_embed_all-MiniLM-L6-v2
context_outlier_score_ECOD_all-MiniLM-L6-v2
context_outlier_score_IForest_all-MiniLM-L6-v2
context_dist_from_mean_embed_distiluse-base-multilingual-cased-v1
context_outlier_score_ECOD_distiluse-base-multilingual-cased-v1
context_outlier_score_IForest_distiluse-base-multilingual-cased-v1
question_text_length
question_num_words
question_max_word_length
question_avg_word_length
question_perc_special_chars
question_dist_from_mean_embed_all-mpnet-base-v2
question_outlier_score_ECOD_all-mpnet-base-v2
question_outlier_score_IForest_all-mpnet-base-v2
question_dist_from_mean_embed_all-MiniLM-L6-v2
question_outlier_score_ECOD_all-MiniLM-L6-v2
question_outlier_score_IFo