<h1>Notebook to build a new database of AI-generated texts, since I found out that LaMini Instruction contains human-generated prompts/answers :clown:</h1>

In [None]:
import pandas as pd
from datasets import load_dataset, load_from_disk
import numpy as np
import seaborn as sns
import jieba

In [None]:
# Hugging Face dataset id -> name of the column that holds the answer text.
# Insertion order is kept because later cells iterate this mapping.
english_datasets = {
    "nomic-ai/gpt4all-j-prompt-generations": "response",
    "sahil2801/CodeAlpaca-20k": "output",
    "dkoterwa/ai_society_instructions": "output",
    "dkoterwa/camel_ai_biology_instruction_dataset": "response",
    "dkoterwa/camel_ai_physics_instruction_dataset": "response",
    "dkoterwa/camel_ai_chemistry_instruction_dataset": "response",
    "dkoterwa/camel_ai_maths_instruction_dataset": "response",
    "vicgalle/alpaca-gpt4": "output",
}

In [None]:
# Per-dataset summary of answer lengths, measured as the number of pieces
# produced by splitting on single spaces, restricted to answers of at
# least 20 such words.
for name, answer_column in english_datasets.items():
    print(name)
    df = load_dataset(name, split="train", download_mode="force_redownload").to_pandas()
    # Count words once; the same column drives the filter and every statistic.
    df["answer_len"] = [len(answer.split(" ")) for answer in df[answer_column]]
    df = df[df["answer_len"] >= 20].reset_index(drop=True)
    num_samples = len(df)
    if num_samples == 0:
        # Nothing survived the filter: the mean would divide by zero and
        # max()/min() would fail on an empty sequence.
        print("  no answers with at least 20 words")
        continue
    lengths = df["answer_len"].to_numpy()
    avg_text_length = lengths.mean()
    max_text_length = lengths.max()
    min_text_length = lengths.min()
    std_text_length = np.std(lengths)  # population std (ddof=0)
    median_text_length = np.median(lengths)
    # Report the figures; previously they were computed and then discarded.
    print(
        f"  samples={num_samples}"
        f" avg={avg_text_length:.2f}"
        f" median={median_text_length:.1f}"
        f" std={std_text_length:.2f}"
        f" min={min_text_length}"
        f" max={max_text_length}"
    )

In [None]:
# Gather the answer column of every English dataset into one flat list and
# keep a running count of the rows that were loaded.
texts = []
n_of_observations = 0
for dataset_name, column in english_datasets.items():
    print(dataset_name)
    dataset = load_dataset(dataset_name, split="train", download_mode="force_redownload")
    df = dataset.to_pandas()
    texts += df[column].tolist()
    n_of_observations += len(df)

In [None]:
# Train split of LaMini-instruction, converted to a pandas DataFrame.
lamini_train = load_dataset("MBZUAI/LaMini-instruction", split="train")
lamini = lamini_train.to_pandas()

In [None]:
# Keep the LaMini rows whose "instruction_source" contains "generated" or
# "self_instruct" (presumably the model-generated subset; verify against the
# dataset card), then add their responses to the pool.
source = lamini["instruction_source"]
is_generated = source.str.contains("generated")
is_self_instruct = source.str.contains("self_instruct")
generated_texts = lamini[is_generated | is_self_instruct]
n_of_observations += len(generated_texts)
texts += generated_texts["response"].tolist()

In [None]:
# Sanity check: the running row count must equal the number of collected texts.
assert len(texts) == n_of_observations
database_en = pd.DataFrame({"text": texts, "lang": "en"})
# Word count = number of pieces after splitting on single spaces.
database_en["answer_len"] = database_en["text"].map(lambda text: len(text.split(" ")))
# Drop answers shorter than 20 words.
long_enough = database_en["answer_len"] >= 20
database_en = database_en[long_enough]

In [None]:
# Hugging Face dataset id -> language code for the non-English sources.
other_datasets = {
    "dkoterwa/alpaca_gpt_4_ar": "ar",
    "dkoterwa/alpaca_gpt_4_zh": "zh",
    "dkoterwa/alpaca_gpt_4_es": "es",
    "dkoterwa/alpaca_gpt_4_de": "de",
    "5CD-AI/Vietnamese-c-s-ale-alpaca-gpt4-data-gg-translated": "vi",
}
dfs = []
n_of_observations = 0
for dataset, lang in other_datasets.items():
    print(dataset)
    df = load_dataset(dataset, split="train", download_mode="force_redownload").to_pandas()
    n_of_observations += len(df)
    # The Vietnamese dataset reads its text from "output_vi"; the others use "answer".
    text_column = "output_vi" if lang == "vi" else "answer"
    dfs.append(pd.DataFrame({"text": df[text_column], "lang": lang}))

In [None]:
# Stack the per-language frames. ignore_index=True gives the result a fresh
# 0..n-1 index instead of repeating each source frame's own labels.
database_other_languages = pd.concat(dfs, ignore_index=True)
# Sanity check: no rows were lost or duplicated while stacking.
assert n_of_observations == len(database_other_languages)
# Length in words: Chinese text is segmented with jieba, every other language
# is split on single spaces. Iterating the two columns with zip avoids building
# a Series per row, which is what iterrows() does.
database_other_languages["answer_len"] = [
    len(jieba.lcut(text)) if lang == "zh" else len(text.split(" "))
    for text, lang in zip(
        database_other_languages["text"], database_other_languages["lang"]
    )
]
# Drop answers shorter than 20 words.
database_other_languages = database_other_languages[database_other_languages["answer_len"] >= 20]

In [None]:
# Renumber both frames from 0 after the length filter removed rows.
database_en = database_en.reset_index(drop=True)
database_other_languages = database_other_languages.reset_index(drop=True)

In [None]:
# Merge the English and non-English frames under one fresh 0..n-1 index.
full_database = pd.concat([database_en, database_other_languages], ignore_index=True)

In [None]:
# Display (rows, columns) of the combined database.
full_database.shape

In [None]:
# Display the number of rows per language code.
full_database["lang"].value_counts()

In [None]:
# Persist the combined database as a pickle file.
# NOTE(review): assumes the ../data directory already exists relative to the
# notebook's working directory - confirm before running.
full_database.to_pickle("../data/database.pkl")