In [1]:
import pandas as pd
from google.cloud import translate_v2 as translate
import warnings
import json
from tqdm import tqdm
import numpy as np
import os

# Point the Google client libraries at the notebook-local service-account key,
# but only when the variable is not already set: credentials configured outside
# the notebook (shell, CI, container) take precedence instead of being clobbered.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "valued-context-412119-a7e6fbab1b7f.json")

# NOTE(review): this hides every warning category for the whole kernel session,
# including deprecation notices from pandas and the Google client.
warnings.filterwarnings("ignore")

In [2]:
# Locations of the SubtaskA train/dev files for the monolingual and multilingual
# tracks, given relative to the notebook's working directory.
mono_train_path = "./SubtaskA/subtaskA_train_monolingual.jsonl"
mono_val_path = "./SubtaskA/subtaskA_dev_monolingual.jsonl"
multi_train_path = "./SubtaskA/subtaskA_train_multilingual.jsonl"
multi_val_path = "./SubtaskA/subtaskA_dev_multilingual.jsonl"

In [3]:
def read_jsonl(path):
    """Load a JSON Lines file into a list of decoded records.

    The file is always decoded as UTF-8 rather than with the platform's
    default encoding, so non-ASCII text (e.g. Cyrillic, Chinese, Urdu) is
    read identically on every operating system. Blank lines, such as a
    trailing empty line at the end of the file, are skipped instead of
    raising ``json.JSONDecodeError``.

    Args:
        path: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate empty lines between or after records.
                continue
            records.append(json.loads(line))
    return records


def get_pandas_dfs(train_path, val_path, train_sample=None, val_sample=None):
    """Read the train and validation JSONL files into labelled DataFrames.

    Each frame keeps only the ``text``, ``label`` and ``source`` columns, and
    the ``label`` column is rewritten to ``"LLM"`` where the raw value equals
    1 and to ``"human"`` otherwise.

    Args:
        train_path: Path of the training JSONL file.
        val_path: Path of the validation JSONL file.
        train_sample: If truthy, number of training rows to draw
            (``random_state=42``); the index is reset after sampling.
        val_sample: Same as ``train_sample`` for the validation frame.

    Returns:
        A ``(train_df, val_df)`` tuple.
    """
    wanted_columns = ["text", "label", "source"]
    label_names = {True: "LLM", False: "human"}

    def as_frame(records):
        # Column subset first, then replace the raw label by its name.
        frame = pd.DataFrame(records).loc[:, wanted_columns]
        return frame.assign(label=frame["label"].eq(1).map(label_names))

    def maybe_sample(frame, sample_size):
        if not sample_size:
            return frame
        return frame.sample(n=sample_size, random_state=42).reset_index(drop=True)

    # Both files are read before either frame is built.
    train_records = read_jsonl(train_path)
    val_records = read_jsonl(val_path)

    train_df = maybe_sample(as_frame(train_records), train_sample)
    val_df = maybe_sample(as_frame(val_records), val_sample)
    return train_df, val_df


# Build the train/dev frames for both tracks from the full files (no subsampling).
mono_train_df, mono_val_df = get_pandas_dfs(mono_train_path, mono_val_path)
multi_train_df, multi_val_df = get_pandas_dfs(multi_train_path, multi_val_path)

In [4]:
# NOTE(review): both statements bind a name to itself and therefore do nothing.
# Presumably left over from a temporary row slice used while debugging — confirm
# and delete the cell.
mono_train_df = mono_train_df
multi_train_df = multi_train_df

In [5]:
print(mono_train_df.head())

                                                text label   source
0  Forza Motorsport is a popular racing game that...   LLM  wikihow
1  Buying Virtual Console games for your Nintendo...   LLM  wikihow
2  Windows NT 4.0 was a popular operating system ...   LLM  wikihow
3  How to Make Perfume\n\nPerfume is a great way ...   LLM  wikihow
4  How to Convert Song Lyrics to a Song'\n\nConve...   LLM  wikihow


In [6]:
print(multi_train_df.head())

                                                text label   source
0  Forza Motorsport is a popular racing game that...   LLM  wikihow
1  Buying Virtual Console games for your Nintendo...   LLM  wikihow
2  Windows NT 4.0 was a popular operating system ...   LLM  wikihow
3  How to Make Perfume\n\nPerfume is a great way ...   LLM  wikihow
4  How to Convert Song Lyrics to a Song'\n\nConve...   LLM  wikihow


In [7]:
print(mono_train_df.shape)

(119757, 3)


In [8]:
print(multi_train_df.shape)

(172417, 3)


# Data Augmentation

In [9]:
# Module-level Translation client shared by translate_text.
# NOTE(review): presumably authenticates through GOOGLE_APPLICATION_CREDENTIALS,
# which the first cell sets — confirm.
translate_client = translate.Client()

In [10]:
def translate_text(text, source_language, target_language):
    """Translate ``text`` with the shared ``translate_client``.

    The request is sent with ``format_="text"``. Without it the Translation
    v2 API treats the input as HTML and returns HTML-escaped output (for
    example ``&#39;`` instead of an apostrophe), which would end up verbatim
    in the augmented dataset.

    Args:
        text: The string to translate.
        source_language: Language code of ``text`` (e.g. ``'en'``).
        target_language: Language code to translate into (e.g. ``'fr'``).

    Returns:
        The ``'translatedText'`` field of the API response.
    """
    result = translate_client.translate(
        text,
        source_language=source_language,
        target_language=target_language,
        format_="text",  # plain text in, plain text out (no HTML entities)
    )
    return result['translatedText']

In [11]:
def backtranslate_text(text, original_language, target_language):
    """Round-trip ``text`` through ``target_language`` and back.

    The text is first translated from ``original_language`` into
    ``target_language``; that result is then translated back into
    ``original_language`` and returned.
    """
    hops = (
        (original_language, target_language),  # forward
        (target_language, original_language),  # and back again
    )
    current = text
    for hop_source, hop_target in hops:
        current = translate_text(current, source_language=hop_source, target_language=hop_target)
    return current

In [12]:
def filter_dataframe(df, max_chars, valid_sources, valid_labels):
    """Return the rows of ``df`` that pass all three filters.

    A row is kept when its ``text`` has at most ``max_chars`` characters, its
    ``source`` is one of ``valid_sources`` and its ``label`` is one of
    ``valid_labels``. The original index is preserved.
    """
    short_enough = df['text'].str.len() <= max_chars
    source_allowed = df['source'].isin(valid_sources)
    label_allowed = df['label'].isin(valid_labels)

    keep = short_enough & source_allowed & label_allowed
    return df[keep]

In [13]:
def translate_and_save(df, target_language, output_file, source_language='en'):
    """Translate the ``text`` column of ``df`` and write the result as JSONL.

    Every row's ``text`` is translated with ``translate_text`` (one call per
    row); ``label`` and ``source`` are copied unchanged. The translated
    records are written to ``output_file`` in JSON Lines format.

    Args:
        df: DataFrame with ``text``, ``label`` and ``source`` columns.
        target_language: Language code to translate into.
        output_file: Path of the JSONL file to write.
        source_language: Language code of the input texts. Defaults to
            ``'en'``, the value that was previously hard-coded, so existing
            calls behave as before.

    Returns:
        None. The only result is the written file.
    """
    records = []

    # Iterate the three needed columns directly instead of building a Series
    # per row with iterrows().
    rows = zip(df['text'], df['label'], df['source'])
    for text, label, source in tqdm(rows, total=df.shape[0], desc=f"Translating to {target_language}"):
        records.append({
            'text': translate_text(text, source_language, target_language),
            'label': label,
            'source': source
        })

    translated_df = pd.DataFrame(records)
    translated_df.to_json(output_file, orient='records', lines=True)

In [14]:
def backtranslate_and_save(df, source_language, target_language, output_file):
    """Backtranslate the ``text`` column of ``df`` and write the result as JSONL.

    Each row's ``text`` is passed through ``backtranslate_text`` (from
    ``source_language`` to ``target_language`` and back); ``label`` and
    ``source`` are carried over unchanged. The records are written to
    ``output_file`` in JSON Lines format. Nothing is returned.
    """
    progress = tqdm(
        df.iterrows(),
        total=len(df),
        desc=f"Backtranslating from {source_language} to {target_language}",
    )

    records = [
        {
            'text': backtranslate_text(
                row['text'],
                original_language=source_language,
                target_language=target_language,
            ),
            'label': row['label'],
            'source': row['source'],
        }
        for _, row in progress
    ]

    pd.DataFrame(records).to_json(output_file, orient='records', lines=True)

# 1. Monolingual task

In [15]:
# Monolingual pool: texts of at most m_chars characters from the listed sources,
# split into one frame per label.
m_chars = 1250
v_sources = ['wikihow', 'wikipedia', 'reddit', 'arxiv', 'peerread']

filtered_mono_train_df_llm, filtered_mono_train_df_hum = (
    filter_dataframe(mono_train_df, m_chars, v_sources, [label_name])
    for label_name in ('LLM', 'human')
)

# Show size and first rows of each subset, LLM first.
for _subset in (filtered_mono_train_df_llm, filtered_mono_train_df_hum):
    print(_subset.shape)
    print(_subset.head())

(14771, 3)
                                                   text label   source
1639  1. Visit The TrafficWave website and sign up f...   LLM  wikihow
1666  Unfortunately, as an AI language model, I cann...   LLM  wikihow
1813  In this article, we learned about deriving a m...   LLM  wikihow
2004  \n\nHow to Convert Song Lyrics to a Song\nTher...   LLM  wikihow
2005  \n\nA broken window in a wooden frame can be f...   LLM  wikihow
(14371, 3)
                                                    text  label   source
56419   If you just can't, reduce meat from your meal...  human  wikihow
56434   You should only harvest the honey if the bees...  human  wikihow
56455   It is also known as the horse's "blind side"....  human  wikihow
56499   This might give him the initiative to talk to...  human  wikihow
56512   Although a smile may not be much, just making...  human  wikihow


In [16]:
# Draw three balanced subsets (8000 LLM + 8000 human rows each), one per pivot
# language used in the backtranslation cells below.
# Every draw is seeded so that a kernel restart reproduces the same rows. The
# seed differs per set so the three subsets are not identical copies; they are
# still drawn independently from the same pool and can share rows.
set1_llm = filtered_mono_train_df_llm.sample(n=8000, random_state=42)
set1_hum = filtered_mono_train_df_hum.sample(n=8000, random_state=42)
set2_llm = filtered_mono_train_df_llm.sample(n=8000, random_state=43)
set2_hum = filtered_mono_train_df_hum.sample(n=8000, random_state=43)
set3_llm = filtered_mono_train_df_llm.sample(n=8000, random_state=44)
set3_hum = filtered_mono_train_df_hum.sample(n=8000, random_state=44)

In [17]:
# Stack each LLM sample on top of its human counterpart (original indices kept).
set1, set2, set3 = (
    pd.concat([llm_part, hum_part])
    for llm_part, hum_part in (
        (set1_llm, set1_hum),
        (set2_llm, set2_hum),
        (set3_llm, set3_hum),
    )
)

In [18]:
print(set1.shape)
print(set2.shape)
print(set3.shape)

(16000, 3)
(16000, 3)
(16000, 3)


In [19]:
# Round-trips every row of set1 through French (two translate calls per row) and
# writes mono_addon1.jsonl. The recorded run took about 1h26m for 16000 rows.
backtranslate_and_save(set1, source_language='en', target_language='fr', output_file='mono_addon1.jsonl')

Backtranslating from en to fr: 100%|██████████| 16000/16000 [1:25:44<00:00,  3.11it/s]


In [20]:
# Round-trips every row of set2 through Russian (two translate calls per row) and
# writes mono_addon2.jsonl. The recorded run took about 1h28m for 16000 rows.
backtranslate_and_save(set2, source_language='en', target_language='ru', output_file='mono_addon2.jsonl')

Backtranslating from en to ru: 100%|██████████| 16000/16000 [1:27:42<00:00,  3.04it/s]


In [19]:
# Round-trips every row of set3 through Japanese (two translate calls per row) and
# writes mono_addon3.jsonl. The recorded run took about 1h32m for 16000 rows.
backtranslate_and_save(set3, source_language='en', target_language='ja', output_file='mono_addon3.jsonl')

Backtranslating from en to ja: 100%|██████████| 16000/16000 [1:32:21<00:00,  2.89it/s]  


# 2. Multilingual task

In [15]:
# Multilingual pool: texts of at most m_chars characters from the listed sources,
# split into one frame per label. m_chars and v_sources are rebound here.
m_chars = 1500
v_sources = ['wikihow', 'wikipedia', 'reddit', 'arxiv', 'peerread']

filtered_multi_train_df_llm, filtered_multi_train_df_hum = (
    filter_dataframe(multi_train_df, m_chars, v_sources, [label_name])
    for label_name in ('LLM', 'human')
)

# Show size and first rows of each subset, LLM first.
for _subset in (filtered_multi_train_df_llm, filtered_multi_train_df_hum):
    print(_subset.shape)
    print(_subset.head())

(35942, 3)
                                                   text label   source
89    How to Own a Rifle: A Guide to Being a Respons...   LLM  wikihow
320   Creating a Cloudy Moon or Planet with Excel ca...   LLM  wikihow
987   Constructing multiple squares of odd order can...   LLM  wikihow
1154  Are you a fan of racing games? Do you want to ...   LLM  wikihow
1245  Creating a template for Photobie Design Studio...   LLM  wikihow
(24744, 3)
                                                   text  label   source
2001   They are about $20 a card. Or, if you want to...  human  wikihow
2007  ;\n, Place a level on the surface of the groun...  human  wikihow
2017   Do not attempt unless you have a complete und...  human  wikihow
2026  ;\n, List all possible combinations as well.\n...  human  wikihow
2037  ;\n, Be sure that you remove all damaged drywa...  human  wikihow


In [16]:
# Draw three balanced subsets (8000 LLM + 8000 human rows each), one per target
# language used in the translation cells below. These rebind the set*_llm and
# set*_hum names used earlier for the monolingual task.
# Every draw is seeded so that a kernel restart reproduces the same rows. The
# seed differs per set so the three subsets are not identical copies; they are
# still drawn independently from the same pool and can share rows.
set1_llm = filtered_multi_train_df_llm.sample(n=8000, random_state=42)
set1_hum = filtered_multi_train_df_hum.sample(n=8000, random_state=42)
set2_llm = filtered_multi_train_df_llm.sample(n=8000, random_state=43)
set2_hum = filtered_multi_train_df_hum.sample(n=8000, random_state=43)
set3_llm = filtered_multi_train_df_llm.sample(n=8000, random_state=44)
set3_hum = filtered_multi_train_df_hum.sample(n=8000, random_state=44)

In [17]:
# Stack each LLM sample on top of its human counterpart (original indices kept).
set1, set2, set3 = (
    pd.concat([llm_part, hum_part])
    for llm_part, hum_part in (
        (set1_llm, set1_hum),
        (set2_llm, set2_hum),
        (set3_llm, set3_hum),
    )
)

In [18]:
print(set1.shape)
print(set1.head())

(16000, 3)
                                                     text label     source
13938   The 2017 Jackson State Tigers football team re...   LLM  wikipedia
37242   We present the relationship between molecular ...   LLM      arxiv
170677   The Romans did have a concept of technologica...   LLM     reddit
111403  Inspect your sink., Obtain a replacement fauce...   LLM    wikihow
21366   Living conditions in the country side in the 1...   LLM     reddit


In [19]:
print(set2.shape)
print(set2.head())

(16000, 3)
                                                     text label    source
8159    Organizing events can seem like a daunting tas...   LLM   wikihow
101590  \n\nThis paper presents a novel neural network...   LLM  peerread
89334   In this article, we present a novel approach f...   LLM     arxiv
23498   It was too expensive to build on large lots wi...   LLM    reddit
84999   The United States has had only one written con...   LLM    reddit


In [20]:
print(set3.shape)
print(set3.head())

(16000, 3)
                                                     text label     source
31767   Chiral symmetry and the string description of ...   LLM      arxiv
72723   The Fitzgerald-Lorentz contraction is usually ...   LLM      arxiv
170557  \n\nIn this day and age, with all this negativ...   LLM     reddit
14335   Saint Francis of Assisi Catholic School is loc...   LLM  wikipedia
88614   We report the discovery by INTEGRAL/IBIS and S...   LLM      arxiv


In [21]:
# Translates every row of set1 from English into Russian (one translate call per
# row) and writes russian_addon.jsonl. The recorded run took about 49 minutes.
translate_and_save(set1, 'ru', 'russian_addon.jsonl')

Translating to ru: 100%|██████████| 16000/16000 [48:40<00:00,  5.48it/s] 


In [21]:
# Translates every row of set2 from English into Arabic (one translate call per
# row) and writes arabic_addon.jsonl. The recorded run took about 45 minutes.
translate_and_save(set2, 'ar', 'arabic_addon.jsonl')

Translating to ar: 100%|██████████| 16000/16000 [44:55<00:00,  5.93it/s] 


In [22]:
# Translates every row of set3 from English into German (one translate call per
# row) and writes german_addon.jsonl. The recorded run took about 45 minutes.
translate_and_save(set3, 'de', 'german_addon.jsonl')

Translating to de: 100%|██████████| 16000/16000 [44:52<00:00,  5.94it/s] 


In [17]:
# Bulgarian pool: texts of at most m_chars characters, one frame per label.
# This rebinds filtered_multi_train_df_llm / filtered_multi_train_df_hum,
# replacing the frames produced by the previous filter cell.
m_chars = 2500

filtered_multi_train_df_llm, filtered_multi_train_df_hum = (
    filter_dataframe(multi_train_df, m_chars, ["bulgarian"], [label_name])
    for label_name in ('LLM', 'human')
)

# Show size and first rows of each subset, LLM first.
for _subset in (filtered_multi_train_df_llm, filtered_multi_train_df_hum):
    print(_subset.shape)
    print(_subset.head())

(4115, 3)
                                                    text label     source
43029  Президентът на Република България Румен Радев ...   LLM  bulgarian
43030  Тази седмица Тереза Мей ще даде начало на проц...   LLM  bulgarian
43033  1 април - ден на хумора и шегата | Радомир\n\n...   LLM  bulgarian
43035  "Нефтохимик" поведе с 2:0 победи на ЦСКА в Суп...   LLM  bulgarian
43036  Общината строи 5-етажен блок за социално слаби...   LLM  bulgarian
(4846, 3)
                                                    text  label     source
45029  "Първо искам да благодаря още веднъж на всички...  human  bulgarian
45030  Снимки към новината Премиерът на Великобритани...  human  bulgarian
45031  Свързани новиниБСП промени дневния ред на държ...  human  bulgarian
45032  Не само здрава уборка, но и буквално къпане на...  human  bulgarian
45033  Сн. Агро Пловдив Денят на шегата се празнува в...  human  bulgarian


In [20]:
# Balanced Bulgarian sample: 4000 rows per label, seeded so the same rows are
# drawn again after a kernel restart.
# NOTE(review): the outputs recorded further down show a 20-row bulg_set and a
# 20-item backtranslation run, so they were produced with a smaller n than the
# 4000 requested here. Re-run before relying on bulgarian_addon.jsonl.
bulg_llm = filtered_multi_train_df_llm.sample(n=4000, random_state=42)
bulg_hum = filtered_multi_train_df_hum.sample(n=4000, random_state=42)

In [21]:
# LLM rows first, then human rows; original indices are kept.
bulg_set = pd.concat([bulg_llm, bulg_hum], axis=0)

In [24]:
print(bulg_set.shape)
print(bulg_set.head())

(20, 3)
                                                    text label     source
43433  Фирмите на „Стройко“ – Труд са известни с прод...   LLM  bulgarian
48662  В Дания, нещо невероятно и вече общеизвестно, ...   LLM  bulgarian
91354  Асеновград напред стъпи и напусна Фонда за лет...   LLM  bulgarian
75218  Мрачният мир на Рачко беше разминат веднага, к...   LLM  bulgarian
44598  Днес е ден за бой… с възглавници | Новините от...   LLM  bulgarian


In [25]:
# Round-trips every row of bulg_set through English (two translate calls per row)
# and writes bulgarian_addon.jsonl.
# NOTE(review): the recorded progress bar shows only 20 rows, although the
# sampling cell requests 4000 rows per label — the saved output is stale.
backtranslate_and_save(bulg_set, source_language='bg', target_language='en', output_file='bulgarian_addon.jsonl')

Backtranslating from bg to en: 100%|██████████| 20/20 [00:07<00:00,  2.74it/s]


In [30]:
# Chinese pool: texts of at most m_chars characters, one frame per label.
# This rebinds filtered_multi_train_df_llm / filtered_multi_train_df_hum,
# replacing the frames produced by the previous filter cell.
m_chars = 1000

filtered_multi_train_df_llm, filtered_multi_train_df_hum = (
    filter_dataframe(multi_train_df, m_chars, ["chinese"], [label_name])
    for label_name in ('LLM', 'human')
)

# Show size and first rows of each subset, LLM first.
for _subset in (filtered_multi_train_df_llm, filtered_multi_train_df_hum):
    print(_subset.shape)
    print(_subset.head())

(5914, 3)
                                                    text label   source
56931  如果您的网络连接中断，您的角色可能会保留在游戏中的一段时间，具体时间取决于游戏服务器和您的网...   LLM  chinese
56932  很抱歉，我无法完全理解您的问题，因为它似乎涉及到特定的平台或市场。但是如果我理解正确的话，您...   LLM  chinese
56933  很抱歉，作为一个语言模型的人工智能助手，我无法确认您是否真正中奖及如何领取奖品。请注意保护您...   LLM  chinese
56934  血尿酸与肾结石之间的关系比较复杂。一般来说，高尿酸血症是导致尿酸肾结石的主要原因之一，但并非...   LLM  chinese
56935  关于哪种弓比较好，这要根据您的需求和个人喜好来决定。不同的弓有不同的特点，比如长弓射程远但需...   LLM  chinese
(5573, 3)
                                                    text  label   source
58901  我也是传3玩家，你的问题我经常遇到，其实这种问题是很难解决的，问题是GT的总服务器设定为无反...  human  chinese
58902  尊敬的顾客： \r\n   您好！5173客户服务026很高兴为您服务！ \r\n\r\n ...  human  chinese
58903  奖品领取是假的,哪有这么大蛤娜随街跳,天上掉陷饼呀!\r\n(一切以官方网站的信息为好)\r...  human  chinese
58905  根据HF80级弓经验介绍，敏体弓后期比较有前途（也学技能的哦，学了战斗技能后再穿敏体装备）\...  human  chinese
58906  1.NE是没有定空的技能,HM有飞锤,空锁,O有撒网,净化,UD有蛛网,冰冻,就数NE没有,...  human  chinese


In [31]:
# Balanced Chinese sample: 4000 rows per label, seeded so the same rows are
# drawn again after a kernel restart.
# NOTE(review): the outputs recorded further down show a 20-row ch_set and a
# 20-item backtranslation run, so they were produced with a smaller n than the
# 4000 requested here. Re-run before relying on chinese_addon.jsonl.
ch_llm = filtered_multi_train_df_llm.sample(n=4000, random_state=42)
ch_hum = filtered_multi_train_df_hum.sample(n=4000, random_state=42)

In [32]:
# LLM rows first, then human rows; original indices are kept.
ch_set = pd.concat([ch_llm, ch_hum], axis=0)

In [35]:
print(ch_set.shape)
print(ch_set.head())

(20, 3)
                                                    text label   source
56931  如果您的网络连接中断，您的角色可能会保留在游戏中的一段时间，具体时间取决于游戏服务器和您的网...   LLM  chinese
57169  "企业法律顾问考试"是中国国家法律职业资格考试(CNOL)的一种考试。CNOL是由中国司法部...   LLM  chinese
62507  ：\n\n1、抑郁和焦虑：穷养的孩子可能会更容易受到挫折，他们可能会更容易感到沮丧，可能会更...   LLM  chinese
61477  .\n\n1、首先，打开“开始”菜单，在搜索框中输入“金山软件基础服务”，找到对应的程序，右...   LLM  chinese
58217  去日本可以买到很多好东西，以下是一些推荐：\n1. 电子产品：日本有非常优秀的电子产品，像是...   LLM  chinese


In [37]:
# Round-trips every row of ch_set through English (two translate calls per row)
# and writes chinese_addon.jsonl.
# NOTE(review): the recorded progress bar shows only 20 rows, although the
# sampling cell requests 4000 rows per label — the saved output is stale.
backtranslate_and_save(ch_set, source_language='zh', target_language='en', output_file='chinese_addon.jsonl')

Backtranslating from zh to en: 100%|██████████| 20/20 [00:06<00:00,  2.96it/s]


In [15]:
# Indonesian pool: texts of at most m_chars characters, one frame per label.
# This rebinds filtered_multi_train_df_llm / filtered_multi_train_df_hum,
# replacing the frames produced by the previous filter cell.
m_chars = 2550

filtered_multi_train_df_llm, filtered_multi_train_df_hum = (
    filter_dataframe(multi_train_df, m_chars, ["indonesian"], [label_name])
    for label_name in ('LLM', 'human')
)

# Show size and first rows of each subset, LLM first.
for _subset in (filtered_multi_train_df_llm, filtered_multi_train_df_hum):
    print(_subset.shape)
    print(_subset.head())

(2099, 3)
                                                    text label      source
52934  Jakarta, CNN Indonesia -- Pebalap MotoGP legen...   LLM  indonesian
52935  Jakarta, CNN Indonesia -- Ahmad Dhani, persone...   LLM  indonesian
52936  Jakarta, CNN Indonesia -- Mantan Kepala Badan ...   LLM  indonesian
52937  Jakarta, CNN Indonesia -- Prosesor Snapdragon ...   LLM  indonesian
52938  Jakarta, CNN Indonesia – Dewasa ini semakin ba...   LLM  indonesian
(2339, 3)
                                                    text  label      source
54934  Jakarta, CNN Indonesia -- Setelah bergabung de...  human  indonesian
54935  Jakarta, CNN Indonesia -- Musisi Ahmad Dhani m...  human  indonesian
54937  Jakarta, CNN Indonesia -- Perusahaan semikondu...  human  indonesian
54941  Jakarta, CNN Indonesia -- Menteri Pendayagunaa...  human  indonesian
54942  Jakarta, CNN Indonesia -- Liverpool unggul 1-0...  human  indonesian


In [16]:
# Balanced Indonesian sample: 2000 rows per label, seeded so the same rows are
# drawn again after a kernel restart.
indo_llm = filtered_multi_train_df_llm.sample(n=2000, random_state=42)
indo_hum = filtered_multi_train_df_hum.sample(n=2000, random_state=42)

In [17]:
# LLM rows first, then human rows; original indices are kept.
indo_set = pd.concat([indo_llm, indo_hum], axis=0)

In [18]:
print(indo_set.shape)
print(indo_set.tail())

(4000, 3)
                                                    text  label      source
56010  Semua sudah dilakukan, tinggal pasrah dan berd...  human  indonesian
55125  Washington - Sejak diangkat menjadi Presiden A...  human  indonesian
56283  JAKARTA - Kementerian Keuangan (Kemenkeu) menc...  human  indonesian
93028  Jakarta -\n Ketum Gerindra Prabowo Subianto me...  human  indonesian
55090  REPUBLIKA.CO.ID, BANDUNG -- Laga Persib Bandun...  human  indonesian


In [None]:
# Round-trips every row of indo_set through English (two translate calls per row)
# and writes indonesian_addon.jsonl.
# NOTE(review): this cell has no execution count or output in the saved notebook,
# so the file was not produced by the recorded run.
backtranslate_and_save(indo_set, source_language='id', target_language='en', output_file='indonesian_addon.jsonl')

In [15]:
# Urdu pool: texts of at most m_chars characters, one frame per label.
# This rebinds filtered_multi_train_df_llm / filtered_multi_train_df_hum,
# replacing the frames produced by the previous filter cell.
m_chars = 1600

filtered_multi_train_df_llm, filtered_multi_train_df_hum = (
    filter_dataframe(multi_train_df, m_chars, ["urdu"], [label_name])
    for label_name in ('LLM', 'human')
)

# Show size and first rows of each subset, LLM first.
for _subset in (filtered_multi_train_df_llm, filtered_multi_train_df_hum):
    print(_subset.shape)
    print(_subset.head())

(2003, 3)
                                                    text label source
49029  این ایف سی وفاق نے پاکستان کے چار صوبوں کو 18 ...   LLM   urdu
49032  لاہور: غریب کی مشکل سان ہوگئی، جب وہ سبزی منڈی...   LLM   urdu
49033  کراچی: پاکستان اسٹاک ایکسچینج میں زبردست تیزی ...   LLM   urdu
49034  پاکستان میں زرعی کاروبار کی بہتری کے لیے بڑی خ...   LLM   urdu
49035  ہوگئی\n\nپیمرا نے 32 سیٹلائٹ ٹی وی چینلز کے لا...   LLM   urdu
(2249, 3)
                                                    text  label source
50934  رواں سال صوبوں کو130ارب یا79فیصدزیادہ رقم جاری...  human   urdu
50935  وفاقی وزیر خزانہ ریونیو اقتصادی امور شماریات ن...  human   urdu
50937  کراچی 92 نیوز غریب کی مشکل ہوئی سان سبزی منڈی ...  human   urdu
50938  پاکستان اسٹاک ایکسچینج میں کاروباری ہفتے کے پا...  human   urdu
50939  اسلام اباد 17 جنوری 2019 وزیراعظم عمران خان سے...  human   urdu


In [16]:
# Balanced Urdu sample: 2000 rows per label, seeded so the same rows are drawn
# again after a kernel restart.
urdu_llm = filtered_multi_train_df_llm.sample(n=2000, random_state=42)
urdu_hum = filtered_multi_train_df_hum.sample(n=2000, random_state=42)

In [17]:
# LLM rows first, then human rows; original indices are kept.
urdu_set = pd.concat([urdu_llm, urdu_hum], axis=0)

In [18]:
print(urdu_set.shape)
print(urdu_set.head())

(4000, 3)
                                                    text label source
50282  واشنگٹن: ٹوئٹر نے امریکی صدر ڈونلڈ ٹرمپ کے اکا...   LLM   urdu
91844  فواد خان پریانکا چوپڑا کے مدمقابل: کون بہتر ہے...   LLM   urdu
49542  شفقت امانت علی نے بھارت میں پاکستانی فنکاروں پ...   LLM   urdu
49439  ایک نئی فلم کا پہلا ٹریلر منظر عام پر آگیا ہے ...   LLM   urdu
50401  اسپاٹ فکسنگ کیس پر نظرثانی کا فیصلہ\n\nسپریم ک...   LLM   urdu


In [19]:
# Round-trips every row of urdu_set through English (two translate calls per row)
# and writes urdu_addon.jsonl. The recorded run took about 20 minutes for 4000 rows.
backtranslate_and_save(urdu_set, source_language='ur', target_language='en', output_file='urdu_addon.jsonl')

Backtranslating from ur to en: 100%|██████████| 4000/4000 [20:13<00:00,  3.30it/s]
