This is an experiment to extend the original dataset with human-written text sampled from the C4 dataset (`allenai/c4`).

In [1]:
from datasets import load_dataset

# Open the English, Russian and Spanish "train" splits of allenai/c4 in
# streaming mode; the three handles are iterated in the cells below.
en, ru, es = (
    load_dataset("allenai/c4", lang_code, split="train", streaming=True)
    for lang_code in ("en", "ru", "es")
)



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from tqdm import tqdm
import json

# Take the first 20000 English samples that have fewer than 300 words
# (a word is a whitespace-separated token). Iteration over the stream stops
# as soon as the quota is filled.
en_samples = []
for sample in tqdm(en):
    text = sample["text"]
    if len(text.split()) < 300:
        en_samples.append(text)
        # the list only grows here, so the quota only needs checking here
        if len(en_samples) == 20000:
            break

33946it [00:20, 1654.76it/s]


In [3]:
# Take the first 9000 Russian samples that have fewer than 300 words
# (a word is a whitespace-separated token). Iteration over the stream stops
# as soon as the quota is filled.
ru_samples = []
for sample in tqdm(ru):
    text = sample["text"]
    if len(text.split()) < 300:
        ru_samples.append(text)
        # the list only grows here, so the quota only needs checking here
        if len(ru_samples) == 9000:
            break

15047it [00:27, 548.44it/s] 


In [4]:
# Take the first 9000 Spanish samples that have fewer than 300 words
# (a word is a whitespace-separated token). Iteration over the stream stops
# as soon as the quota is filled.
es_samples = []
for sample in tqdm(es):
    text = sample["text"]
    if len(text.split()) < 300:
        es_samples.append(text)
        # the list only grows here, so the quota only needs checking here
        if len(es_samples) == 9000:
            break

19149it [00:07, 2631.96it/s]


In [5]:
# Keep only the final 2000 collected texts of every language
# (a list shorter than 2000 is kept whole by the negative slice).
en_samples, ru_samples, es_samples = (
    texts[-2000:] for texts in (en_samples, ru_samples, es_samples)
)

In [16]:
import re
def preprocess_text(text):
    """Normalise a raw document into a single lower-cased line.

    Steps, in order: lower-case the text, delete URL-like tokens (starting
    with ``http`` or ``www``), delete e-mail-like tokens (anything around an
    ``@``), collapse every run of whitespace into one space, and trim the ends.

    Args:
        text: the raw document as a ``str``.

    Returns:
        The cleaned string, with no newlines and no leading or trailing
        whitespace.
    """
    text = text.lower()
    # remove urls
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    # remove emails
    text = re.sub(r"\S+@\S+", "", text)
    # collapse whitespace; \s also matches newlines, so no separate "\n" pass is needed
    text = re.sub(r"\s+", " ", text)
    # trim last: deleting a URL/e-mail at either end of the text leaves a space behind
    return text.strip()

In [25]:
import pandas as pd
languages = ['de', 'en', 'uk', 'es', 'nl', 'ca', 'ru', 'pt', 'ar', 'zh', 'cs']
columns = ["text", "label", "multi_label", "split", "language", "length", "source"]

# For every language, stream the C4 validation split and keep the first 2000
# preprocessed documents whose word count is strictly between 100 and 300.
# Rows are gathered in a plain list and the frame is built once at the end, so
# no empty frame is concatenated and the resulting index is unique.
rows = []
for lang in languages:
    dataset = load_dataset("allenai/c4", lang, split="validation", streaming=True)
    print(lang)
    kept = 0
    for sample in tqdm(dataset):
        text = preprocess_text(sample["text"])
        n_words = len(text.split())
        if 100 < n_words < 300:
            # label 0 / "human": these rows are C4 documents used as the human class
            rows.append([text, 0, "human", "test", lang, n_words, "c4"])
            kept += 1
        if kept == 2000:
            break

new_test_df = pd.DataFrame(rows, columns=columns)


de


6379it [00:07, 903.78it/s] 


en


5552it [00:03, 1559.43it/s]


uk


5444it [00:06, 798.57it/s] 


es


6833it [00:07, 908.15it/s] 


nl


5523it [00:05, 1012.13it/s]


ca


6787it [00:10, 674.35it/s] 


ru


5985it [00:07, 832.79it/s] 


pt


5960it [00:03, 1737.65it/s]


ar


6301it [00:07, 793.43it/s] 


zh


19794it [00:28, 683.98it/s] 


cs


5961it [00:03, 1611.89it/s]


In [26]:
# Report the number of missing values found in each column.
missing_per_column = new_test_df.isnull().sum()
print(missing_per_column)

# Discard every row that contains at least one missing value.
new_test_df = new_test_df.dropna()

text           0
label          0
multi_label    0
split          0
language       0
length         0
source         0
dtype: int64


In [27]:
# Save the C4 test rows to dataset/c4_test.csv; index=False keeps the row index out of the file.
new_test_df.to_csv("dataset/c4_test.csv", index=False)

In [28]:
# Put the new C4 test rows in front of the rows read back from
# dataset/multic4.csv and save the combination under a new file name.
previous_rows = pd.read_csv("dataset/multic4.csv")
final_dataset = pd.concat([new_test_df, previous_rows])
final_dataset.to_csv("dataset/multic4-new.csv", index=False)

In [30]:
# Partition the combined dataset by the value of its "split" column.
is_train = final_dataset["split"] == "train"
is_test = final_dataset["split"] == "test"
train_df = final_dataset[is_train]
test_df = final_dataset[is_test]

# Print how many rows carry each label value, test split first.
print("test_df", test_df['label'].value_counts())
print("train_df", train_df['label'].value_counts())

# Draw the same label counts as one bar chart per split.
import matplotlib.pyplot as plt

for split_name, split_frame in (("test", test_df), ("train", train_df)):
    split_frame['label'].value_counts().plot(kind='bar')
    plt.title(f'class distribution in {split_name} set')
    plt.show()

test_df label
1    26059
0    25236
Name: count, dtype: int64
train_df label
1    40030
0    38753
Name: count, dtype: int64


ModuleNotFoundError: No module named 'matplotlib'

In [22]:
# Mean of the "length" column (whitespace word count per row) over the new test rows.
new_test_df['length'].mean()

257.41263636363635

In [32]:
# Mean number of whitespace-separated words per sample, one value per language.
en_words = sum(len(text.split()) for text in en_samples) / len(en_samples)
ru_words = sum(len(text.split()) for text in ru_samples) / len(ru_samples)
es_words = sum(len(text.split()) for text in es_samples) / len(es_samples)

print(en_words, ru_words, es_words)

127.3187 125.82485714285714 144.264


In [33]:
import pandas as pd

# Load dataset/dataset_all.csv into df_original.
df_original = pd.read_csv("dataset/dataset_all.csv")

In [34]:
# Preview the first rows of df_original.
df_original.head()

Unnamed: 0,text,label,multi_label,split,language,length,source,word_count,unique_word_count,char_count,...,question_mark_count,exclamation_mark_count,flesch_reading_ease,gunning_fog_index,first_person_pronoun_count,person_entity_count,date_entity_count,uniqueness_bigram,uniqueness_trigram,syntax_variety
0,Der Ausbruch des Coronavirus hat die Entwicklu...,1,text-davinci-003,test,de,174,MULTITuDE_MassiveSumm_spiegel,199.0,118.0,1067.0,...,0.0,0.0,-272.02217,11.15603,0.0,0.0,0.0,0.90404,0.979695,12.0
1,Alex Azar was officially sworn in as the U.S. ...,1,text-davinci-003,train,en,57,MULTITuDE_MassiveSumm_voanews,70.0,54.0,311.0,...,0.0,1.0,-186.793214,8.714286,0.0,5.0,2.0,1.0,1.0,11.0
2,Європейський союз вимагає зупинити розтрату ко...,1,gpt-3.5-turbo,test,uk,105,MULTITuDE_MassiveSumm_interfax,130.0,82.0,691.0,...,0.0,0.0,-269.236538,11.015385,0.0,0.0,0.0,0.860465,0.929688,14.0
3,"Yesterday, hundreds of Zambian university stud...",1,text-davinci-003,train,en,254,MULTITuDE_MassiveSumm_voanews,292.0,149.0,1419.0,...,0.0,0.0,-231.229869,11.4401,1.0,1.0,1.0,0.876289,0.965517,13.0
4,"In a narrow and highly watched vote, the US Se...",1,gpt-4,train,en,416,MULTITuDE_MassiveSumm_voanews,476.0,242.0,2259.0,...,0.0,0.0,-224.855788,13.160504,1.0,2.0,2.0,0.871579,0.974684,15.0


In [36]:
# Empty frame carrying the seven dataset columns.
# NOTE(review): new_df is reassigned from pd.concat(...) in a later cell, so this
# placeholder looks unused — confirm before removing it.
new_df = pd.DataFrame(columns=["text", "label", "multi_label", "split", "language", "length", "source" ])

In [47]:
# Turn every collected text into one dataset row:
# [text, label 0, "human", "train" split, language code, whitespace word count, "c4"].

# English rows
all_en = [
    [text, 0, "human", "train", "en", len(text.split()), "c4"]
    for text in en_samples
]

# Russian rows
all_ru = [
    [text, 0, "human", "train", "ru", len(text.split()), "c4"]
    for text in ru_samples
]

# Spanish rows
all_es = [
    [text, 0, "human", "train", "es", len(text.split()), "c4"]
    for text in es_samples
]

In [48]:
# One frame per language, all sharing the same column layout.
dataset_columns = ["text", "label", "multi_label", "split", "language", "length", "source"]
df_en, df_es, df_ru = (
    pd.DataFrame(rows, columns=dataset_columns)
    for rows in (all_en, all_es, all_ru)
)

# Stack them in en, ru, es order and renumber the rows from 0.
new_df = pd.concat([df_en, df_ru, df_es], ignore_index=True)

In [49]:
# Display the combined en/ru/es C4 rows.
new_df

Unnamed: 0,text,label,multi_label,split,language,length,source
0,Beginners BBQ Class Taking Place in Missoula!\...,0,human,train,en,130,c4
1,Foil plaid lycra and spandex shortall with met...,0,human,train,en,29,c4
2,How many backlinks per day for new site?\nDisc...,0,human,train,en,187,c4
3,The Denver Board of Education opened the 2017-...,0,human,train,en,164,c4
4,BANGALORE CY JUNCTION SBC to GONDIA JUNCTION G...,0,human,train,en,63,c4
...,...,...,...,...,...,...,...
33995,Neumáticos 315/30 R18 » ENTREGA GRATIS » Opone...,0,human,train,es,68,c4
33996,Carlos Morales archivos - Seguros TV Blog Segu...,0,human,train,es,83,c4
33997,Aniversarios de Empresa Castro-Urdiales - Mari...,0,human,train,es,140,c4
33998,04 de July del 2017 a las 21:29 -\nTres comisi...,0,human,train,es,233,c4


In [50]:
# preprocess the text column of new_df
import re

def preprocess_text(text):
    """Normalise a raw document into a single lower-cased line.

    Steps, in order: lower-case the text, delete URL-like tokens (starting
    with ``http`` or ``www``), delete e-mail-like tokens (anything around an
    ``@``), collapse every run of whitespace into one space, and trim the ends.

    Args:
        text: the raw document as a ``str``.

    Returns:
        The cleaned string, with no newlines and no leading or trailing
        whitespace.
    """
    text = text.lower()
    # remove urls
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    # remove emails
    text = re.sub(r"\S+@\S+", "", text)
    # collapse whitespace; \s also matches newlines, so no separate "\n" pass is needed
    text = re.sub(r"\s+", " ", text)
    # trim last: deleting a URL/e-mail at either end of the text leaves a space behind
    return text.strip()

# Run every text of new_df through preprocess_text and store the result back in the column.
new_df["text"] = new_df["text"].map(preprocess_text)

In [51]:
# Recompute the length column as the whitespace word count of the current text column.
new_df["length"] = [len(text.split()) for text in new_df["text"]]

In [52]:
# Display new_df to inspect the result.
new_df

130.41838235294117


In [24]:
# Mean of the "length" column of new_df.
# NOTE(review): the saved output of this cell is a NameError for new_df, so it was
# last run in a kernel where new_df had not been created — re-run it after new_df is built.
new_df['length'].mean()

NameError: name 'new_df' is not defined

In [54]:
# Read dataset/multitude.csv and append the C4 rows held in new_df;
# ignore_index=True renumbers the combined rows from 0.
multitude = pd.read_csv("dataset/multitude.csv")
multic4 = pd.concat([multitude, new_df], ignore_index=True)

In [64]:
# Save the C4 rows on their own to dataset/c4train.csv, without the row index.
new_df.to_csv("dataset/c4train.csv", index=False)

In [56]:
# Count the NaN values in each column of multic4.
print(multic4.isnull().sum())

text           0
label          0
multi_label    0
split          0
language       0
length         0
source         0
dtype: int64


In [57]:
# Save the combined frame to dataset/multic4.csv, without the row index.
multic4.to_csv("dataset/multic4.csv", index=False)

In [23]:
# Reload the saved file so the checks run on what is actually on disk.
multic4 = pd.read_csv("dataset/multic4.csv")

# Missing values per column, then the number of fully duplicated rows.
null_counts = multic4.isnull().sum()
duplicate_rows = multic4.duplicated().sum()
print(null_counts)
print(duplicate_rows)


text           0
label          0
multi_label    0
split          0
language       0
length         0
source         0
dtype: int64
0


In [60]:
# Drop rows with missing values first, then rows that exactly repeat an earlier row.
multic4 = multic4.dropna().drop_duplicates()

# Confirm that no missing values are left in any column.
remaining_nulls = multic4.isnull().sum()
print(remaining_nulls)


text           0
label          0
multi_label    0
split          0
language       0
length         0
source         0
dtype: int64


In [61]:
# Preview the first rows of multic4.
multic4.head()

Unnamed: 0,text,label,multi_label,split,language,length,source
0,Der Ausbruch des Coronavirus hat die Entwicklu...,1,text-davinci-003,test,de,174,MULTITuDE_MassiveSumm_spiegel
1,Alex Azar was officially sworn in as the U.S. ...,1,text-davinci-003,train,en,57,MULTITuDE_MassiveSumm_voanews
2,Європейський союз вимагає зупинити розтрату ко...,1,gpt-3.5-turbo,test,uk,105,MULTITuDE_MassiveSumm_interfax
3,"Yesterday, hundreds of Zambian university stud...",1,text-davinci-003,train,en,254,MULTITuDE_MassiveSumm_voanews
4,"In a narrow and highly watched vote, the US Se...",1,gpt-4,train,en,416,MULTITuDE_MassiveSumm_voanews


In [62]:
# Write multic4 to dataset/multic4.csv (overwriting any existing file), without the row index.
multic4.to_csv("dataset/multic4.csv", index=False)

In [63]:
# Display multic4.
multic4

Unnamed: 0,text,label,multi_label,split,language,length,source
0,Der Ausbruch des Coronavirus hat die Entwicklu...,1,text-davinci-003,test,de,174,MULTITuDE_MassiveSumm_spiegel
1,Alex Azar was officially sworn in as the U.S. ...,1,text-davinci-003,train,en,57,MULTITuDE_MassiveSumm_voanews
2,Європейський союз вимагає зупинити розтрату ко...,1,gpt-3.5-turbo,test,uk,105,MULTITuDE_MassiveSumm_interfax
3,"Yesterday, hundreds of Zambian university stud...",1,text-davinci-003,train,en,254,MULTITuDE_MassiveSumm_voanews
4,"In a narrow and highly watched vote, the US Se...",1,gpt-4,train,en,416,MULTITuDE_MassiveSumm_voanews
...,...,...,...,...,...,...,...
108076,neumáticos 315/30 r18 » entrega gratis » opone...,0,human,train,es,68,c4
108077,carlos morales archivos - seguros tv blog segu...,0,human,train,es,83,c4
108078,aniversarios de empresa castro-urdiales - mari...,0,human,train,es,140,c4
108079,04 de july del 2017 a las 21:29 - tres comisio...,0,human,train,es,233,c4


In [None]:
# Copy the values exposed by `features` into row `index` of `results`, one column each.
# NOTE(review): this looks like the body of a loop whose header is not in this cell —
# `results`, `index` and `features` are not defined in the visible cells, and the first
# assignment sits at a different indentation from the rest, so the cell cannot run as written.
results.at[index, 'word_count'] = features.word_count
    results.at[index, 'unique_word_count'] = features.unique_word_count
    results.at[index, 'char_count'] = features.char_count
    results.at[index, 'avg_word_length'] = features.avg_word_length
    results.at[index, 'ttr'] = features.ttr
    results.at[index, 'hapax_legomenon'] = features.hapax_legomenon
    results.at[index, 'sentence_count'] = features.sentence_count
    results.at[index, 'avg_sentence_length'] = features.avg_sentence_length
    results.at[index, 'avg_sentence_complexity'] = features.avg_sentence_complexity
    results.at[index, 'punctuation_count'] = features.punctuation_count
    results.at[index, 'noun_count'] = features.noun_count
    results.at[index, 'stopword_count'] = features.stopword_count
    results.at[index, 'verb_count'] = features.verb_count
    # the column names are abbreviated (adj/adv) while the attributes are spelled out
    results.at[index, 'adj_count'] = features.adjective_count
    results.at[index, 'adv_count'] = features.adverb_count
    results.at[index, 'complex_sentence_count'] = features.complex_sentence_count
    results.at[index, 'question_mark_count'] = features.question_mark_count
    results.at[index, 'exclamation_mark_count'] = features.exclamation_mark_count
    results.at[index, 'flesch_reading_ease'] = features.flesch_reading_ease
    results.at[index, 'gunning_fog_index'] = features.gunning_fog_index
    results.at[index, 'first_person_pronoun_count'] = features.first_person_pronoun_count
    results.at[index, 'person_entity_count'] = features.person_entity_count
    results.at[index, 'date_entity_count'] = features.date_entity_count
    # accessed without parentheses and unpacked into two values — presumably a property
    # yielding a (bigram, trigram) pair; TODO confirm it is not a method that needs calling
    uniqueness_bigram_val, uniqueness_trigram_val = features.calculate_uniqueness_stanza
    results.at[index, 'uniqueness_bigram'] = uniqueness_bigram_val
    results.at[index, 'uniqueness_trigram'] = uniqueness_trigram_val
    # also accessed without parentheses — TODO confirm this is a property, not a method
    results.at[index, 'syntax_variety'] = features.calculate_syntax_variety

In [None]:
# check for null values

