# DAIGT-V4 Dataset Analysis

The goal is to analyze the texts in the DAIGT-V4 dataset. To do so, we load the corpora and preprocess them, which includes cleaning, chunking, and feature extraction. After that, we perform PCA.

In [1]:
%load_ext autoreload   
%autoreload 2

from src import *
import dill
import nltk
import os
nltk.download('all')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\piotr\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\piotr\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\piotr\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\piotr\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\piotr\AppData\Roaming\nltk_data...
[

True

In [2]:
# Project configuration object; later cells read the dump-file paths from settings.paths.
settings = Settings()

In [3]:
# Restore previously saved artifacts (presumably written by the writing-style analysis run).
# NOTE(security): dill.load can execute arbitrary code while unpickling - only load
# dump files that you produced yourself.
with open(settings.paths.ws_dataset_dump_file, 'rb') as file:
    dump_dict = dill.load(file)

# Feature extractor and PCA results from that dump; both are passed to DaigtPCAAnalysis later on.
writing_style_feature_extractor: FeatureExtractor = dump_dict['feature_extractor']
writing_style_pca_analysis_results = dump_dict['pca_analysis_results']

In [4]:
# Build the dataset wrapper from the project settings and load it.
# load() returns a DaigtDataset (see the repr echoed as the cell output) - presumably
# the same object, to allow chaining; TODO confirm.
dataset = DaigtDataset(settings)
dataset.load()

<src.datasets.daigt.daigt_dataset.DaigtDataset at 0x2588bb70050>

In [5]:
# List the distinct values of the "source" column, i.e. the corpus or generator each text is attributed to.
dataset.csv["source"].unique()

array(['persuade_corpus', 'mistral7binstruct_v2', 'llama_70b_v1',
       'chat_gpt_moth', 'darragh_claude_v7', 'darragh_claude_v6',
       'mistral7binstruct_v1', 'llama2_chat', 'falcon_180b_v1',
       'radek_500', 'NousResearch/Llama-2-7b-chat-hf',
       'mistralai/Mistral-7B-Instruct-v0.1', 'palm-text-bison1',
       'cohere-command', 'radekgpt4', 'train_essays', 'kingki19_palm',
       'Intel-neural-chat-7b-v3-1_LLMEssays_v1', 'Mistral7B_CME_v7',
       'llama_falcon_v3_llama_70b', 'llama_falcon_v3_falcon_180b',
       'nima_gpt4', 'text-ada-001', 'text-babbage-001', 'text-curie-001',
       'text-davinci-001', 'text-davinci-002', 'text-davinci-003',
       'persuade_finetuned_llamas'], dtype=object)

In [6]:
# Preview the first rows (text, label, prompt_name, source, RDizzl3_seven, model).
dataset.head()

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven,model
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False,human
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False,human
2,Driving while the use of cellular devices\n\nT...,0,Phones and driving,persuade_corpus,False,human
3,Phones & Driving\n\nDrivers should not be able...,0,Phones and driving,persuade_corpus,False,human
4,Cell Phone Operation While Driving\n\nThe abil...,0,Phones and driving,persuade_corpus,False,human


In [7]:
# Clean the dataset. The return value is not captured and later cells keep using `dataset`,
# so this presumably mutates it in place - re-running this cell alone may not be idempotent; TODO confirm.
dataset.clean()

In [8]:
# Report how much text the cleaning step removed, as a percentage computed by DaigtMetadataAnalysis.
print(f"Text removed while cleaning: {DaigtMetadataAnalysis.get_percentage_of_removed_text(dataset)}%.")

Text removed while cleaning: 6.082373628842047%.


In [9]:
# Report how many texts the cleaning step dropped: the absolute count, the original
# number of texts, and the dropped share as a percentage.
original_number_of_texts, difference, percentage_difference = DaigtMetadataAnalysis.get_number_of_removed_rows(dataset)
print(f"Number of texts removed while cleaning: {difference} out of {original_number_of_texts} - {percentage_difference}%.")

Number of texts removed while cleaning: 4490 out of 73573 - 6.102782270669947%.


In [10]:
# Run the preprocessing pipeline on the cleaned dataset, then display the per-collection
# totals (syllables, complex words, sentences, words, splits).
preprocessing_results = DaigtPreprocessing(settings).preprocess(dataset)
preprocessing_results.info()

Unnamed: 0,collection,num_of_syllabes,num_of_complex_words,num_of_sentences,num_of_words,num_of_splits
0,human,14120380,969221,483109,9813375,10769987
1,mistral,10682516,1303925,285320,6070966,6778864
2,llama,8816281,892279,291480,5503243,6170227
3,gpt,2101934,245191,61009,1219763,1368287
4,claude,1130911,140517,35002,621156,686641
5,falcon,3024208,377543,84006,1707344,1908469
6,palm,1081325,109961,42883,678476,771607
7,cohere,187627,20141,5956,116050,128760
8,ada,213987,19734,7970,139634,153940
9,babbage,207042,20278,7496,133039,146166


In [12]:
# Compute metrics from the preprocessing results; they feed the PCA analysis and the final dump.
# NOTE(review): the execution counter jumps from In [10] to In [12], so one execution is
# missing in between (a deleted or re-run cell) - confirm the notebook passes Restart & Run All.
metrics_analysis_results = DaigtMetricsAnalysis().analyze(preprocessing_results)

In [13]:
# Show the names reported by the loaded feature extractor: identifier columns, style
# metrics, punctuation marks and function words.
writing_style_feature_extractor.get_feature_names()

['chunk_id',
 'source_name',
 'author_name',
 'collection_name',
 'unique_word_count',
 'average_word_length',
 'average_sentence_length',
 'average_syllables_per_word',
 'flesch_reading_ease',
 'flesch_kincaid_grade_level',
 'gunning_fog_index',
 'yules_characteristic_k',
 'herdans_c',
 'maas',
 'simpsons_index',
 '"',
 ',',
 ';',
 '?',
 '-',
 "'",
 '.',
 'he',
 'as',
 'it',
 'is',
 'was',
 'his',
 'and',
 'my',
 'that',
 'our',
 'had',
 'a',
 'in',
 'she',
 'said',
 'their',
 'her',
 'with',
 'of',
 'its',
 'the',
 'you',
 'to']

In [14]:
# Configure the PCA analysis with the feature extractor and the PCA results that were
# loaded from the dump file, then run it on this dataset's metrics.
# Presumably this reuses the previously fitted PCA instead of fitting a new one - TODO confirm in DaigtPCAAnalysis.
pca_analysis = DaigtPCAAnalysis(
    settings=settings,
    writing_style_feature_extractor=writing_style_feature_extractor,
    writing_style_pca_analysis_results=writing_style_pca_analysis_results
)
pca_results = pca_analysis.analyze(metrics_analysis_results)

In [15]:
# Render the PCA results.
DaigtPCAVisualization(pca_results).visualize()

In [16]:
# Persist the artifacts of this run so they can be reloaded later with dill.load.
# Mode 'wb' creates the file or truncates an existing one, so no separate
# exists()/remove() step is needed before writing.
with open(settings.paths.daigt_dataset_dump_file, 'wb') as file:
    dill.dump(
        obj={
            "dataset": dataset,
            "preprocessing_results": preprocessing_results,
            "metrics_analysis_results": metrics_analysis_results,
            "feature_extractor": writing_style_feature_extractor,
            "pca_results": pca_results,
        },
        file=file
    )