In [56]:
from collections import Counter

import stanza
import stopwordsiso as stopwords

# ISO language codes handled by this notebook; the single source of truth
# for both the model downloads and the pipeline table below.
languages = ['de', 'en', 'uk', 'es', 'nl', 'ca', 'ru', 'pt', 'ar', 'zh', 'cs']

# Fetch the default Stanza model package of every language.
for lang in languages:
    stanza.download(lang)

# One default Stanza pipeline per language code, built in list order and
# looked up later by language code.
stanza_pipelines = {code: stanza.Pipeline(code) for code in languages}

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 4.31MB/s]                    
2024-06-07 01:18:35 INFO: Downloaded file to C:\Users\27gur\stanza_resources\resources.json
2024-06-07 01:18:35 INFO: Downloading default packages for language: de (German) ...
2024-06-07 01:18:37 INFO: File exists: C:\Users\27gur\stanza_resources\de\default.zip
2024-06-07 01:18:42 INFO: Finished downloading models and saved to C:\Users\27gur\stanza_resources
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 3.40MB/s]                    
2024-06-07 01:18:43 INFO: Downloaded file to C:\Users\27gur\stanza_resources\resources.json
2024-06-07 01:18:43 INFO: Downloading default packages for language: en (English) ...
2024-06-07 01:18:44 INFO: File exists: C:\Users\27gur\stanza_resources\en\default.zip
2024-06-07 01:18:47 INFO: Finished downloading models and saved to C:\Users\27gur\

In [57]:
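# Stylometric feature extraction
#
# Lexical features (word / unique-word / character counts, type-token ratio,
# hapax legomenon rate), sentence-level features and part-of-speech counts,
# all computed from a Stanza-parsed document.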
class StylometricFeatures:
    """Stylometric features of one text, computed from its Stanza parse.

    The text is parsed once in ``__init__`` with the pipeline stored under
    ``lang`` in the module-level ``stanza_pipelines`` dict; every method then
    reads the resulting ``self.doc``.

    All counts run over every word the pipeline emits, which includes the
    words tagged ``PUNCT`` (the same words ``punctuation_count`` counts).
    Ratio features return ``0.0`` instead of raising ``ZeroDivisionError``
    when the parse contains no words or no sentences.
    """

    def __init__(self, text, lang):
        """Parse ``text`` with the pipeline registered for ``lang``.

        Raises:
            KeyError: if ``lang`` has no entry in ``stanza_pipelines``.
        """
        self.text = text
        self.lang = lang
        self.doc = stanza_pipelines[lang](text)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _words(self):
        """Yield every word of every sentence, in document order."""
        for sent in self.doc.sentences:
            yield from sent.words

    def _upos_count(self, tag):
        """Return how many words carry the universal POS tag ``tag``."""
        return sum(1 for word in self._words() if word.upos == tag)

    @staticmethod
    def _ratio(numerator, denominator):
        """Return ``numerator / denominator``, or ``0.0`` for a zero denominator."""
        return numerator / denominator if denominator else 0.0

    # ------------------------------------------------------------------
    # Lexical features
    # ------------------------------------------------------------------
    def word_count(self):
        """Return the total number of words in the parse."""
        return sum(1 for _ in self._words())

    def unique_word_count(self):
        """Return the number of distinct word forms (case-sensitive)."""
        return len({word.text for word in self._words()})

    def char_count(self):
        """Return the summed length of all word texts (spaces excluded)."""
        return sum(len(word.text) for word in self._words())

    def avg_word_length(self):
        """Return characters per word; ``0.0`` when there are no words."""
        return self._ratio(self.char_count(), self.word_count())

    def ttr(self):
        """Return the type-token ratio (distinct words / all words)."""
        return self._ratio(self.unique_word_count(), self.word_count())

    def hapax_legomenon(self):
        """Return the share of words whose form occurs exactly once."""
        frequencies = Counter(word.text for word in self._words())
        hapaxes = sum(1 for count in frequencies.values() if count == 1)
        return self._ratio(hapaxes, self.word_count())

    # ------------------------------------------------------------------
    # Sentence-level features
    # ------------------------------------------------------------------
    def sentence_count(self):
        """Return the number of sentences in the parse."""
        return len(self.doc.sentences)

    def avg_sentence_length(self):
        """Return words per sentence; ``0.0`` when there are no sentences."""
        return self._ratio(self.word_count(), self.sentence_count())

    def avg_sentence_complexity(self):
        """Return sentences per word; ``0.0`` when there are no words.

        Note: as implemented this is the reciprocal of
        ``avg_sentence_length``; the formula is kept unchanged.
        """
        return self._ratio(self.sentence_count(), self.word_count())

    # ------------------------------------------------------------------
    # Part-of-speech and stopword features
    # ------------------------------------------------------------------
    def punctuation_count(self):
        """Return the number of words tagged ``PUNCT``."""
        return self._upos_count('PUNCT')

    def noun_count(self):
        """Return the number of words tagged ``NOUN``."""
        return self._upos_count('NOUN')

    def stopword_count(self):
        """Return how many words are stopwords of ``self.lang``.

        Words are lower-cased before the lookup so that capitalised
        occurrences (e.g. a sentence-initial "The") are counted too.
        NOTE(review): this assumes the stopwordsiso lists are lower-case;
        confirm against the package data.
        """
        stop_words = stopwords.stopwords(self.lang)
        return sum(
            1
            for word in self._words()
            if word.text in stop_words or word.text.lower() in stop_words
        )

    def verb_count(self):
        """Return the number of words tagged ``VERB``."""
        return self._upos_count('VERB')

    def adj_count(self):
        """Return the number of words tagged ``ADJ``."""
        return self._upos_count('ADJ')

    def adv_count(self):
        """Return the number of words tagged ``ADV``."""
        return self._upos_count('ADV')

    def comlex_sentence_count(self):
        """Return the number of sentences with a ``conj`` or ``advcl`` relation.

        A sentence counts once, however many such relations it contains.
        The misspelled name is kept for existing callers; prefer the alias
        ``complex_sentence_count``.
        """
        return sum(
            1
            for sent in self.doc.sentences
            if any(word.deprel in ('conj', 'advcl') for word in sent.words)
        )

    # Correctly spelled alias of the method above.
    complex_sentence_count = comlex_sentence_count


In [58]:
import pandas as pd
# Forward slash instead of a backslash: '\m' is an invalid escape sequence
# (a warning on recent Python versions) and a backslash-separated path only
# resolves on Windows, whereas 'dataset/multitude.csv' works on every platform.
df = pd.read_csv('dataset/multitude.csv')

df.head()

Unnamed: 0,text,label,multi_label,split,language,length,source
0,Der Ausbruch des Coronavirus hat die Entwicklu...,1,text-davinci-003,test,de,174,MULTITuDE_MassiveSumm_spiegel
1,Alex Azar was officially sworn in as the U.S. ...,1,text-davinci-003,train,en,57,MULTITuDE_MassiveSumm_voanews
2,Європейський союз вимагає зупинити розтрату ко...,1,gpt-3.5-turbo,test,uk,105,MULTITuDE_MassiveSumm_interfax
3,"Yesterday, hundreds of Zambian university stud...",1,text-davinci-003,train,en,254,MULTITuDE_MassiveSumm_voanews
4,"In a narrow and highly watched vote, the US Se...",1,gpt-4,train,en,416,MULTITuDE_MassiveSumm_voanews


In [59]:
samples_per_language = 5
random_seed = 42  # fixed so a fresh Restart & Run All draws the same rows

# Draw `samples_per_language` random rows from every language group.
# GroupBy.sample keeps the 'language' column in the result (the reporting
# loop reads row['language']) and does not rely on groupby.apply passing the
# grouping column to the applied function. The result keeps the original row
# index rather than a (language, row) MultiIndex.
samples = df.groupby('language').sample(n=samples_per_language, random_state=random_seed)


In [60]:
# print stylometric features for each sample

# (printed label, StylometricFeatures method name) pairs, in report order.
feature_report = (
    ('Word Count', 'word_count'),
    ('Unique Word Count', 'unique_word_count'),
    ('Character Count', 'char_count'),
    ('Average Word Length', 'avg_word_length'),
    ('Type Token Ratio', 'ttr'),
    ('Hapax Legomenon Rate', 'hapax_legomenon'),
    ('Sentence Count', 'sentence_count'),
    ('Average Sentence Length', 'avg_sentence_length'),
    ('Average Sentence Complexity', 'avg_sentence_complexity'),
    ('Punctuation Count', 'punctuation_count'),
    ('Noun Count', 'noun_count'),
    ('Stopword Count', 'stopword_count'),
    ('Verb Count', 'verb_count'),
    ('Adjective Count', 'adj_count'),
    ('Adverb Count', 'adv_count'),
    ('Complex Sentence Count', 'comlex_sentence_count'),
)

for _, sample_row in samples.iterrows():
    text, lang = sample_row['text'], sample_row['language']

    # Header with the language code, then the raw text being analysed.
    print(f'\n  {lang}  \n')
    print(text)

    # Parse once, then evaluate and print each feature in report order.
    features = StylometricFeatures(text, lang)
    for label, method_name in feature_report:
        value = getattr(features, method_name)()
        print(f'{label}: {value}')


  ar  

في خبر عاجل من العراق، وصل زعماء المعارضة العراقية إلى مناطق الأكراد شمال العراق في محاولة لتعزيز التعاون بين الجانبين وتوحيد الجهود لمواجهة التحديات الأمنية والسياسية التي تواجه البلاد.ويأتي هذا اللقاء المهم بين قادة المعارضة العراقية وممثلين عن الحكومة الإقليمية الكردستانية، في ظل تصاعد التوترات بين بغداد وأربيل على خلفية الاستفتاء على استقلال كردستان ومشاركة القوات الكردية في عمليات استعادة السيطرة على الموصل وكركوك من قبضة تنظيم الدولة الإسلامية.وأكد الزعماء خلال اللقاء على ضرورة تعزيز التوافق بين جميع الأطراف العراقية والعمل المشترك لحل الخلافات وتعزيز الاستقرار في العراق وتحقيق التقدم في مواجهة الجماعات المتطرفة.
Word Count: 116
Unique Word Count: 79
Character Count: 533
Average Word Length: 4.594827586206897
Type Token Ratio: 0.6810344827586207
Hapax Legomenon Rate: 0.5603448275862069
Sentence Count: 3
Average Sentence Length: 38.666666666666664
Average Sentence Complexity: 0.02586206896551724
Punctuation Count: 5
Noun Count: 57
Stopword Count: 36
Verb Count: 4
Adjectiv