In [13]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import chardet
import nltk
import re
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import SnowballStemmer
from nltk import bigrams
from nltk.util import ngrams
from nltk.tokenize import word_tokenize 
from langdetect import detect, LangDetectException
from collections import Counter

In [3]:
# Laden Sie Ihre Daten
df = pd.read_csv('../data/raw/gpd_v2_20220427.csv', delimiter=',', encoding='utf-8')

In [8]:

import os
# Teilen Sie die Daten in populistisch und nicht populistisch
print(df.columns)

populist_files = df[df['rubricgrade'] == 2]['merging_variable']
non_populist_files = df[df['rubricgrade'] == 0]['merging_variable']

# Verzeichnis mit Ihren Textdateien
text_files_directory = 'C:\\Users\\furka\\OneDrive\\Documents\\Studium\\Adv.ML-Project\\data\\raw\\speeches_20220427\\'

# Funktion zum Lesen der Dateiinhalte
def read_file_contents(filename):
    filepath = os.path.join(text_files_directory, filename)
    
    if os.path.isfile(filepath):
        with open(filepath, 'rb') as file:
            rawdata = file.read()
        encoding = chardet.detect(rawdata)['encoding']
        with open(filepath, 'r', encoding=encoding) as file:
            return file.read().replace('\n', '')

    else:
        return None  # oder ein leerer String '', je nachdem, was in Ihrem Fall am besten passt


# Entferne alle Zeilen mit fehlenden Werten in 'merging_variable'
df = df.dropna(subset=['merging_variable'])

# Hinzufügen der Dateiinhalte zu Ihrem DataFrame
df['file_contents'] = df['merging_variable'].apply(read_file_contents)


# Speichere den bereinigten Datensatz
df.to_csv('../data/processed/gpd_processed.csv', index=False)





Index(['merging_variable', 'country', 'leader', 'party', 'lr', 'president',
       'term', 'startofterm', 'yearbegin', 'endofterm', 'yearend',
       'speechtype', 'speechnum', 'codernum', 'rubricgrade', 'averagerubric',
       'totalaverage', 'wb_region', 'region', 'file_contents'],
      dtype='object')


In [25]:
nltk.download('stopwords')

def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"

df = pd.read_csv('..\data\processed\gpd_processed.csv')

# Berücksichtige nur Zeilen mit "averagerubric" größer als 1.5
df = df.query("averagerubric > 1.5")

# Bereinigen Sie Ihre Daten
df['cleaned_text'] = df['file_contents'].apply(lambda x: re.sub(r'[^\w\s]', '', x))  # Entfernen von Satzzeichen
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: re.sub(r'\d+', '', x))  # Entfernen von Zahlen

# Füge eine neue Spalte hinzu, die die Sprache des Textes enthält
df['language'] = df['cleaned_text'].apply(detect_language)

# Zeige die Verteilung der Sprachen an
print(df['language'].value_counts())

# Bereinigen Sie Ihre Daten und entfernen Sie Stoppwörter, getrennt nach Sprachen
stopwords_multilang = {
    'en': set(stopwords.words('english')),
    'de': set(stopwords.words('german')),
    'fr': set(stopwords.words('french')),
    # Fügen Sie hier weitere Sprachen hinzu, wenn nötig
}

def remove_stopwords(text, lang):
    return ' '.join([word for word in text.split() if word.lower() not in stopwords_multilang[lang]])

for lang in stopwords_multilang.keys():
    df.loc[df['language'] == lang, 'cleaned_text'] = df.loc[df['language'] == lang, 'cleaned_text'].apply(lambda x: remove_stopwords(x, lang))


def get_top_bigrams(texts, n=30):
    vec = CountVectorizer(ngram_range=(2, 2)).fit(texts)
    bag_of_words = vec.transform(texts)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

for lang in stopwords_multilang.keys():
    texts = df.loc[df['language'] == lang, 'cleaned_text']
    if texts.empty:
        print(f"No texts for language {lang}")
    else:
        top_bigrams = get_top_bigrams(texts)
        print(f'Top bigrams for {lang}: {top_bigrams}')




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\furka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


language
es    120
en     20
ru     18
tr     16
mk     10
et      8
cs      8
ro      8
sk      8
tl      8
hr      4
hu      4
it      4
fr      4
bg      4
uk      4
Name: count, dtype: int64
Top bigrams for en: [('islamic state', 60), ('brothers sisters', 44), ('islamic world', 40), ('islamic revolution', 32), ('ji sri', 28), ('imam khomeini', 28), ('years ago', 28), ('uttar pradesh', 24), ('free india', 24), ('come back', 24), ('tseh lee', 24), ('mr lien', 24), ('would like', 20), ('dear friends', 20), ('past years', 20), ('religious popularity', 20), ('palestinian state', 20), ('yuan tseh', 20), ('first time', 16), ('political parties', 16), ('government delhi', 16), ('five years', 16), ('become pm', 16), ('want tell', 16), ('want ask', 16), ('youth country', 16), ('thank god', 16), ('friends islamic', 16), ('interest rates', 16), ('four years', 16)]
No texts for language de
Top bigrams for fr: [('discours lalbisgüetli', 104), ('mesdames messieurs', 60), ('présidente confédératio