In [1]:
import pandas as pd
import string
import statistics
import numpy as np

#import stanza
# import pickle

from tqdm import tqdm
from keybert import KeyBERT
from wordcloud import WordCloud

from textblob import TextBlob
from textblob_nl import PatternAnalyzer

import textstat

import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels

In [2]:
save_results_to = 'results/'

In [3]:
eng_train_meta = pd.read_csv('eng/train/abortion_train_metadata.csv')
eng_train_path = 'eng/train/'
eng_test_meta = pd.read_csv('eng/test/abortion_test_metadata.csv')
eng_test_path = 'eng/test/'

nld_train_meta = pd.read_csv('nld/train/abortus_train_metadata.csv').rename(columns={"content": "section"})
nld_train_path = 'nld/train/'
nld_test_meta = pd.read_csv('nld/test/abortus_test_metadata.csv').rename(columns={"content": "section"})
nld_test_path = 'nld/test/'

In [4]:
def get_all_article(path):
    import os,glob
    folder_path = path
    articles = []
    for filename in glob.glob(os.path.join(folder_path, '*.txt')):
        with open(filename, 'r') as f:
            text = f.read()
            articles.append(text)
    return articles

In [5]:
eng_train_articles = get_all_article(eng_train_path)
eng_test_articles = get_all_article(eng_test_path)

nld_train_articles = get_all_article(nld_train_path)
nld_test_articles = get_all_article(nld_test_path)

In [23]:
def avg_article_len(path):
    import os,glob
    folder_path = path
    filenum = 0
    wordcount = 0
    for filename in glob.glob(os.path.join(folder_path, '*.txt')):
        with open(filename, 'r') as f:
            text = f.read()
            filenum += 1
            wordcount += len(text)
    return wordcount/filenum

In [27]:
print("Average article length of ENG train: ", avg_article_len(eng_train_path))
print("Average article length of ENG test: ", avg_article_len(eng_test_path))
print("Average article length of NLD train: ", avg_article_len(nld_train_path))
print("Average article length of NLD test: ", avg_article_len(nld_test_path))

Average article length of ENG train:  7483.064583333334
Average article length of ENG test:  7507.591666666666
Average article length of NLD train:  3609.5291666666667
Average article length of NLD test:  3809.875


In [29]:
print("Average title length of ENG train: ", eng_train_meta['title'].apply(len).mean())
print("Average title length of ENG test: ", eng_test_meta['title'].apply(len).mean())
print("Average title length of NLD train: ", nld_train_meta['title'].apply(len).mean())
print("Average title length of NLD test: ", nld_test_meta['title'].apply(len).mean())

Average title length of ENG train:  79.89583333333333
Average title length of ENG test:  79.675
Average title length of NLD train:  63.545833333333334
Average title length of NLD test:  62.8


In [6]:
def process_articles(articles, language):
    nlp = stanza.Pipeline(lang=language, processors='tokenize,pos,lemma')
    processed_articles = []
    print("Using stanza to process articles (it may take long...)")
    for article in tqdm(articles):
        processed_articles.append(nlp.process(article))
    return processed_articles

In [7]:
def extract_keyword(articles, language):
    eng_kw_model = KeyBERT()
    nld_kw_model = KeyBERT(model = 'distiluse-base-multilingual-cased-v1')
    kw = []

    if language == 'eng':
        print("Calculating eng keywords")
        for article in tqdm(articles):
            keyword = eng_kw_model.extract_keywords(article)[0][0]
            kw.append(keyword)

    if language == 'nld':
        print("Calculating nld keywords")
        for article in tqdm(articles):
            keyword = nld_kw_model.extract_keywords(article)[0][0]
            kw.append(keyword)
    return kw

In [8]:
def create_wordcloud(keywords, figname):
    wc = WordCloud().generate(str(keywords).replace("'",""))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.savefig(save_results_to + figname + '.png', dpi = 300)
    plt.close()

In [9]:
def get_sentiment(articles, language):
    polarity = []
    subjectivity = []
    readability = []
    
    if language == 'eng':
        for article in articles:
            polarity.append(TextBlob(article).sentiment.polarity)
            subjectivity.append(TextBlob(article).sentiment.subjectivity)
            readability.append(textstat.flesch_reading_ease(article))

    if language == 'nld':
        for article in articles:
            polarity.append(TextBlob(text=article, analyzer=PatternAnalyzer()).sentiment[0])
            subjectivity.append(TextBlob(text=article, analyzer=PatternAnalyzer()).sentiment[1])
            readability.append(textstat.flesch_reading_ease(article))

    return polarity, subjectivity, readability

In [10]:
def create_df(meta, articles, language):
    df = pd.DataFrame(articles, columns=['content'])
    df['section'] = meta['section']
    df['keyword']= extract_keyword(articles, language)
    polarity, subjectivity, readability = get_sentiment(articles, language)
    df['polarity'] = polarity
    df['subjectivity'] = subjectivity
    df['readability'] = readability
    return df

In [11]:
eng_train_df = create_df(eng_train_meta, eng_train_articles, 'eng')
eng_test_df = create_df(eng_test_meta, eng_test_articles, 'eng')
nld_train_df = create_df(nld_train_meta, nld_train_articles, 'nld')
nld_test_df = create_df(nld_test_meta, nld_test_articles, 'nld')

Calculating eng keywords


100%|█████████████████████████████████████████| 480/480 [02:55<00:00,  2.73it/s]


Calculating eng keywords


100%|█████████████████████████████████████████| 120/120 [00:45<00:00,  2.61it/s]


Calculating nld keywords


100%|█████████████████████████████████████████| 480/480 [04:43<00:00,  1.69it/s]


Calculating nld keywords


100%|█████████████████████████████████████████| 120/120 [01:13<00:00,  1.64it/s]


In [12]:
def separate_nld_country(nld_df):
    nld_country = nld_df.drop(columns=['content'])
    nld_country['section'] = np.where(nld_country['section'] == 'Buitenland', 'Foreign', nld_country['section'])
    nld_country['section'] = np.where(nld_country['section'] == 'Van Trump naar Biden', 'Foreign', nld_country['section'])
    nld_country['section'] = np.where(nld_country['section'] != 'Foreign', 'NL', nld_country['section'])
    
    print("+++ Domestic statistics +++")
    print(nld_country.loc[nld_country['section'] == 'NL'].mean())
    print("+++ Foreign statistics +++")
    print(nld_country.loc[nld_country['section'] == 'Foreign'].mean())
    return nld_country

In [13]:
def separate_eng_country(eng_df):
    eng_country = eng_df.drop(columns=['content'])
    eng_country['section'] = np.where(eng_country['section'] == 'US news', 'US', eng_country['section'])
    eng_country['section'] = np.where(eng_country['section'] != 'US', 'non_US', eng_country['section'])
    
    print("+++ US statistics +++")
    print(eng_country.loc[eng_country['section'] == 'US'].mean())
    print("+++ non_US statistics +++")
    print(eng_country.loc[eng_country['section'] == 'non_US'].mean())
    return eng_country

In [14]:
print("====== eng train statistics ======")
eng_train_country = separate_eng_country(eng_train_df)
print("====== eng test statistics ======")
eng_test_country = separate_eng_country(eng_test_df)

print("====== nld train statistics ======")
nld_train_country = separate_nld_country(nld_train_df)
print("====== nld test statistics ======")
nld_test_country = separate_nld_country(nld_test_df)

+++ US statistics +++
polarity         0.087853
subjectivity     0.430909
readability     51.529031
dtype: float64
+++ non_US statistics +++
polarity         0.085967
subjectivity     0.430381
readability     52.090951
dtype: float64
+++ US statistics +++
polarity         0.095803
subjectivity     0.438832
readability     51.170755
dtype: float64
+++ non_US statistics +++
polarity         0.076595
subjectivity     0.432030
readability     52.150746
dtype: float64
+++ Domestic statistics +++
polarity         0.030810
subjectivity     0.492828
readability     63.888372
dtype: float64
+++ Foreign statistics +++
polarity         0.010575
subjectivity     0.499466
readability     63.535019
dtype: float64
+++ Domestic statistics +++
polarity         0.025776
subjectivity     0.496841
readability     64.821875
dtype: float64
+++ Foreign statistics +++
polarity         0.027839
subjectivity     0.499675
readability     64.633472
dtype: float64


In [15]:
# https://stackoverflow.com/a/50690729
def corrdot(*args, **kwargs):
    corr_r = args[0].corr(args[1], 'pearson')
    corr_text = f"{corr_r:2.2f}".replace("0.", ".")
    ax = plt.gca()
    ax.set_axis_off()
    marker_size = abs(corr_r) * 10000
    ax.scatter([.5], [.5], marker_size, [corr_r], alpha=0.6, cmap="coolwarm",
               vmin=-1, vmax=1, transform=ax.transAxes)
    font_size = abs(corr_r) * 40 + 5
    ax.annotate(corr_text, [.5, .5,],  xycoords="axes fraction",
                ha='center', va='center', fontsize=font_size)

In [16]:
def create_coorplot(df, figname):
    sns.set(style='white', font_scale=1.6)
    g = sns.PairGrid(df.drop(columns=['section']), aspect=1.4, diag_sharey=False)
    g.map_lower(sns.regplot, lowess=True, ci=False, line_kws={'color': 'black'})
    g.map_diag(sns.histplot, kde_kws={'color': 'black'})
    g.map_upper(corrdot)
    g.savefig(save_results_to + figname + '.png', dpi = 100)
    plt.close()

In [17]:
create_coorplot(eng_train_df, 'eng_train_coor')
create_coorplot(eng_test_df, 'eng_test_coor')
create_coorplot(nld_train_df, 'nld_train_coor')
create_coorplot(nld_test_df, 'nld_test_coor')

In [18]:
def nld_section_and_pie(nld_df, figname, type='train'):
    nld_majors = ['Buitenland','Binnenland','Politiek']
    nld_sections = nld_df.drop(columns=['content'])
    nld_sections.loc[( (nld_sections['section'] != 'Buitenland') & (nld_sections['section'] != 'Binnenland') & (nld_sections['section'] != 'Politiek')), 'section'] = 'Others'
    nld_label_count = dict(nld_sections['section'].value_counts())
    nld_labels = list(nld_label_count.keys())
    nld_values = list(nld_label_count.values())
    plt.pie(nld_values,labels=nld_labels, autopct=lambda p : '{:.2f}% '.format(p,p * sum(nld_values)/100))
    if type == 'train':
        plt.title("nld train set sections")
    if type == 'test':
        plt.title("nld test set sections")
    #plt.show()
    plt.savefig(save_results_to + figname + '.png', dpi = 100)
    plt.close()
    return nld_sections

In [19]:
def eng_section_and_pie(eng_df, figname, type='train'):
    eng_sections = eng_df.drop(columns=['content'])
    eng_sections.loc[( (eng_sections['section'] != 'US news') & (eng_sections['section'] != 'Opinion') & (eng_sections['section'] != 'World news') & (eng_sections['section'] != 'Australia news')), 'section'] = 'Others'
    eng_label_count = dict(eng_sections['section'].value_counts())
    eng_labels = list(eng_label_count.keys())
    eng_values = list(eng_label_count.values())
    plt.pie(eng_values,labels=eng_labels, autopct=lambda p : '{:.2f}% '.format(p,p * sum(eng_values)/100))
    if type == 'train':
        plt.title("eng train set sections")
    if type == 'test':
        plt.title("eng test set sections")
    #plt.show()
    plt.savefig(save_results_to + figname + '.png', dpi = 100)
    plt.close()
    return eng_sections

In [20]:
eng_train_sections = eng_section_and_pie(eng_train_df,'eng_train_sections')
eng_test_sections = eng_section_and_pie(eng_test_df,'eng_test_sections')
nld_train_sections = nld_section_and_pie(nld_train_df,'nld_train_sections')
nld_test_sections = nld_section_and_pie(nld_test_df,'nld_test_sections')

In [21]:
def create_stripplot(section_df, figname):
    s = sns.stripplot(data=section_df, x='section', y='polarity',hue='subjectivity', legend = False)
    _, xlabels = plt.xticks()
    s.set_xticklabels(xlabels, size=10)
    #plt.show()
    plt.savefig(save_results_to + figname + '.png', dpi = 100)
    plt.close()

In [22]:
create_stripplot(eng_train_sections, 'eng_train_strip')

  s.set_xticklabels(xlabels, size=10)
