In [None]:
# !python -m spacy download nl_core_news_sm

In [None]:
import pandas as pd
import numpy as np
import os
import csv
import nltk
import spacy
import random
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import  classification_report, cohen_kappa_score
from sklearn.model_selection import GridSearchCV

from transformers import set_seed
from sklearn.model_selection import train_test_split


In [3]:
# go one level up in the directory
os.chdir("/data/500gbstorage/actor_classification")


huggingface_cache_dir = 'model'

# change huggingface cache
os.environ['TRANSFORMERS_CACHE'] = huggingface_cache_dir

In [4]:
set_seed(42)
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)

In [5]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('dutch')

[nltk_data] Downloading package stopwords to /home/eklk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load & Prep Data

In [None]:
# load the manually annotated dataset
df = pd.read_csv('data/coded_df_topics_full.csv',
                 sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)

# change article_id to integer
df['article_id'] = df['article_id'].astype(int)

topic_vars = ['about_covid',  'topic_a', 'topic_b', 'topic_c', 'topic_d', 'topic_e', 'topic_f', 'topic_g', 'topic_h', 
              'topic_i', 'topic_j', 'topic_k', 'topic_l', 'topic_m', 'topic_n']

# change all topic vars to int
for i in topic_vars:
    df[i] = df[i].astype(int)

In [None]:
# articles df includes all article texts, categories and keywords
articles_df = pd.read_csv('data/final_nosarticles.csv',
                          sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)
print(articles_df.shape)

# get article text, category, keywords and page_id
articles_df = articles_df[['page_id', 'Title', 'Text', 'Category', 'Keywords']].drop_duplicates()
# make page id integer
articles_df['page_id'] = articles_df['page_id'].astype(int)
# change page_id to article_id
articles_df.rename(columns = {'page_id': 'article_id'}, inplace = True)
# change LINE BREAK to \n
articles_df['Text'] = articles_df['Text'].str.replace('[LINE_BREAK]', '\n')

In [8]:
# make an input text, combining Title, Text, Category and Keywords and before Category add string 'Categories: ' and before Keywords add string 'Keywords: ' if Category and Keywords are empty skip them
articles_df['Category'] = articles_df['Category'].fillna('')
articles_df['Keywords'] = articles_df['Keywords'].fillna('')
articles_df['input_text'] = articles_df['Title'] + '\n' + articles_df['Text'] + '\n' + 'Categories: ' + articles_df['Category'] + ' ' + 'Keywords: ' + articles_df['Keywords']

In [None]:
# merge articles_df with df
df = pd.merge(df, articles_df, on='article_id', how = 'left')

# Text preprocessing

In [21]:
print(df['input_text'].values[0])

# create a text preprocessing function where you lowercase the text and then lemmitize the text
def text_lower(text):
    text = text.lower()
    return text

df['input_text_lower'] = df['input_text'].apply(text_lower)
print(df['input_text_lower'].values[0])

Minister roept iedereen op: niet discrimineren om coronavirus 
Minister Bruins voor Medische Zorg vindt het verschrikkelijk dat mensen met een Aziatisch uiterlijk worden gediscrimineerd vanwege het coronavirus. Hij deed in de Tweede Kamer een oproep aan iedereen om hiertegen op te staan.
Bruins reageerde op vragen van onder anderen GroenLinks-Kamerlid Ellemeet. Zij zei dat mensen met Aziatisch uiterlijk op grote schaal worden gediscrimineerd. Ze hoorde bijvoorbeeld van een meisje dat mensen in de tram hun trui over hun mond trokken toen ze haar zagen.
Ellemeet vroeg de minister of hij zich hierover duidelijk wil uitspreken. Bruins zei hierop dat hij dit de komende dagen nog verschillende keren wil doen, ook buiten de Tweede Kamer.
De minister zei dat hij het er zeer mee eens is dat dit niet bij een fatsoenlijke samenleving hoort. "Mensen discrimineren gaat niet aan. We moeten ervoor zorgen dat het niet optreedt. Daar hebben wij allemaal een rol in."
Bruins is niet van plan om evenement

In [22]:
nlp = spacy.load("nl_core_news_sm")
lemmatizer = nlp.get_pipe("lemmatizer")

def text_lemmatizer(text):
    text = nlp(text)
    text = ' '.join([token.lemma_ for token in text])
    return text

df['input_text_lemmatized'] = df['input_text_lower'].apply(text_lemmatizer)
print(df['input_text_lemmatized'].values[0])

minister oproepen iedereen op : niet discrimineren om coronavirus 
 minister bruins voor medisch zorg vinden het verschrikkelijk dat mens met een aziatisch uiterlijk worden discrimineren vanwege het coronavirus . hij doen in de Tweede Kamer een oproep aan iedereen om hiertegen op te bijstaan . 
 bruins reageren op vraag van onder ander groenlinks-kamerlid ellemeet . zij zeggen dat mens met aziatisch uiterlijk op groot schaal worden discrimineren . ze horen bijvoorbeeld van een meisje dat mens in de tram hun trui over hun mond trekken toen ze haar zien . 
 ellemeet vragen de minister of hij zich hierover duidelijk willen uitspreken . bruins zeggen hierop dat hij dit de komen dag nog verschillend keer willen doen , ook buiten de Tweede Kamer . 
 de minister zeggen dat hij het er zeer mee eens zijn dat dit niet bij een fatsoenlijk samenleving horen . " mens discrimineren gaan niet aan . we moeten ervoor zorgen dat het niet optreden . daar hebben wij allemaal een rol in . " 
 bruins zijn n

In [None]:
train_df = df[df['reliability_article'] == 0]
test_df = df[df['reliability_article'] == 1]


# TF-IDF + SVM

## Gridsearch

In [35]:
# Select your features and target variable
X = train_df[['input_text_lower']]  # This should remain a DataFrame
y = train_df['about_covid'].values.flatten()  # Convert to 1D array

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

print(f"X_train shape: {X_train.shape}")  # Should be (n_samples_train, n_features)
print(f"y_train shape: {y_train.shape}")  # Should be (n_samples_train,)


X_train shape: (540, 1)
y_train shape: (540,)


In [None]:
# Define the parameter grid
param_grid_svc = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__max_features': [100, 2000, 1000, 5000, 10000],
    'clf__C': [0.1, 1, 10, 50, 100],
    'clf__max_iter': [100, 500, 1000]
}

# Define the SVC pipeline
pipeline_svc = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 1), analyzer='word', stop_words=stopwords)),
    ('clf', LinearSVC(random_state=0))
])

# Perform grid search for SVC
grid_search_svc = GridSearchCV(pipeline_svc, param_grid_svc, cv=10, scoring='f1_macro')
grid_search_svc.fit(X_train['input_text_lower'], y_train)  # Use the column name directly

print(f"Best parameters for SVC: {grid_search_svc.best_params_}")
print(f"Best score for SVC: {grid_search_svc.best_score_}")

In [37]:
# Get the best parameters and the best model
best_params = grid_search_svc.best_params_
best_model = grid_search_svc.best_estimator_

print("Best parameters found for SVC: ", best_params)
print("Best model found for SVC: ", best_model)

Best parameters found for SVC:  {'clf__C': 1, 'clf__max_iter': 100, 'tfidf__max_features': 1000, 'tfidf__ngram_range': (1, 2)}
Best model found for SVC:  Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=1000, ngram_range=(1, 2),
                                 stop_words=['de', 'en', 'van', 'ik', 'te',
                                             'dat', 'die', 'in', 'een', 'hij',
                                             'het', 'niet', 'zijn', 'is', 'was',
                                             'op', 'aan', 'met', 'als', 'voor',
                                             'had', 'er', 'maar', 'om', 'hem',
                                             'dan', 'zou', 'of', 'wat', 'mijn', ...])),
                ('clf', LinearSVC(C=1, max_iter=100, random_state=0))])


In [38]:
# get the predictions for the validation set
val_preds = best_model.predict(X_val['input_text_lower'])

In [39]:
val_labels = y_val

In [40]:
pd.crosstab(val_labels, val_preds, rownames=['Actual'], colnames=['Predicted'])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,46,11
1,10,69


In [None]:
print('classification report')
print(classification_report(val_preds, val_labels))
print('cohen kappa')
print(cohen_kappa_score(val_preds, val_labels))

classification report
              precision    recall  f1-score   support

           0       0.81      0.82      0.81        56
           1       0.87      0.86      0.87        80

    accuracy                           0.85       136
   macro avg       0.84      0.84      0.84       136
weighted avg       0.85      0.85      0.85       136

cohen kappa
0.6821015138023152


# Test Results

In [None]:
# read the reliability df
reliability_df = pd.read_csv('data/reliability_topics_elif.csv',
                             sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)

reliability_df['article_id'] = reliability_df['article_id'].astype(int)

reliability_df.head()

# merge reliability_df with articles_df
reliability_df = pd.merge(reliability_df, articles_df, on='article_id', how = 'left')
print(reliability_df.shape)

# make an input text, combining Title, Text, Category and Keywords and before Category add string 'Categories: ' and before Keywords add string 'Keywords: ' if Category and Keywords are empty skip them
reliability_df['Category'] = reliability_df['Category'].fillna('')
reliability_df['Keywords'] = reliability_df['Keywords'].fillna('')
reliability_df['input_text'] = reliability_df['Text'] + '\n' + 'Categories: ' + reliability_df['Category'] + ' ' + 'Keywords: ' + reliability_df['Keywords']
reliability_df['input_text_lower'] = reliability_df['input_text'].apply(text_lower)

# make a test_df for all coders separately
test_df_elif = reliability_df[reliability_df['coder'] == 'Elif Kilik']

In [44]:
# get the predictions for the test set
test_preds_elif = best_model.predict(test_df_elif['input_text_lower'])

pd.crosstab(test_df_elif['about_covid'], test_preds_elif)

col_0,0,1
about_covid,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,35,7
1.0,12,66


In [45]:
# classification report 
print('classification report')
print(classification_report(test_preds_elif, test_df_elif['about_covid']))

classification report
              precision    recall  f1-score   support

           0       0.83      0.74      0.79        47
           1       0.85      0.90      0.87        73

    accuracy                           0.84       120
   macro avg       0.84      0.82      0.83       120
weighted avg       0.84      0.84      0.84       120



In [None]:
# save the best model for svm
joblib.dump(best_model, 'model/svm_about_covid.pkl')

['model/svm_about_covid.pkl']