In [None]:
import pandas as pd
import numpy as np
import os
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, cohen_kappa_score

import random
from transformers import set_seed
import nltk
import spacy
# !python -m spacy download nl_core_news_sm

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# go one level up in the directory
os.chdir("/data/500gbstorage/actor_classification")


huggingface_cache_dir = 'model'

# change huggingface cache
os.environ['TRANSFORMERS_CACHE'] = huggingface_cache_dir

In [4]:
set_seed(42)
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)

In [5]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('dutch')

[nltk_data] Downloading package stopwords to /home/eklk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load Data

In [None]:
df = pd.read_csv('data/coded_df_topics_full.csv',
                 sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)
print(df.shape)

df = df[df['about_covid'] == 1]
print(df.shape)

# change article_id to integer
df['article_id'] = df['article_id'].astype(int)

topic_vars = ['about_covid',  'topic_a', 'topic_b', 'topic_c', 'topic_d', 'topic_e', 'topic_f', 'topic_g', 'topic_h', 
              'topic_i', 'topic_j', 'topic_k', 'topic_l', 'topic_m', 'topic_n']

# change all topic vars to int
for i in topic_vars:
    df[i] = df[i].astype(int)

In [None]:
# articles df
articles_df = pd.read_csv('data/final_nosarticles.csv',
                          sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)
# get article text, category, keywords and page_id
articles_df = articles_df[['page_id', 'Title', 'Text', 'Category', 'Keywords']].drop_duplicates()
# make page id integer
articles_df['page_id'] = articles_df['page_id'].astype(int)
# change page_id to article_id
articles_df.rename(columns = {'page_id': 'article_id'}, inplace = True)
# change LINE BREAK to \n
articles_df['Text'] = articles_df['Text'].str.replace('[LINE_BREAK]', '\n')

In [None]:
# make an input text, combining Title, Text, Category and Keywords and before Category add string 'Categories: ' and before Keywords add string 'Keywords: ' if Category and Keywords are empty skip them
articles_df['Category'] = articles_df['Category'].fillna('')
articles_df['Keywords'] = articles_df['Keywords'].fillna('')
articles_df['input_text'] = articles_df['Title'] + '\n' + articles_df['Text'] + '\n' + 'Categories: ' + articles_df['Category'] + ' ' + 'Keywords: ' + articles_df['Keywords']

In [None]:
# merge articles_df with df
df = pd.merge(df, articles_df, on='article_id', how = 'left')

# Text preprocessing

In [11]:
print(df['input_text'].values[0])

# create a text preprocessing function where you lowercase the text and then lemmitize the text
def text_lower(text):
    text = text.lower()
    return text

df['input_text_lower'] = df['input_text'].apply(text_lower)
print(df['input_text_lower'].values[0])

Minister roept iedereen op: niet discrimineren om coronavirus 
Minister Bruins voor Medische Zorg vindt het verschrikkelijk dat mensen met een Aziatisch uiterlijk worden gediscrimineerd vanwege het coronavirus. Hij deed in de Tweede Kamer een oproep aan iedereen om hiertegen op te staan.
Bruins reageerde op vragen van onder anderen GroenLinks-Kamerlid Ellemeet. Zij zei dat mensen met Aziatisch uiterlijk op grote schaal worden gediscrimineerd. Ze hoorde bijvoorbeeld van een meisje dat mensen in de tram hun trui over hun mond trokken toen ze haar zagen.
Ellemeet vroeg de minister of hij zich hierover duidelijk wil uitspreken. Bruins zei hierop dat hij dit de komende dagen nog verschillende keren wil doen, ook buiten de Tweede Kamer.
De minister zei dat hij het er zeer mee eens is dat dit niet bij een fatsoenlijke samenleving hoort. "Mensen discrimineren gaat niet aan. We moeten ervoor zorgen dat het niet optreedt. Daar hebben wij allemaal een rol in."
Bruins is niet van plan om evenement

In [None]:
train_df = df[df['reliability_article'] == 0]
test_df = df[df['reliability_article'] == 1]

In [20]:
# Select your features and target variable
X = train_df[['input_text_lower']]  # This should remain a DataFrame
y = train_df['topic_e'].values.flatten()  # Convert to 1D array

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42, stratify=y)

print(f"X_train shape: {X_train.shape}")  # Should be (n_samples_train, n_features)
print(f"y_train shape: {y_train.shape}")  # Should be (n_samples_train,)


X_train shape: (328, 1)
y_train shape: (328,)


In [21]:
# see the nr of 1's in y_train
print(np.sum(y_train == 1))
print(np.sum(y_val == 1))

21
5


# TF-IDF + SVM

## Gridsearch

In [None]:
# Define the parameter grid
param_grid_svc = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__max_features': [100, 2000, 1000, 5000, 10000],
    'clf__C': [0.1, 1, 10, 50, 100],
    'clf__max_iter': [100, 500, 1000]
}

# Define the SVC pipeline
pipeline_svc = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 1), analyzer='word', stop_words=stopwords)),
    ('clf', LinearSVC(random_state=0))
])

# Perform grid search for SVC
grid_search_svc = GridSearchCV(pipeline_svc, param_grid_svc, cv=10, scoring='f1_macro')
grid_search_svc.fit(X_train['input_text_lower'], y_train)  # Use the column name directly

print(f"Best parameters for SVC: {grid_search_svc.best_params_}")
print(f"Best score for SVC: {grid_search_svc.best_score_}")





Best parameters for SVC: {'clf__C': 10, 'clf__max_iter': 100, 'tfidf__max_features': 1000, 'tfidf__ngram_range': (1, 1)}
Best score for SVC: 0.965079365079365




In [23]:
# Get the best parameters and the best model
best_params = grid_search_svc.best_params_
best_model = grid_search_svc.best_estimator_

print("Best parameters found for SVC: ", best_params)
print("Best model found for SVC: ", best_model)

Best parameters found for SVC:  {'clf__C': 10, 'clf__max_iter': 100, 'tfidf__max_features': 1000, 'tfidf__ngram_range': (1, 1)}
Best model found for SVC:  Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=1000,
                                 stop_words=['de', 'en', 'van', 'ik', 'te',
                                             'dat', 'die', 'in', 'een', 'hij',
                                             'het', 'niet', 'zijn', 'is', 'was',
                                             'op', 'aan', 'met', 'als', 'voor',
                                             'had', 'er', 'maar', 'om', 'hem',
                                             'dan', 'zou', 'of', 'wat', 'mijn', ...])),
                ('clf', LinearSVC(C=10, max_iter=100, random_state=0))])


In [24]:
# get the predictions for the validation set
val_preds = best_model.predict(X_val['input_text_lower'])

In [25]:
val_labels = y_val

In [None]:
print('classification report')
print(classification_report(val_preds, val_labels))
print('cohen kappa')
print(cohen_kappa_score(val_preds, val_labels))

classification report
              precision    recall  f1-score   support

           0       1.00      0.99      0.99        78
           1       0.80      1.00      0.89         4

    accuracy                           0.99        82
   macro avg       0.90      0.99      0.94        82
weighted avg       0.99      0.99      0.99        82

cohen kappa
0.8825214899713467


# Test Results

In [None]:
# read the reliability df
reliability_df = pd.read_csv('data/reliability_topics_elif.csv',
                             sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)

reliability_df['article_id'] = reliability_df['article_id'].astype(int)

reliability_df.head()

# merge reliability_df with articles_df
reliability_df = pd.merge(reliability_df, articles_df, on='article_id', how = 'left')
print(reliability_df.shape)

# make an input text, combining Title, Text, Category and Keywords and before Category add string 'Categories: ' and before Keywords add string 'Keywords: ' if Category and Keywords are empty skip them
reliability_df['Category'] = reliability_df['Category'].fillna('')
reliability_df['Keywords'] = reliability_df['Keywords'].fillna('')
reliability_df['input_text'] = reliability_df['Text'] + '\n' + 'Categories: ' + reliability_df['Category'] + ' ' + 'Keywords: ' + reliability_df['Keywords']
reliability_df['input_text_lower'] = reliability_df['input_text'].apply(text_lower)
# make a test_df for all coders separately
test_df_elif = reliability_df[reliability_df['coder'] == 'Elif Kilik']

In [None]:
test_preds_elif = best_model.predict(test_df_elif['input_text_lower'])

In [31]:
# classification report 
print('classification report')
print(classification_report(test_preds_elif, test_df_elif['topic_e']))

classification report
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       114
           1       0.75      1.00      0.86         6

    accuracy                           0.98       120
   macro avg       0.88      0.99      0.92       120
weighted avg       0.99      0.98      0.98       120



In [32]:
# save the best model for svm
import joblib
joblib.dump(best_model, 'model/svm_topic_e.pkl')

['model/svm_topic_e.pkl']