In [None]:
import pandas as pd
import numpy as np
import csv
import random
import spacy
import nltk

from transformers import set_seed
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, cohen_kappa_score

import os
# os.getcwd()

huggingface_cache_dir = 'model'

In [None]:
# read the model dataframe
df = pd.read_csv('path_to_model_df.csv', sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)

# change article_id to integer
df['article_id'] = df['article_id'].astype(int)

In [None]:
# read the reliability df
reliability_df = pd.read_csv('NOS/nos_analysis/actor_analysis/coref_resolution/reliability_actors_final_cleaned_elif.csv',
                             sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)

reliability_df['article_id'] = reliability_df['article_id'].astype(int)
test_df = reliability_df[reliability_df['coder'] == 'Elif Kilik']

In [None]:
# limit df only to article_ids that are not in the test_df
df = df[~df['article_id'].isin(reliability_df['article_id'])]

In [6]:
df.quoted_check.value_counts()

quoted_check
1.0    1406
0.0     870
Name: count, dtype: int64

In [7]:
# remove quoted, make quoted_check quoted
df.drop(columns = ['quoted'], inplace = True)
df.rename(columns = {'quoted_check': 'quoted'}, inplace = True)

In [9]:
# drop if relevant_sentences_string is null
df = df.dropna(subset = ['relevant_sentences_string'])
print(df.shape)

(2269, 10)


In [10]:
# make a new input text column where you combine entity_name and quoted text with \n
df['input_text'] = df['entity_name'] + '\n' + df['relevant_sentences_string']
df['input_text'].values[:5]

array(['LCR\nHet ministerie adviseert reizigers de adviezen van het Landelijk Coordinatiecentrum Reizigersadvisering (LCR) en de Wereldgezondheidsorganisatie (WHO) in de gaten te houden. \n"In het algemeen geldt het advies voor reizigers om markten met vis of levende dieren te vermijden en om algemene hygiënemaatregelen te nemen, ook vanwege het risico op de vogelgriep", zegt het LCR.',
       "McDonald's\nOok McDonald's sluit zijn vestigingen in vijf steden.",
       'Rijksinstituut voor Volksgezondheid en Milieu\nDe luchthaven volgt daarmee de lijn van het Rijksinstituut voor Volksgezondheid en Milieu (RIVM).',
       'CCTV\nDat is bekendgemaakt op de Chinese staatszender CCTV.',
       'Xi Jinping\nBekijk in de reportage van Sjoerd den Daas hoe de bewoners van Peking omgaan met de ziekte:\n President Xi Jinping hield gisteren spoedberaad met de leiding van de communistische partij.'],
      dtype=object)

In [11]:
# calculate the token size for input_text values
df['token_size'] = df['input_text'].apply(lambda x: len(x.split()))
df['token_size'].describe()

count    2269.000000
mean       59.903482
std        58.536991
min         5.000000
25%        24.000000
50%        40.000000
75%        73.000000
max       521.000000
Name: token_size, dtype: float64

# Text preprocessing

In [13]:
print(df['input_text'].values[0])

# create a text preprocessing function where you lowercase the text and then lemmitize the text
def text_lower(text):
    text = text.lower()
    return text

df['input_text_lower'] = df['input_text'].apply(text_lower)
print(df['input_text_lower'].values[0])

LCR
Het ministerie adviseert reizigers de adviezen van het Landelijk Coordinatiecentrum Reizigersadvisering (LCR) en de Wereldgezondheidsorganisatie (WHO) in de gaten te houden. 
"In het algemeen geldt het advies voor reizigers om markten met vis of levende dieren te vermijden en om algemene hygiënemaatregelen te nemen, ook vanwege het risico op de vogelgriep", zegt het LCR.
lcr
het ministerie adviseert reizigers de adviezen van het landelijk coordinatiecentrum reizigersadvisering (lcr) en de wereldgezondheidsorganisatie (who) in de gaten te houden. 
"in het algemeen geldt het advies voor reizigers om markten met vis of levende dieren te vermijden en om algemene hygiënemaatregelen te nemen, ook vanwege het risico op de vogelgriep", zegt het lcr.


# Tf-IDF + SVM

In [15]:
# Select your features and target variable
X = df[['input_text_lower']]  # This should remain a DataFrame
y = df['quoted'].values.flatten()  # Convert to 1D array

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, shuffle=True, random_state=42)

print(f"X_train shape: {X_train.shape}")  # Should be (n_samples_train, n_features)
print(f"y_train shape: {y_train.shape}")  # Should be (n_samples_train,)


X_train shape: (2042, 1)
y_train shape: (2042,)


In [19]:
# shape of the validation set
print(f"X_val shape: {X_val.shape}")  # Should be (n_samples_val, n_features)
print(f"y_val shape: {y_val.shape}")  # Should be (n_samples_val,)

X_val shape: (227, 1)
y_val shape: (227,)


In [20]:
# get the nr of 1 labels in y_train and y_val
print(f"y_train 1 labels: {np.sum(y_train)}")
print(f"y_val 1 labels: {np.sum(y_val)}")

y_train 1 labels: 1263.0
y_val 1 labels: 140.0


In [21]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('dutch')

[nltk_data] Downloading package stopwords to /home/eklk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Define the parameter grid
param_grid_svc = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__max_features': [100, 2000, 1000, 5000, 10000],
    'clf__C': [0.1, 1, 10, 50, 100],
    'clf__max_iter': [100, 500, 1000]
}

# Define the SVC pipeline
pipeline_svc = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 1), analyzer='word', stop_words=stopwords)),
    ('clf', LinearSVC(random_state=0))
])

# Perform grid search for SVC
grid_search_svc = GridSearchCV(pipeline_svc, param_grid_svc, cv=10, scoring='f1_macro')
grid_search_svc.fit(X_train['input_text_lower'], y_train)  # Use the column name directly

print(f"Best parameters for SVC: {grid_search_svc.best_params_}")
print(f"Best score for SVC: {grid_search_svc.best_score_}")




Best parameters for SVC: {'clf__C': 1, 'clf__max_iter': 100, 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1)}
Best score for SVC: 0.7657852556376924




In [23]:
# Get the best parameters and the best model
best_params = grid_search_svc.best_params_
best_model = grid_search_svc.best_estimator_

print("Best parameters found for SVC: ", best_params)
print("Best model found for SVC: ", best_model)

Best parameters found for SVC:  {'clf__C': 1, 'clf__max_iter': 100, 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1)}
Best model found for SVC:  Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=5000,
                                 stop_words=['de', 'en', 'van', 'ik', 'te',
                                             'dat', 'die', 'in', 'een', 'hij',
                                             'het', 'niet', 'zijn', 'is', 'was',
                                             'op', 'aan', 'met', 'als', 'voor',
                                             'had', 'er', 'maar', 'om', 'hem',
                                             'dan', 'zou', 'of', 'wat', 'mijn', ...])),
                ('clf', LinearSVC(C=1, max_iter=100, random_state=0))])


In [24]:
# get the predictions for the validation set
val_preds = best_model.predict(X_val['input_text_lower'])

In [25]:
val_labels = y_val

In [26]:
pd.crosstab(val_labels, val_preds, rownames=['Actual'], colnames=['Predicted'])

Predicted,0.0,1.0
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,52,35
1.0,22,118


In [None]:
print('classification report')
print(classification_report(val_preds, val_labels))

classification report
              precision    recall  f1-score   support

         0.0       0.60      0.70      0.65        74
         1.0       0.84      0.77      0.81       153

    accuracy                           0.75       227
   macro avg       0.72      0.74      0.73       227
weighted avg       0.76      0.75      0.75       227



# Test Results

In [None]:
df = pd.read_csv('path_to_model_df_checked.csv',
                 sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)

# change article_id to integer
df['article_id'] = df['article_id'].astype(int)
df_elif = df[df['article_id'].isin(test_df['article_id'])]
df_elif['input_text_lower'] = df_elif['input_text'].apply(text_lower)

In [31]:
test_preds_elif = best_model.predict(df_elif['input_text_lower'])

test_labels_elif = df_elif['quoted_check'].values.flatten()

In [None]:
print('classification report')
print(classification_report(test_preds_elif, test_labels_elif))

classification report
              precision    recall  f1-score   support

         0.0       0.65      0.84      0.73       176
         1.0       0.89      0.76      0.82       325

    accuracy                           0.78       501
   macro avg       0.77      0.80      0.78       501
weighted avg       0.81      0.78      0.79       501



In [None]:
df_elif['quoted_pred'] = test_preds_elif
# save the df_elif
df_elif.to_csv('path_to_SVM_quote_classifier.csv',
               sep = ';', encoding = 'utf-8', index = False, quoting=csv.QUOTE_NONNUMERIC)