In [None]:
import json
import pandas as pd
import numpy as np
import os
import csv
import nltk
import spacy

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [None]:
df = pd.read_csv('path_to_actors_training_data_created_in_step3', sep=';', quoting=csv.QUOTE_NONNUMERIC, encoding = 'utf-8') # csv
# change article_id to integer
df['article_id'] = df['article_id'].astype(int)
# select only quoted actors
df = df[df['quoted'] == 1]

In [None]:
# read the annotations df with researcher codings
reliability_df = pd.read_csv('reliability_actors_final_cleaned_elif.csv',
                             sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)

reliability_df['article_id'] = reliability_df['article_id'].astype(int)
reliability_df = reliability_df[reliability_df['coder'] == 'Elif Kilik']
train_df = df[~df['article_id'].isin(reliability_df['article_id'])]
test_df = df[df['article_id'].isin(reliability_df['article_id'])]

In [4]:
# limit df only to article_ids that are not in the test_df
train_df = df[~df['article_id'].isin(reliability_df['article_id'])]
print(train_df.shape)

(1229, 13)


In [5]:
test_df = df[df['article_id'].isin(reliability_df['article_id'])]
print(test_df.shape)

(240, 13)


In [6]:
train_df.talks_covid_measures.value_counts()

talks_covid_measures
0.0    752
1.0    477
Name: count, dtype: int64

In [8]:
# keep only if talks_covid_measures is 1
train_df = train_df[train_df['talks_covid_measures'] == 1]
print(train_df.shape)

(477, 13)


In [9]:
train_df.input_text.values[0]

'Bruins:\n Minister Bruins voor Medische Zorg vindt het verschrikkelijk dat mensen met een Aziatisch uiterlijk worden gediscrimineerd vanwege het coronavirus. \nBruins reageerde op vragen van onder anderen GroenLinks-Kamerlid Ellemeet. \nBruins zei hierop dat hij dit de komende dagen nog verschillende keren wil doen, ook buiten de Tweede Kamer. \nBruins is niet van plan om evenementen en attracties te sluiten die veel Chinese toeristen trekken, zoals de Keukenhof.'

# Text preprocessing

In [None]:
# create a text preprocessing function where you lowercase the text and then lemmitize the text
def text_lower(text):
    text = text.lower()
    return text

train_df['input_text_lower'] = train_df['input_text'].apply(text_lower)

In [None]:
print(train_df['input_text'].values[0])
print(train_df['input_text_lower'].values[0])

Bruins:
 Minister Bruins voor Medische Zorg vindt het verschrikkelijk dat mensen met een Aziatisch uiterlijk worden gediscrimineerd vanwege het coronavirus. 
Bruins reageerde op vragen van onder anderen GroenLinks-Kamerlid Ellemeet. 
Bruins zei hierop dat hij dit de komende dagen nog verschillende keren wil doen, ook buiten de Tweede Kamer. 
Bruins is niet van plan om evenementen en attracties te sluiten die veel Chinese toeristen trekken, zoals de Keukenhof.
bruins:
 minister bruins voor medische zorg vindt het verschrikkelijk dat mensen met een aziatisch uiterlijk worden gediscrimineerd vanwege het coronavirus. 
bruins reageerde op vragen van onder anderen groenlinks-kamerlid ellemeet. 
bruins zei hierop dat hij dit de komende dagen nog verschillende keren wil doen, ook buiten de tweede kamer. 
bruins is niet van plan om evenementen en attracties te sluiten die veel chinese toeristen trekken, zoals de keukenhof.


In [13]:
train_df.measures_positive.value_counts()

measures_positive
0.0    309
1.0    168
Name: count, dtype: int64

# Tf-IDF + SVM

In [14]:
# Select your features and target variable
X = train_df[['input_text_lower']]  # This should remain a DataFrame
y = train_df['measures_positive'].values.flatten()  # Convert to 1D array

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42, stratify=y)

print(f"X_train shape: {X_train.shape}")  # Should be (n_samples_train, n_features)
print(f"y_train shape: {y_train.shape}")  # Should be (n_samples_train,)


X_train shape: (381, 1)
y_train shape: (381,)


In [15]:
# shape of the validation set
print(f"X_val shape: {X_val.shape}")  # Should be (n_samples_val, n_features)
print(f"y_val shape: {y_val.shape}")  # Should be (n_samples_val,)

X_val shape: (96, 1)
y_val shape: (96,)


In [16]:
# get the nr of 1 labels in y_train and y_val
print(f"Nr of 1 labels in y_train: {sum(y_train)}")
print(f"Nr of 1 labels in y_val: {sum(y_val)}")

Nr of 1 labels in y_train: 134.0
Nr of 1 labels in y_val: 34.0


In [17]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('dutch')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\elifk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Define the parameter grid
param_grid_svc = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__max_features': [100, 2000, 1000, 5000, 10000],
    'clf__C': [0.1, 1, 10, 50, 100],
    'clf__max_iter': [100, 500, 1000]
}

# Define the SVC pipeline
pipeline_svc = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 1), analyzer='word', stop_words=stopwords)),
    ('clf', LinearSVC(random_state=0))
])

# Perform grid search for SVC
grid_search_svc = GridSearchCV(pipeline_svc, param_grid_svc, cv=5, scoring='f1_macro')
grid_search_svc.fit(X_train['input_text_lower'], y_train)  # Use the column name directly

print(f"Best parameters for SVC: {grid_search_svc.best_params_}")
print(f"Best score for SVC: {grid_search_svc.best_score_}")

In [19]:
# Get the best parameters and the best model
best_params = grid_search_svc.best_params_
best_model = grid_search_svc.best_estimator_

print("Best parameters found for SVC: ", best_params)
print("Best model found for SVC: ", best_model)

Best parameters found for SVC:  {'clf__C': 1, 'clf__max_iter': 100, 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 2)}
Best model found for SVC:  Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=5000, ngram_range=(1, 2),
                                 stop_words=['de', 'en', 'van', 'ik', 'te',
                                             'dat', 'die', 'in', 'een', 'hij',
                                             'het', 'niet', 'zijn', 'is', 'was',
                                             'op', 'aan', 'met', 'als', 'voor',
                                             'had', 'er', 'maar', 'om', 'hem',
                                             'dan', 'zou', 'of', 'wat', 'mijn', ...])),
                ('clf', LinearSVC(C=1, max_iter=100, random_state=0))])


In [20]:
# get the predictions for the validation set
val_preds = best_model.predict(X_val['input_text_lower'])

In [21]:
val_labels = y_val

In [22]:
pd.crosstab(val_labels, val_preds, rownames=['Actual'], colnames=['Predicted'])

Predicted,0.0,1.0
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,54,8
1.0,18,16


In [None]:
print('classification report')
print(classification_report(val_preds, val_labels))

classification report
              precision    recall  f1-score   support

         0.0       0.87      0.75      0.81        72
         1.0       0.47      0.67      0.55        24

    accuracy                           0.73        96
   macro avg       0.67      0.71      0.68        96
weighted avg       0.77      0.73      0.74        96



# Test Results

In [None]:
test_df = pd.read_csv('path_to_test_data_researcher_codings',
                 sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC) #csv file

# change article_id to integer
test_df['article_id'] = test_df['article_id'].astype(int)
# drop if colnames has unnamed 
test_df = test_df.loc[:, ~test_df.columns.str.contains('^Unnamed')]
# keep only if directly_quoted or indirectly_quoted is 1
test_df = test_df[(test_df['directly_quoted'] == 1) | (test_df['indirectly_quoted'] == 1)]
# keep only if talks_covid_measures is 1
test_df = test_df[test_df['talks_covid_corrected'] == 1]

In [None]:
# get article_id, actor_name, talks_covid_measures, relevant_sentences_string
test_df = test_df[['article_id', 'actor_name', 'talks_covid_measures', 'input_text_corrected',
       'talks_covid_corrected', 'measures_positive_corrected',
       'measures_negative_corrected', 'measures_neutral_corrected']]

print(test_df.shape)

(114, 8)


In [26]:
# make column input_text
test_df['input_text_lower'] = test_df['input_text_corrected'].apply(text_lower)


In [29]:
test_preds_elif = best_model.predict(test_df['input_text_lower'])

test_labels_elif = test_df['measures_positive_corrected'].values.flatten()

In [30]:
# crosstab
pd.crosstab(test_labels_elif, test_preds_elif, rownames=['Actual'], colnames=['Predicted'])

Predicted,0.0,1.0
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,64,10
1.0,24,16


In [None]:
print('classification report')
print(classification_report(test_preds_elif, test_labels_elif))

classification report
              precision    recall  f1-score   support

         0.0       0.86      0.73      0.79        88
         1.0       0.40      0.62      0.48        26

    accuracy                           0.70       114
   macro avg       0.63      0.67      0.64       114
weighted avg       0.76      0.70      0.72       114



In [None]:
# put the predictions in the df
test_df['covid_measures_positive_pred_SVM'] = test_preds_elif

test_df.to_csv('path_to_save_test_df_with_predictions/actor_positive_stance_preds_SVM.csv',
               sep = ';', encoding = 'utf-8', index = False, quoting=csv.QUOTE_NONNUMERIC)