In [None]:
import json
import pandas as pd
import numpy as np
import os
import csv
import nltk
import spacy

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [None]:
df = pd.read_csv('path_to_actors_training_data_created_in_step3', sep=';', quoting=csv.QUOTE_NONNUMERIC, encoding = 'utf-8') # csv
# change article_id to integer
df['article_id'] = df['article_id'].astype(int)
# select only quoted actors
df = df[df['quoted'] == 1]

In [None]:
# read the annotations df with researcher codings
reliability_df = pd.read_csv('reliability_actors_final_cleaned_elif.csv',
                             sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)

reliability_df['article_id'] = reliability_df['article_id'].astype(int)
reliability_df = reliability_df[reliability_df['coder'] == 'Elif Kilik']
train_df = df[~df['article_id'].isin(reliability_df['article_id'])]
test_df = df[df['article_id'].isin(reliability_df['article_id'])]

In [7]:
train_df.talks_covid_measures.value_counts()

talks_covid_measures
0.0    752
1.0    477
Name: count, dtype: int64

In [8]:
test_df.talks_covid_measures.value_counts()

talks_covid_measures
0.0    138
1.0    102
Name: count, dtype: int64

In [9]:
train_df.input_text.values[0]

'Buitenlandse Zaken:\n Het ministerie van Buitenlandse Zaken zegt dat de ambassade in Peking de situatie op de voet volgt.'

# Text preprocessing

In [None]:
# create a text preprocessing function where you lowercase the text and then lemmitize the text
def text_lower(text):
    text = text.lower()
    return text

train_df['input_text_lower'] = train_df['input_text'].apply(text_lower)

In [None]:
print(train_df['input_text'].values[0])
print(train_df['input_text_lower'].values[0])

Buitenlandse Zaken:
 Het ministerie van Buitenlandse Zaken zegt dat de ambassade in Peking de situatie op de voet volgt.
buitenlandse zaken:
 het ministerie van buitenlandse zaken zegt dat de ambassade in peking de situatie op de voet volgt.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['input_text_lower'] = train_df['input_text'].apply(text_lower)


# Tf-IDF + SVM

In [14]:
# Select your features and target variable
X = train_df[['input_text_lower']]  # This should remain a DataFrame
y = train_df['talks_covid_measures'].values.flatten()  # Convert to 1D array

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42, stratify=y)

print(f"X_train shape: {X_train.shape}")  # Should be (n_samples_train, n_features)
print(f"y_train shape: {y_train.shape}")  # Should be (n_samples_train,)


X_train shape: (983, 1)
y_train shape: (983,)


In [15]:
# shape of the validation set
print(f"X_val shape: {X_val.shape}")  # Should be (n_samples_val, n_features)
print(f"y_val shape: {y_val.shape}")  # Should be (n_samples_val,)

X_val shape: (246, 1)
y_val shape: (246,)


In [16]:
# get the nr of 1 labels in y_train and y_val
print(f"Nr of 1 labels in y_train: {sum(y_train)}")
print(f"Nr of 1 labels in y_val: {sum(y_val)}")

Nr of 1 labels in y_train: 382.0
Nr of 1 labels in y_val: 95.0


In [17]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('dutch')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\elifk\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.




Best parameters for SVC: {'clf__C': 1, 'clf__max_iter': 100, 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 3)}
Best score for SVC: 0.7418967892122451




In [19]:
# Get the best parameters and the best model
best_params = grid_search_svc.best_params_
best_model = grid_search_svc.best_estimator_

print("Best parameters found for SVC: ", best_params)
print("Best model found for SVC: ", best_model)

Best parameters found for SVC:  {'clf__C': 1, 'clf__max_iter': 100, 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 3)}
Best model found for SVC:  Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=10000, ngram_range=(1, 3),
                                 stop_words=['de', 'en', 'van', 'ik', 'te',
                                             'dat', 'die', 'in', 'een', 'hij',
                                             'het', 'niet', 'zijn', 'is', 'was',
                                             'op', 'aan', 'met', 'als', 'voor',
                                             'had', 'er', 'maar', 'om', 'hem',
                                             'dan', 'zou', 'of', 'wat', 'mijn', ...])),
                ('clf', LinearSVC(C=1, max_iter=100, random_state=0))])


In [20]:
# get the predictions for the validation set
val_preds = best_model.predict(X_val['input_text_lower'])

In [21]:
val_labels = y_val

In [22]:
pd.crosstab(val_labels, val_preds, rownames=['Actual'], colnames=['Predicted'])

Predicted,0.0,1.0
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,120,31
1.0,34,61


In [None]:
print('classification report')
print(classification_report(val_preds, val_labels))

classification report
              precision    recall  f1-score   support

         0.0       0.79      0.78      0.79       154
         1.0       0.64      0.66      0.65        92

    accuracy                           0.74       246
   macro avg       0.72      0.72      0.72       246
weighted avg       0.74      0.74      0.74       246



# Test Results

In [None]:
test_df = pd.read_csv('path_to_test_data_researcher_codings',
                 sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC) #csv file

# change article_id to integer
test_df['article_id'] = test_df['article_id'].astype(int)
# drop if colnames has unnamed 
test_df = test_df.loc[:, ~test_df.columns.str.contains('^Unnamed')]
# keep only if directly_quoted or indirectly_quoted is 1
test_df = test_df[(test_df['directly_quoted'] == 1) | (test_df['indirectly_quoted'] == 1)]

In [29]:
# get article_id, actor_name, talks_covid_measures, relevant_sentences_string
test_df = test_df[['article_id', 'actor_name', 'talks_covid_measures', 'input_text_corrected',
       'talks_covid_corrected', 'measures_positive_corrected',
       'measures_negative_corrected', 'measures_neutral_corrected']]

In [None]:
test_df.talks_covid_corrected.value_counts() # these are manually corrected labels

talks_covid_corrected
0.0    181
1.0    114
Name: count, dtype: int64

In [32]:
# make column input_text
test_df['input_text_lower'] = test_df['input_text_corrected'].apply(text_lower)


In [36]:
test_preds_elif = best_model.predict(test_df['input_text_lower'])

test_labels_elif = test_df['talks_covid_corrected'].values.flatten()

In [37]:
# crosstab
pd.crosstab(test_labels_elif, test_preds_elif, rownames=['Actual'], colnames=['Predicted'])

Predicted,0.0,1.0
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,142,39
1.0,50,64


In [None]:
print('classification report')
print(classification_report(test_preds_elif, test_labels_elif))

classification report
              precision    recall  f1-score   support

         0.0       0.78      0.74      0.76       192
         1.0       0.56      0.62      0.59       103

    accuracy                           0.70       295
   macro avg       0.67      0.68      0.68       295
weighted avg       0.71      0.70      0.70       295



In [None]:
# put the predictions in the test_df
test_df['talks_covid_pred_SVM'] = test_preds_elif
test_df.to_csv('path_to_save_test_df_with_predictions/actor_discusses_measures_preds_SVM.csv',
               sep = ';', encoding = 'utf-8', index = False, quoting=csv.QUOTE_NONNUMERIC)