In [2]:
import requests
import os
from dotenv import load_dotenv
load_dotenv("sjoerdAzure.env")  # Load environment variables from .env file
import time

import typing

from sklearn.metrics import cohen_kappa_score, classification_report
import krippendorff
import yaml

import pandas as pd

import config
import src
import tqdm
import json
import numpy as np
import logging

#import cltrier_lib as lib
import pyreadstat
import yaml
pd.set_option('display.max_colwidth', 100) 

In [3]:
# Set up helper variables and functions.
# CFG is the configuration object from the imported `config` module; its
# `report_dir` attribute is used further down as the directory for saved output.
CFG = config.Config()

def load_json(path: str):
    """Read the UTF-8 encoded JSON file at ``path`` and return its parsed content."""
    with open(path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed
    
# Set option variables.

# Low-temperature options (temperature 0.1, fixed seed), written as YAML text:
options_low_str = """
seed: 42
temperature: 0.1
"""

# Parsed into a plain dict: {'seed': 42, 'temperature': 0.1}
options_low = yaml.safe_load(options_low_str)

# Model identifiers. Tags listed by the original author as selectable options:
#   'gemma:7b-instruct-q6_K', 'gemma2:27b-instruct-q6_K',
#   'llama3.1:8b-instruct-q6_K', 'llama3.1:70b-instruct-q6_K',
#   'mistral:7b-instruct-v0.3-q6_K', 'mistral-large:123b-instruct-2407-q6_K',
#   'mixtral:8x7b-instruct-v0.1-q6_K', 'mixtral:8x22b-instruct-v0.1-q6_K',
#   'phi3:14b-medium-128k-instruct-q6_K' or 'qwen2:72b-instruct-q6_K'
# NOTE(review): this list does not contain the 'llama3.1:8b' / 'llama3.3:70b'
# tags assigned below, so it may be out of date - confirm against the provider.
MODELsmall: str = 'llama3.1:8b-instruct-q6_K'
MODELlarge: str = 'llama3.1:70b-instruct-q6_K'
# Apparently the 3.1 70b model is no longer available via Trier...
MODELsmall_v2: str = 'llama3.1:8b'
MODEL33large: str = 'llama3.3:70b'
# In principle every model is now available in an "nf" (no filter) and a regular
# version; the no-filter versions are only available to researchers, for analysing
# content that would not get through the content filter.
MODELgpt4o: str = "nf-gpt-4o-2024-08-06"
MODELgpt4T: str = "nf-gpt-4-turbo" # Can be gpt-35-turbo, gpt-4-turbo, gpt-4 or Meta-Llama-3-8B-Instruct.

# Zero-temperature options with a fixed seed, written as YAML text:
options_zero_str = """
seed: 42
temperature: 0
"""
# Parsed into a plain dict: {'seed': 42, 'temperature': 0}
options_zero = yaml.safe_load(options_zero_str)

# Scalar request parameters.
# NOTE(review): presumably temperature, seed, a max-token limit and top_p for the
# chat-completions requests - confirm against the cells that build the payload.
temperature_0 : int = 0
SEED: int = 42
MAX10: int = 10
TOPP1: int = 1


# Zero-temperature options that additionally set `num_predict` to 2000.
# NOTE(review): `num_predict` presumably limits the number of generated tokens - confirm.
options_large_str = """
seed: 42
temperature: 0
num_predict: 2000
"""
# Parsed into a plain dict: {'seed': 42, 'temperature': 0, 'num_predict': 2000}
options_large = yaml.safe_load(options_large_str)

# Load environment variables:
# the API key is read from the process environment under the name 'sjoerd_key'.
api_key = os.environ.get('sjoerd_key')
if not api_key:
    # Fail fast with an actionable message. Without this check a missing key
    # surfaces only as `TypeError: can only concatenate str (not "NoneType") to str`
    # when the Authorization header is built below.
    raise RuntimeError(
        "Environment variable 'sjoerd_key' is not set or empty; "
        "make sure the .env file was loaded before running this cell."
    )

# Settings: endpoints for chat completions and for embeddings.
api_endpoint = "https://ai-research-proxy.azurewebsites.net/chat/completions"
api_endpoint_embed = "https://ai-research-proxy.azurewebsites.net/embeddings"

####### API REQUEST FORMATTING ######
# Headers for the JSON requests; the key is sent as a bearer token.
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}",
}

In [17]:
# Load data:
# Path to the SPSS (.sav) file; being relative, it is resolved against the
# current working directory.
file_path = "data/Naab2025/Naab, Gegenseitige Sanktionierung, DFG1_komplett.sav"

# Read the SPSS file: `naab` receives the data and `meta` the accompanying
# metadata object (its `column_labels` are used below to rename the columns).
naab, meta = pyreadstat.read_sav(file_path)
# Bare last expression so the notebook renders the loaded data.
naab

Unnamed: 0,v41,v42,v42a,v43,v46,v59,v47,v48,v49,v410,...,v1112_rec,v1113_rec,v1114_rec,v1115_rec,v1116_rec,v1117_rec,v1119_rec,v1118_allcomments_rec,v111_sum2,v111_allcomments2
0,10155561544357554_10155561654232554,97515118114_10155561544357554,9.751512e+10,,Philipp Kelm,,oha,0.0,0.0,0.0,...,,,,,,,,,,
1,10155561544357554_10155561658617554,97515118114_10155561544357554,9.751512e+10,,WELT,,Hier lest ihr mehr darüber: https://www.welt.de/vermischtes/article180885130/Schweiz-Schwarze-Sc...,1.0,0.0,0.0,...,,,,,,,,,,
2,10155561544357554_10155561662552554,97515118114_10155561544357554,9.751512e+10,,Eveline Velikonja,,Und die Frau steht da so seelenruhig 😥,22.0,0.0,2.0,...,,,,,,,,,,
3,10155561544357554_10155561667807554,97515118114_10155561544357554,9.751512e+10,,Heidi Guder,,Tim Warst Du das?!,0.0,0.0,0.0,...,,,,,,,,,,
4,10155561544357554_10155561670122554,97515118114_10155561544357554,9.751512e+10,10155561544357554_10155561662552554,WELT,,Willkommen in der Schweiz ¯\_(ツ)_/¯,13.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7533,97515118114_10157114488478115_6,97515118114_10157114488478115,9.751512e+10,,Carlo R.,,Wie wäre es mal mit Haft und anschließender Abschiebung? So wird das nichts mit den Wahlen in Ba...,2.0,,,...,,,,,,,,,,
7534,97515118114_10157114488478115_60,97515118114_10157114488478115,9.751512e+10,97515118114_10157114488478115_54,Hubert S.,,Dass solche Menschen keine Berechtigung haben in Deutschland zu leben ist bereits duch den Ausre...,0.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7535,97515118114_10157114488478115_7,97515118114_10157114488478115,9.751512e+10,,Ralf R.,,"Wir, das Volk, die Bürger, wollten es doch so. Mit überwältigender Mehrheit. Wozu also aufregen?",0.0,,,...,,,,,,,,,,
7536,97515118114_10157114488478115_8,97515118114_10157114488478115,9.751512e+10,,Franz,,"Ich möchte auf jeden Fall so lange Leben, um mitzuerleben, wie die Verantwortlichen, allen voran...",19.0,,,...,,,,,,,,,,


In [None]:
# Swap each raw variable name for its label from `meta`. A column whose label is
# empty or None keeps its original name (per the original note, the file also
# contains recoded columns and the like).
columnset = [
    meta.column_labels[position] or raw_name
    for position, raw_name in enumerate(naab.columns)
]
print(columnset)
naab.columns = columnset

['comment_id', 'facebook_post_id', 'medium', 'parent', 'user_name', 'subject', 'message', 'like', 'love', 'haha', 'wow', 'sad', 'angry', 'dislike', 'recommendation', 'favourite', 'reactions_retrieval_time', 'relevance', 'retrieval_time', 'plattform', 'sample2', 'Kodiererin_ID', 'Funktion AutorIn', 'Funktion AutorIn Sonstiges', 'Kommentartyp', 'Gestaltung', 'Verständlichkeit', 'Themenabweichung', 'Vereinfachung', 'Ausgewogenheit', 'Argumente', 'Quellenbelege', 'Zusatzwissen', 'Erfahrungsorientierung', 'Kontroverse', 'Lösungsvorschlag', 'Inzivilität', 'Emotionalität', 'Fragen', 'erleichternde Humorisierung', 'spaltende Humorisierung', 'Ironie', 'Anmutung', 'Anzahl BzgN', 'lfdNr BzgN', 'Bezugsobjekt', 'Bezug_Typa', 'Bezug_Typa_ID', 'echter oder anderer Username', 'Bezug Typ b oder c', 'Username_@', 'Form_Ansprache', 'Zitat', 'Bewertung Positionsäußerung', 'Bewertung Argumente', 'Bewertung Erfahrungsorientierung', 'Bewertung Belege', 'Bewertung Verständlichkeit', 'Bewertung Themenabweichun

In [34]:
# Sanity check: show the column names as a plain Python list
naab.columns.tolist()

['comment_id',
 'facebook_post_id',
 'medium',
 'parent',
 'user_name',
 'subject',
 'message',
 'like',
 'love',
 'haha',
 'wow',
 'sad',
 'angry',
 'dislike',
 'recommendation',
 'favourite',
 'reactions_retrieval_time',
 'relevance',
 'retrieval_time',
 'plattform',
 'sample2',
 'Kodiererin_ID',
 'Funktion AutorIn',
 'Funktion AutorIn Sonstiges',
 'Kommentartyp',
 'Gestaltung',
 'Verständlichkeit',
 'Themenabweichung',
 'Vereinfachung',
 'Ausgewogenheit',
 'Argumente',
 'Quellenbelege',
 'Zusatzwissen',
 'Erfahrungsorientierung',
 'Kontroverse',
 'Lösungsvorschlag',
 'Inzivilität',
 'Emotionalität',
 'Fragen',
 'erleichternde Humorisierung',
 'spaltende Humorisierung',
 'Ironie',
 'Anmutung',
 'Anzahl BzgN',
 'lfdNr BzgN',
 'Bezugsobjekt',
 'Bezug_Typa',
 'Bezug_Typa_ID',
 'echter oder anderer Username',
 'Bezug Typ b oder c',
 'Username_@',
 'Form_Ansprache',
 'Zitat',
 'Bewertung Positionsäußerung',
 'Bewertung Argumente',
 'Bewertung Erfahrungsorientierung',
 'Bewertung Belege',


In [28]:
# Preview the first five rows of the frame
naab.head(n=5)

Unnamed: 0,comment_id,facebook_post_id,medium,parent,user_name,subject,message,like,love,haha,...,v1112_rec,v1113_rec,v1114_rec,v1115_rec,v1116_rec,v1117_rec,v1119_rec,v1118_allcomments_rec,"Anzahl an verwendeten Konfliktstilen mit 0er Kodierung (neu berechnet, nicht originäre Kodierung)","Verwendung Konfliktstil ja/nein mit 0er Kodierung (errechnet, nicht originäre Kodierung)"
0,10155561544357554_10155561654232554,97515118114_10155561544357554,97515120000.0,,Philipp Kelm,,oha,0.0,0.0,0.0,...,,,,,,,,,,
1,10155561544357554_10155561658617554,97515118114_10155561544357554,97515120000.0,,WELT,,Hier lest ihr mehr darüber: https://www.welt.de/vermischtes/article180885130/Schweiz-Schwarze-Sc...,1.0,0.0,0.0,...,,,,,,,,,,
2,10155561544357554_10155561662552554,97515118114_10155561544357554,97515120000.0,,Eveline Velikonja,,Und die Frau steht da so seelenruhig 😥,22.0,0.0,2.0,...,,,,,,,,,,
3,10155561544357554_10155561667807554,97515118114_10155561544357554,97515120000.0,,Heidi Guder,,Tim Warst Du das?!,0.0,0.0,0.0,...,,,,,,,,,,
4,10155561544357554_10155561670122554,97515118114_10155561544357554,97515120000.0,10155561544357554_10155561662552554,WELT,,Willkommen in der Schweiz ¯\_(ツ)_/¯,13.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# Write the frame to a Parquet file inside the configured report directory
parquet_path = f'{CFG.report_dir}/Naab2025.parquet'
naab.to_parquet(parquet_path)