In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import textwrap
from IPython.display import clear_output
import numpy as np
from ipywidgets import Button, HTML, Output
import os
import matplotlib.pyplot as plt
import numpy as np
import json 

In [2]:
# Relative path of the CSV that the loading cell reads into `df_topics`.
INPUT_PATH = "../../data/lda/df_30topics10passes.csv"
# Name of the person doing the rating; it becomes part of the ratings file name.
RATER = "Quirin"

In [3]:
# Load the CSV at INPUT_PATH. Later cells read its `topic_<id>` probability
# columns and its "translatedText" column.
df_topics = pd.read_csv(INPUT_PATH)

In [4]:
# --- Sampling configuration -------------------------------------------------
topic_id = 19                # topic whose probability column is inspected
prob_range = (0.25, 0.45)    # inclusive probability window for candidate speeches
n = 100                      # maximum number of speeches to draw
seed = 42                    # random_state for a reproducible draw

prob_col = f"topic_{topic_id}"
output_path = f"../../data/lda/manual_ratings_topic{topic_id}_{RATER}.json"

# Keep only rows whose topic probability lies inside the window (both ends included).
lower_prob, upper_prob = prob_range
candidates = df_topics[df_topics[prob_col].between(lower_prob, upper_prob)]

# Draw at most `n` candidates; the index is reset so positions run 0..len(sampled)-1.
sampled = candidates.sample(
    n=min(n, len(candidates)),
    random_state=seed,
).reset_index(drop=True)

# Collected answers, one per rated speech. Re-running this cell starts over with an empty list.
ratings = []

In [5]:
# Sanity check: print the text of the speech with the highest score for the
# selected topic (the first such row if several rows share the maximum).
print(
    "Sanity check: speech with max topic score\n\t",
    df_topics.loc[df_topics[prob_col] == df_topics[prob_col].max(), "translatedText"].iloc[0],
)

Sanity check: speech with max topic score
	 I voted in favour of this report. Europe has been paralysed with regard to the migrant crisis. While we keep discussing possible solutions to the grave problem we face, it hardly ever comes to common action. However, even when we do decide on joint measures, we do not follow through on them. The Dublin Agreement and the relocation system have failed utterly, as the Member States have not committed to apply them. We must either effectively enforce the current measures, or introduce new ones. Either way, it is not a problem that can be solved by any Member State alone. It is a dilemma that must be dealt with by the European Union as a whole. It cannot be fixed by closing national borders, but only by guarding our common ones; those defined by Schengen. What we need to do is form a common border guard and coast guard and put a collective reception and relocation system in place. Furthermore, we must prompt Turkey to crack down on the criminals s

In [6]:
def process_response(prompt="Rate (y, n, q):"):
    """Ask the user for a y/n/q answer and return it as an integer code.

    The question is repeated until a valid answer is given. Answers are
    compared after stripping surrounding whitespace and lower-casing, so
    "Y" or " n " are accepted as well.

    Args:
        prompt: Text shown for the first question. After an invalid answer
            the prompt switches to "Try again (y, n, q):".

    Returns:
        1 for "y", 0 for "n", and -1 for "q" (quit).
    """
    codes = {"y": 1, "n": 0, "q": -1}
    # Loop instead of recursing: a long run of invalid answers must not be able
    # to exhaust the interpreter's recursion limit.
    while True:
        answer = input(prompt).strip().lower()
        if answer in codes:
            return codes[answer]
        prompt = "Try again (y, n, q):"

def get_rating(sampled, index, prob_col):
    """Display one sampled speech and return the rater's answer for it.

    Prints a progress line, the speech's topic probability (four decimals)
    and the speech text wrapped at 90 characters, then delegates to
    ``process_response`` for the actual y/n/q question.

    Args:
        sampled: DataFrame holding the speeches; must provide the column
            named by ``prob_col`` and a "translatedText" column.
        index: Positional index (``iloc``) of the speech to show.
        prob_col: Name of the topic-probability column.

    Returns:
        Whatever ``process_response()`` returns for the user's answer.
    """
    speech = sampled.iloc[index]
    total = len(sampled)

    print(f"Progress: {index}/{total} speeches rated")
    print("\n")

    probability = speech[prob_col]
    print(f"Topic probability: {probability:.4f}")

    wrapped_text = textwrap.fill(speech["translatedText"], width=90)
    print(wrapped_text)

    return process_response()

In [7]:
# Interactive rating session: show one sampled speech at a time and append each
# y/n answer to `ratings`. Answering "q" stops the session; running this cell again
# continues at position len(ratings).
running = True
if len(ratings) == len(sampled):
    print("Ratings are done.")
else:
    print(f"Starting manual rating: {len(ratings)}/{len(sampled)} speeches to rate.")
    print(f"{'='*90}\n")

    # Route everything printed during the session into one output widget so it can
    # be cleared between speeches.
    out = Output()
    display(out)
    with out:
        while running:
            clear_output()

            current_index = len(ratings)
            response = get_rating(sampled, current_index, prob_col)
            if response == -1:
                # Quit: nothing is appended, so the session can be resumed later.
                clear_output()
                print("Terminated.")
                print("Continue at index", current_index)
                running = False
            else:
                ratings.append(response)

                # Finish only once every speech actually has a rating. Checking the
                # number of collected ratings (instead of the index just shown) keeps
                # a "q" on the last speech from being reported as a completed run and
                # from writing an incomplete rating column.
                if len(ratings) == len(sampled):
                    print(f"Rating complete! Rated {len(ratings)}/{len(sampled)} speeches.")
                    # Now add ratings to dataframe
                    sampled.loc[:len(ratings)-1, 'rating'] = ratings
                    print(f"\nFinal ratings: {ratings}")
                    running = False
print(ratings)

Starting manual rating: 0/100 speeches to rate.



Output()

[]


In [14]:
# Save this rater's ratings together with the sampled topic probabilities as JSON.
# Both lists follow the row order of `sampled`.
ratings = sampled['rating'].tolist()
probabilities = sampled[prob_col].tolist()

ratings_dict = {
    "ratings": ratings,
    "probabilities": probabilities
}
# Use a context manager so the file handle is closed (and its contents flushed)
# as soon as the dump has finished.
with open(output_path, "w") as f:
    json.dump(ratings_dict, f)
print(f"Saved ratings to {output_path}")

Saved ratings to ../../data/lda/manual_ratings_topic19_Quirin.json


## Merge ratings from two raters

In [None]:
# The two raters whose saved rating files are compared below.
RATER_1, RATER_2 = "Quirin", "Jakob"

# One JSON file per rater for the currently selected topic.
path_ratings1 = f"../../data/lda/manual_ratings_topic{topic_id}_{RATER_1}.json"
path_ratings2 = f"../../data/lda/manual_ratings_topic{topic_id}_{RATER_2}.json"

In [None]:
# Load both raters' saved results; each file holds a "ratings" and a
# "probabilities" list.
with open(path_ratings1, "r") as f:
    r1 = json.load(f)
with open(path_ratings2, "r") as f:
    r2 = json.load(f)

# np.array builds an array from the listed values. The np.ndarray constructor
# must not be used here: it interprets its first argument as a *shape*.
ratings1 = np.array(r1["ratings"])
ratings2 = np.array(r2["ratings"])

# Identical probability lists are used as evidence that both raters rated the
# same sample in the same order.
probabilities1 = r1["probabilities"]
probabilities2 = r2["probabilities"]
assert probabilities1 == probabilities2, "Probabilities do not match between raters!"

In [None]:

# Resolve disagreements between the two raters interactively and store the
# merged result.
# NOTE(review): relies on `sampled`, `ratings_dict` and `output_path` from the
# earlier cells, so the merged ratings are added to the first rater's file.

# Absolute difference per speech; a value of 1 marks a disagreement
# (ratings are expected to be 0/1 -- TODO confirm for externally produced files).
disagreements = np.abs(ratings1 - ratings2)
new_ratings = dict()

for sample_index, row in sampled[disagreements == 1].iterrows():
    print(sample_index, row[prob_col])
    print("Rater 1:", ratings1[sample_index])
    print("Rater 2:", ratings2[sample_index])

    print(textwrap.fill(row["translatedText"], width=90))
    print("="*100)

    response = process_response("Rate again. q to quit, y for keep, n for discard")
    if response == -1:
        # Quit early: disagreements not yet reviewed keep rater 1's value below.
        break
    new_ratings[sample_index] = response
    clear_output()

# Start from rater 1's ratings and overwrite every re-rated position.
ratings_merged = ratings1.copy()
for i, new_rating in new_ratings.items():
    ratings_merged[int(i)] = new_rating

# save ratings merged to json
# json cannot serialise numpy arrays. Convert to a plain list *before* the file is
# opened, so a serialisation error can no longer leave a truncated ratings file.
ratings_dict["ratings_merged"] = ratings_merged.tolist()
with open(output_path, 'w') as f:
    json.dump(ratings_dict, f)
print(f"Saved merged ratings to {output_path}")