In [170]:
from openai import OpenAI

# The secret is kept in a local file named 'API_KEY'; surrounding whitespace
# (such as a trailing newline) is stripped before use.
with open('API_KEY', 'r') as key_file:
    API_KEY = key_file.read().strip()

# Per the original note: when api_key is omitted, the client defaults to
# reading the key via os.environ.get("OPENAI_API_KEY").
client = OpenAI(api_key=API_KEY)


In [171]:
import pandas as pd
import numpy as np

# Load the two source tables; both are expected to carry a
# "sensitivity_score" column that is normalised further down.
df1, df2 = (
    pd.read_csv(csv_name)
    for csv_name in ('similar_sensitive_words.csv', 'sensitivity_analysis.csv')
)

In [172]:
def _min_max_scale(scores):
    """Rescale a numeric Series linearly so its minimum maps to 0 and its maximum to 1.

    A constant column (max == min) is returned as zeros instead of the NaN
    that a plain 0 / 0 division would produce; NaN inputs stay NaN.
    """
    shifted = scores - scores.min()
    span = shifted.max()
    return shifted / span if span > 0 else shifted


def _merge_input_words(words):
    """Combine the comma-separated input words of one group into a single string.

    Duplicates are removed and the result is sorted so repeated runs give the
    same text (iteration order of a set of strings is not stable between
    interpreter runs). Missing values are ignored instead of raising TypeError.
    """
    distinct = {part for entry in words.dropna() for part in str(entry).split(', ')}
    return ', '.join(sorted(distinct))


# Normalise the "sensitivity_score" column of both tables to the range 0..1
df1['sensitivity_score'] = _min_max_scale(df1['sensitivity_score'])
df2['sensitivity_score'] = _min_max_scale(df2['sensitivity_score'])

# Stack the two tables, which share the same structure
df = pd.concat([df1, df2])

# Collapse duplicate "similar_word" rows: merge their input words and
# average their scores so no column is lost
df = (
    df.groupby('similar_word')
    .agg({'input_word': _merge_input_words, 'sensitivity_score': 'mean'})
    .reset_index()
)

# Most sensitive words first
df = df.sort_values('sensitivity_score', ascending=False).reset_index(drop=True)

In [173]:
# Replace df with the table stored on disk.
# NOTE(review): no cell shown here writes 'joined_sensitive_words.csv' - confirm where it is produced.
df = pd.read_csv('joined_sensitive_words.csv')

# Number of terms sent with each request
n = 5

# Consecutive slices of at most n rows each
df_chunks = [df.iloc[start:start + n] for start in range(0, len(df), n)]

# Only the first four slices are kept to stay careful about token limits
df_chunks = df_chunks[:4]

# One request per slice: a list of (word label, word-cloud reference label) pairs
requests = [
    [
        ("word: " + similar, "word_cloud_reference: " + reference)
        for similar, reference in zip(chunk["similar_word"], chunk["input_word"])
    ]
    for chunk in df_chunks
]

print(requests[0])


[('word: discrimination', 'word_cloud_reference: racial'), ('word: profiling', 'word_cloud_reference: racial'), ('word: radicalism', 'word_cloud_reference: foreign'), ('word: extremism', 'word_cloud_reference: racial, antisemitism'), ('word: repression', 'word_cloud_reference: homeland')]


In [179]:
# Preview the first rows of the second chunk (index 1) of the chunked table
print(df_chunks[1].head())

      similar_word    input_word  sensitivity_score
5         virulent  antisemitism           0.863442
6   discriminatory        racial           0.854908
7       oppression       slavery           0.854908
8  totalitarianism      homeland           0.853485
9    ideologically        racial           0.852063


In [189]:
# Available model identifiers
gpt3 = "gpt-3.5-turbo"
gpt4 = "gpt-4-turbo-preview"
# Choose the model to use.
# NOTE(review): the variable name is misspelled; it is kept unchanged because
# other cells may refer to it under this name.
modeel = gpt4

# Language names as they are inserted into the English / German prompt text
english = "English"
german = "German"
englisch = "Englisch"
deutsch = "Deutsch"

# Choose the input language (one value per prompt language)
input_language = english
input_sprache = englisch

# Choose language of system prompt to determine the output language
sys_prompt_english = f"""Your role is to assess the sensitivity of a list of words provided in {input_language}. For each word, you will assign a sensitivity rating from 0 to 1, where 1 indicates high sensitivity. Additionally, you will get the word_cloud_reference word or words that contributed to our growing word cloud as help to find a potentially sensitive meaning. For each assessed word, you will provide a short analysis, encompassing a definition, a discussion on the sensitivity of the word, and propose options for translating it between English and German. Each paragraph should be concise, with a maximum of 350 tokens, ensuring insightful analysis while using language respectfully and accurately, highlighting cultural and contextual nuances.
        
    Incorporate the principles and goals of macht.sprache. to support users in translating more sensitively between German and English. This includes recognizing and addressing linguistic discrimination, promoting expressions that challenge such discrimination, and fostering awareness for the sensitive handling of political terms in translations. Emphasize the importance of continuity, collaboration, creativity, and accessibility in this process. Acknowledge that while macht.sprache. aims to guide users, it cannot assume responsibility for the sensitivity of translations by individuals, underscoring the importance of self-education.
    
    Consider the perspectives provided by macht.sprache. on recognizing power and privileges, increasing awareness for justice, and choosing words that minimize harm over those that cause it. All translation decisions are political, and this perspective should guide the sensitivity assessment and translation options provided. The collaborative and ongoing nature of macht.sprache., its foundation in diverse expert contributions, and its commitment to pragmatism, accessibility, and creativity in translations are integral to your analysis and recommendations.
    
    Outputs should be formatted as JSON, including the following keys for each word in the list: "word", "sensitivity_rating", "definition", and "translation_options". Each "translation_options" entry should contain a list of 4 "options", each with its "nuance"."""


# German counterpart of the prompt above. It must be an f-string as well,
# otherwise the literal text "{input_sprache}" is sent to the model instead of
# the chosen language name.
sys_prompt_german = f"""Ihre Aufgabe besteht darin, die Sensitivität einer Liste von Wörtern, die in {input_sprache} vorliegen, zu beurteilen. Sie sollen für jedes Wort eine Sensitivitätsbewertung zwischen 0 und 1 vergeben, wobei 1 die höchste Sensitivität darstellt. Zusätzlich erhalten Sie das ursprüngliche Eingabewort, das zu unserer sich erweiternden Wortwolke beiträgt, um möglicherweise sensible Bedeutungen identifizieren zu können. Für jedes beurteilte Wort sollen Sie eine knappe Analyse liefern, die eine Definition, ein Rating der Sensitivität des Wortes und Empfehlungen für Übersetzungen zwischen Englisch und Deutsch umfasst. Jeder Absatz sollte mit einem maximum von 350 token kurz und bündig sein. Ihre Analyse sollte einsichtsreich sein und die Sprache respektvoll und präzise nutzen, wobei kulturelle und kontextuelle Feinheiten berücksichtigt werden.

Beziehen Sie die Prinzipien und Ziele von macht.sprache. ein, um Nutzern zu helfen, zwischen Deutsch und Englisch sensibler zu übersetzen. Dies schließt die Erkennung und Bekämpfung von sprachlicher Diskriminierung ein, das Eintreten für Ausdrücke, die dieser Diskriminierung entgegenwirken, und das Schaffen von Bewusstsein für den sensiblen Umgang mit politischen Begriffen in Übersetzungen. Unterstreichen Sie die Wichtigkeit von Beständigkeit, Zusammenarbeit, Kreativität und Zugänglichkeit in diesem Prozess. Machen Sie deutlich, dass macht.sprache. zwar Orientierung bietet, jedoch nicht die Verantwortung für die Sensitivität individueller Übersetzungen übernehmen kann, was die Bedeutung von Eigenverantwortung hervorhebt.

Berücksichtigen Sie die Ansichten von macht.sprache. zur Anerkennung von Machtverhältnissen und Privilegien, zur Förderung eines Bewusstseins für Gerechtigkeit und zur Auswahl von Wörtern, die weniger Schaden anrichten, anstatt zu verletzen. Jede Entscheidung bei der Übersetzung ist politisch, und diese Perspektive sollte Ihre Bewertung der Sensitivität und die vorgeschlagenen Übersetzungsmöglichkeiten leiten. Die kooperative und fortlaufende Natur von macht.sprache., basierend auf dem Input verschiedener Experten und dem Engagement für Pragmatismus, Zugänglichkeit und Kreativität in der Übersetzung, sollte ein zentraler Bestandteil Ihrer Analyse und Empfehlungen sein.

Die Ergebnisse sollen auf deutsch geschrieben und als JSON formatiert werden, mit den folgenden keys für jedes Wort in der Liste: "word" (Wort), "sensitivity_rating" (Sensitivitätsbewertung), "definition" und "translation_options" (Übersetzungsoptionen). Jeder Eintrag bei "translation_options" soll eine Liste von vier "options" (Optionen) sein, und jede Übersetzungsoption soll weiter mit ihrer eigenen "nuance" (Feinheit) beschrieben werden."""

import json


def _parse_reply_records(reply_text):
    """Extract the JSON payload of a model reply and return it as a list of records.

    The reply may wrap the JSON in a Markdown code fence or in prose, and it
    may hold either an array of objects or one bare object. Everything before
    the first opening bracket/brace and after the last closing one is dropped;
    a bare object is wrapped in a one-element list.

    Raises:
        ValueError: if no bracketed payload is present or the payload is not
            valid JSON (json.JSONDecodeError is a subclass of ValueError).
    """
    openings = [pos for pos in (reply_text.find('['), reply_text.find('{')) if pos != -1]
    closing = max(reply_text.rfind(']'), reply_text.rfind('}'))
    if not openings or closing < min(openings):
        raise ValueError("no JSON array or object found in the model reply")
    payload = json.loads(reply_text[min(openings):closing + 1])
    return payload if isinstance(payload, list) else [payload]


def _append_records(path, records):
    """Extend the JSON array stored at `path` with `records`.

    A missing or empty file is treated as an empty array, so the first run
    creates the file. The file is rewritten as one valid JSON array.
    """
    try:
        with open(path, 'r') as handle:
            stored_text = handle.read()
    except FileNotFoundError:
        stored_text = ""
    stored = json.loads(stored_text) if stored_text.strip() else []
    stored.extend(records)
    with open(path, 'w') as handle:
        json.dump(stored, handle, indent=4)


# Running total of tokens used by the completions of this cell
tok = 0
# Flag still read by a later cell; this cell no longer needs it
append = False

# File that accumulates the parsed answers as a single JSON array
file_name = "output_english.json"

# Stays None when no request is sent, so the final print can be skipped safely
completion = None

# Loop over the request chunks and send one API call per chunk
for request in requests[0:3]:
    completion = client.chat.completions.create(
        # Use the model selected in the configuration instead of a fixed one
        model=modeel,
        # response_format={"type": "json_object"} would force a JSON object, but per the
        # original note it does not seem to work for a list of words as request
        messages=[
            {"role": "system", "content": sys_prompt_english},
            {"role": "user", "content": str(request)},
        ],
        # low temperature because we want consistent output
        temperature=0.1,
    )

    # Count the tokens first so replies that cannot be parsed are still accounted for
    tok = tok + completion.usage.total_tokens

    try:
        records = _parse_reply_records(completion.choices[0].message.content)
    except ValueError as error:
        print(f"Skipping a reply that could not be parsed as JSON: {error}")
        continue

    # Keep the extracted JSON text available under the name later cells use
    output = json.dumps(records, indent=4, ensure_ascii=False)
    _append_records(file_name, records)


# Add a line with the number of used tokens to the CSV log
with open('tokens_used.csv', 'a') as file:
    file.write(f"{tok}\n")


if completion is not None:
    print(completion.choices[0].message.content)

```json
[
    {
        "word": "neoliberal",
        "sensitivity_rating": 0.7,
        "definition": "Neoliberalism refers to a political-economic philosophy that emphasizes free-market capitalism, deregulation, and reduction in government spending.",
        "translation_options": [
            {
                "options": "neoliberal",
                "nuance": "Direct translation, captures the economic and political philosophy."
            },
            {
                "options": "Marktliberal",
                "nuance": "Emphasizes the market liberal aspect, suitable for economic contexts."
            },
            {
                "options": "wirtschaftsliberal",
                "nuance": "Focuses on the economic liberalism aspect, highlighting economic policies."
            },
            {
                "options": "Anhänger des Neoliberalismus",
                "nuance": "Refers to proponents of neoliberalism, useful for discussing groups or movements."
            }

In [187]:
# Show the extracted JSON text of the last processed reply, minus its first character
print(output[1:])


    {
        "word": "virulent",
        "sensitivity_rating": 0.7,
        "definition": "Extremely severe or harmful in its effects, or bitterly hostile.",
        "translation_options": [
            {
                "options": "giftig",
                "nuance": "Literally means 'poisonous', capturing the harmful aspect but not the hostility."
            },
            {
                "options": "bösartig",
                "nuance": "Conveys a sense of malignancy or malevolence, suitable for describing hostile attitudes."
            },
            {
                "options": "heftig",
                "nuance": "Translates to 'intense' or 'fierce', emphasizing the severity without the connotation of hostility."
            },
            {
                "options": "aggressiv",
                "nuance": "Directly captures the hostility aspect, applicable in contexts of virulent rhetoric."
            }
        ]
    },
    {
        "word": "discriminatory",
        "sensit

In [182]:
# Total number of tokens accumulated by the request loop
tok

3407

In [183]:
# Print the raw message text of the most recent completion
print(completion.choices[0].message.content)

{
  "word": "repression",
  "sensitivity_rating": 0.8,
  "definition": "Repression refers to the act of suppressing or holding back someone or something, often through force or by using authority. In political contexts, it can denote the suppression of dissent, freedom of speech, or other civil liberties.",
  "translation_options": [
    {
      "options": "Unterdrückung",
      "nuance": "General term for suppression, widely used in both political and psychological contexts."
    },
    {
      "options": "Repression",
      "nuance": "Borrowed directly from English, used specifically in political contexts to denote government suppression."
    },
    {
      "options": "Niederschlagung",
      "nuance": "Implies a forceful or violent suppression, often used in the context of protests or uprisings."
    },
    {
      "options": "Zurückdrängung",
      "nuance": "More neutral, can be used in various contexts but lacks the specific political charge."
    }
  ]
}


In [193]:
import json

# Merge the records in `output` into the JSON array stored in `file_name`.
# Parsing both sides and rewriting the file keeps it valid JSON; trimming the
# last two characters and writing `output` verbatim left an unterminated,
# nested array behind.
try:
    with open(file_name, 'r') as file:
        file_content = file.read()
except FileNotFoundError:
    # If the file does not exist yet there is nothing to merge with
    file_content = ""

stored_records = json.loads(file_content) if file_content.strip() else []

new_records = json.loads(output)
# A single JSON object is wrapped so extend() adds one record, not its keys
if isinstance(new_records, dict):
    new_records = [new_records]
stored_records.extend(new_records)

with open(file_name, 'w') as file:
    json.dump(stored_records, file, indent=4)


In [194]:
    import json

    # Read what is already stored; a missing file counts as empty
    try:
        with open(file_name, 'r') as file:
            file_content = file.read()
    except FileNotFoundError:
        file_content = ""

    # True when this run extends results that were already on disk
    append = bool(file_content.strip())

    stored_records = json.loads(file_content) if append else []

    new_records = json.loads(output)
    # A single JSON object is wrapped so extend() adds one record, not its keys
    if isinstance(new_records, dict):
        new_records = [new_records]
    stored_records.extend(new_records)

    # Write the merged array exactly once; the earlier version wrote the
    # content twice whenever the append flag was set
    with open(file_name, 'w') as file:
        json.dump(stored_records, file, indent=4)

In [198]:
import json

# The existing JSON file is the one named by `file_name`.

# Path to the new JSON data file you want to append.
# NOTE(review): this path is not read anywhere in this cell; the new data comes from `output`.
new_data_file_path = 'new_data.json'

# Read the existing JSON file; a missing or empty file counts as an empty list
# so the first run does not crash
try:
    with open(file_name, 'r') as file:
        existing_text = file.read()
except FileNotFoundError:
    existing_text = ""
existing_data = json.loads(existing_text) if existing_text.strip() else []

new_data = json.loads(output)
# A single JSON object is wrapped so extend() adds one record, not its keys
if isinstance(new_data, dict):
    new_data = [new_data]

# Combine the data
existing_data.extend(new_data)

# Write the combined list back to the original file; indent is only for readability
with open(file_name, 'w') as file:
    json.dump(existing_data, file, indent=4)
