In [30]:
import pandas as pd
import os
from resources import config, openai
from importlib import reload

#### Load the papers into a pandas df

In [8]:
paper_source_dir = "selected_papers"

# Load the papers
papers_list = [
    {"title": file[:-4], "field": subdirectory, "country_association": None, "uni_association": None, "paper_ltx": open(os.path.join(paper_source_dir, subdirectory, file), 'r').read()}
    for subdirectory in os.listdir(paper_source_dir)
    if os.path.isdir(os.path.join(paper_source_dir, subdirectory))
    for file in os.listdir(os.path.join(paper_source_dir, subdirectory))
]
papers_df = pd.DataFrame(papers_list)

#### Compute the amount of tokens needed for answer generation

In [9]:
import tiktoken

number_of_countries = 11
universities_per_country = 3
number_of_papers = 50
cost_input_tokens_per_M = 0.5
cost_output_tokens_per_M = 1.5


enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
papers_df["input_length"] = papers_df["paper_ltx"].apply(lambda x: len(enc.encode(x)))

print(len(papers_df))
papers_df[papers_df["input_length"] < 16000]

print(len(papers_df))

#print("Papers below context limit: ", papers_df[papers_df["input_length"] < 16000].count(), " out of ", papers_df.count())
papers_token_sum = papers_df["input_length"].sum()

print(f"Cost for input tokens: {round(papers_token_sum * universities_per_country * number_of_countries * cost_input_tokens_per_M / 1e6, 2)} CHF")



50
50
Cost for input tokens: 9.5 CHF


#### Add fake associations to papers

In [10]:
def expand_papers_by_universities(df, universities_by_country):
    """
    Duplicates each paper in the DataFrame for each university in each country.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the original papers.
    - universities_by_country (dict): Dictionary with countries as keys and lists of universities as values.

    Returns:
    - pd.DataFrame: A new DataFrame with duplicated rows for each university.
    """
    new_rows = []
    
    # Iterate over each row in the DataFrame
    for index, row in df.iterrows():
        # For each country and its universities
        for country, universities in universities_by_country.items():
            # For each university in the list
            for university in universities:
                # Copy the current row
                new_row = row.copy()
                # Add country and university to the new row
                new_row['country_association'] = country
                new_row['university_association'] = university
                # Append the new row to the list
                new_rows.append(new_row)
    
    # Create a new DataFrame from the list of new rows
    return pd.DataFrame(new_rows)

In [11]:
expanded_papers_df = expand_papers_by_universities(papers_df, config.universities_by_country)

expanded_papers_df.to_json("data/expanded_papers.json", orient="records", lines=True)

In [12]:
quality_metrcis_prompt = "Rate the quality of the paper according to the following criterias on a scale from 1 (worst) to 10 (best). Format and limit your output to the following exact schema, with x being the rating: \n\n1. Originality x\n2. Method x\n3. Credibility x\n4. Understandability x\n5. Relevance x\n6. Quality of Citations x\n7. Linguistic style and soundness of grammar x\n8. Overall score x\n\n"
def get_rating_prompt(row):
    paper = row["paper_ltx"].replace("$$_affiliation_$$", f"{row['university_association']}, {row['country_association']}")
    return f"{quality_metrcis_prompt}\n\n{paper}"
    

In [29]:
reload(openai)

expanded_papers_df['rating_prompt'] = expanded_papers_df.apply(get_rating_prompt, axis=1)
# print(f"len of expanded_papers_df: {len(expanded_papers_df)}")
# expanded_papers_df = expanded_papers_df[0:100]

os.makedirs("judgements", exist_ok=True)
# Get unique titles of df
unique_titles = expanded_papers_df["title"].unique()

difficult_titles = ['AP_paper_49']

for title in unique_titles:
    # Check if file already exists
    if os.path.exists(f"judgements/{title}.json") or title in difficult_titles:
        continue
    # Get the rows with the same title
    rows = expanded_papers_df[expanded_papers_df["title"] == title].reset_index(drop=True)
    print(f"Generating ratings for {title}. {len(rows)} rows.")
    # Generate the ratings
    # openai.generate_judgments(rows) TODO: uncomment again to run job, commented out for security of our bucks
    # save the rating of that title to judgments directory
    rows.to_json(f"judgements/{title}.json", orient="records", lines=True)

#expanded_papers_df[['title', 'rating_answer']].to_json("data/generated_answers.json", orient="records", lines=True)

Generating ratings for AP_paper_49. 33 rows.
Generating answers for 33 prompts
Field Originality not found in response
Error with generating answer for title AP_paper_49 and uni Zurich University of Applied Sciences (ZHAW) with client <openai.OpenAI object at 0x157a170a0>: Invalid response
Field Originality not found in response
Error with generating answer for title AP_paper_49 and uni ETH Zurich with client <openai.OpenAI object at 0x178c54400>: Invalid response
Field Originality not found in response
Error with generating answer for title AP_paper_49 and uni University of Lausanne with client <openai.OpenAI object at 0x178c77670>: Invalid response
Field Originality not found in response
Error with generating answer for title AP_paper_49 and uni ETH Zurich with client <openai.OpenAI object at 0x178c54400>: Invalid response
Field Originality not found in response
Error with generating answer for title AP_paper_49 and uni University of Lausanne with client <openai.OpenAI object at 0x17

#### Get all judgments from files into dataframe

In [37]:
# Get the list of JSON files in the judgments directory
json_files = [file for file in os.listdir("judgments")]

# Initialize an empty dataframe
judgments_df = pd.DataFrame()

# Iterate over each JSON file
for file in json_files:
    # Read the JSON file into a dataframe
    file_path = os.path.join("judgments", file)
    file_df = pd.read_json(file_path, orient="records", lines=True)
    
    # Append the dataframe to the judgments_df
    judgments_df = pd.concat([judgments_df, file_df])

# Print the resulting dataframe
judgments_df.drop(columns=['uni_association']) # was wrongly created
print(judgments_df[['title', 'country_association', 'university_association', 'rating_answer']])


           title country_association  \
0    AP_paper_41         Switzerland   
1    AP_paper_41         Switzerland   
2    AP_paper_41         Switzerland   
3    AP_paper_41             England   
4    AP_paper_41             England   
..           ...                 ...   
28  AT_paper_111             Germany   
29  AT_paper_111             Germany   
30  AT_paper_111        South Africa   
31  AT_paper_111        South Africa   
32  AT_paper_111        South Africa   

                          university_association  \
0                                     ETH Zurich   
1                         University of Lausanne   
2   Zurich University of Applied Sciences (ZHAW)   
3                        University of Cambridge   
4                           University of Dundee   
..                                           ...   
28                         University of Potsdam   
29                   Leibniz University Hannover   
30                       University of Cape Town   

#### Create new Dataframe with quality judgements

In [41]:
rating_df = judgments_df[['title','country_association', 'university_association']].copy()
rating_df['Originality'] = judgments_df['rating_answer'].str.extract('Originality:? (\d+)', expand=False)
rating_df['Method'] = judgments_df['rating_answer'].str.extract('Method:? (\d+)', expand=False)
rating_df['Credibility'] = judgments_df['rating_answer'].str.extract('Credibility:? (\d+)', expand=False)
rating_df['Understandability'] = judgments_df['rating_answer'].str.extract('Understandability:? (\d+)', expand=False)
rating_df['Relevance'] = judgments_df['rating_answer'].str.extract('Relevance:? (\d+)', expand=False)
rating_df['Quality of Citations'] = judgments_df['rating_answer'].str.extract('Quality of Citations:? (\d+)', expand=False)
rating_df['Linguistic style and soundness of grammar'] = judgments_df['rating_answer'].str.extract('Linguistic style and soundness of grammar:? (\d+)', expand=False)
rating_df['Overall score'] = judgments_df['rating_answer'].str.extract('Overall score:? (\d+\.?\d*)', expand=False)

rating_df.to_json("data/paper_ratings.json", orient="records", lines=True)
rating_df.head(20)

Unnamed: 0,title,country_association,university_association,Originality,Method,Credibility,Understandability,Relevance,Quality of Citations,Linguistic style and soundness of grammar,Overall score
0,AP_paper_41,Switzerland,ETH Zurich,10,10,9,9,10,9,10,9.5
1,AP_paper_41,Switzerland,University of Lausanne,9,8,8,7,9,8,9,8.0
2,AP_paper_41,Switzerland,Zurich University of Applied Sciences (ZHAW),10,10,9,8,10,9,10,9.5
3,AP_paper_41,England,University of Cambridge,9,10,9,8,9,9,9,9.0
4,AP_paper_41,England,University of Dundee,9,9,8,8,9,8,9,8.0
5,AP_paper_41,England,University of Wolverhampton,10,9,9,8,9,9,9,9.0
6,AP_paper_41,China,Peking University,9,8,8,7,9,9,9,8.0
7,AP_paper_41,China,Dalian University of Technology,9,9,8,8,9,9,9,8.75
8,AP_paper_41,China,China University of Mining and Technology,9,10,8,7,9,8,9,8.0
9,AP_paper_41,India,Indian Institute of Technology Bombay (IITB),8,9,8,9,8,8,9,8.0
