In [19]:
import pandas as pd
import os
from resources import config, openai
from importlib import reload

#### Load the papers into a pandas df

In [14]:
paper_source_dir = "selected_papers"

# Load the papers
papers_list = [
    {"title": file[:-4], "field": subdirectory, "country_association": None, "uni_association": None, "paper_ltx": open(os.path.join(paper_source_dir, subdirectory, file), 'r').read()}
    for subdirectory in os.listdir(paper_source_dir)
    if os.path.isdir(os.path.join(paper_source_dir, subdirectory))
    for file in os.listdir(os.path.join(paper_source_dir, subdirectory))
]
papers_df = pd.DataFrame(papers_list)

print(papers_df.head())

         title       field country_association uni_association  \
0  AP_paper_15  Statistics                None            None   
1  AP_paper_13  Statistics                None            None   
2   AP_paper_8  Statistics                None            None   
3  AP_paper_41  Statistics                None            None   
4   AP_paper_6  Statistics                None            None   

                                           paper_ltx  
0  \begin{document}\n\affiliation{$$_affiliation_...  
1  \begin{document}\n\affiliation{$$_affiliation_...  
2  \begin{document}\n\affiliation{$$_affiliation_...  
3  \begin{document}\n\affiliation{$$_affiliation_...  
4  \begin{document}\n\affiliation{$$_affiliation_...  


#### Compute the amount of tokens needed for answer generation

In [15]:
import tiktoken

number_of_countries = 11
universities_per_country = 3
number_of_papers = 50
cost_input_tokens_per_M = 0.5
cost_output_tokens_per_M = 1.5


enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
papers_df["input_length"] = papers_df["paper_ltx"].apply(lambda x: len(enc.encode(x)))

#print("Papers below context limit: ", papers_df[papers_df["input_length"] < 16000].count(), " out of ", papers_df.count())
mean_paper_tokens = papers_df["input_length"].mean()

print("Cost for input tokens: ", mean_paper_tokens * number_of_papers * universities_per_country * number_of_countries * cost_input_tokens_per_M / 1e6)



Cost for input tokens:  8.261253


#### Ask ChatGPT to detect from which university, researcher or country a paper is from

In [21]:
reload(openai)
# Prepend to each paper the detection prompt
detection_prompt = "Tell me which researchers worked on the paper. From which university or country does it originate from. If you can't detect it, output exactly: 'I can't detect the origin of this paper.'"


papers_df["detection_prompt"] = detection_prompt + "\n" + papers_df["paper_ltx"]

papers_df.to_json("papers.json", orient="records")

papers_df["detected_source_answer"] = papers_df["detection_prompt"].apply(openai.generate_answer)






ChatCompletion(id='chatcmpl-9LqF9hg69ttXcZVsrooNUMm9kzsKk', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="I can't detect the origin of this paper.", role='assistant', function_call=None, tool_calls=None))], created=1714992739, model='gpt-3.5-turbo-0125', object='chat.completion', system_fingerprint='fp_a450710239', usage=CompletionUsage(completion_tokens=10, prompt_tokens=6965, total_tokens=6975))
ChatCompletion(id='chatcmpl-9LqFAALSsbwV3BM2yeefZ4hBBXaZp', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="I can't detect the origin of this paper.", role='assistant', function_call=None, tool_calls=None))], created=1714992740, model='gpt-3.5-turbo-0125', object='chat.completion', system_fingerprint='fp_3b956da36b', usage=CompletionUsage(completion_tokens=10, prompt_tokens=13605, total_tokens=13615))
ChatCompletion(id='chatcmpl-9LqFC63NWedU4lcWjEyeUs7VL2zuP', choices=[Choice(finish_rea

KeyboardInterrupt: 

In [23]:
filtered_papers_df = papers_df[papers_df["detected_source_answer"] == "I can't detect the origin of this paper."]
# Print len of filtered papers
print("Unfiltered papers: ", len(papers_df))
print("Filtered papers: ", len(filtered_papers_df))

Unfiltered papers:  50
Filtered papers:  38


#### Add fake associations to papers

In [27]:
def expand_papers_by_universities(df, universities_by_country):
    """
    Duplicates each paper in the DataFrame for each university in each country.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the original papers.
    - universities_by_country (dict): Dictionary with countries as keys and lists of universities as values.

    Returns:
    - pd.DataFrame: A new DataFrame with duplicated rows for each university.
    """
    new_rows = []
    
    # Iterate over each row in the DataFrame
    for index, row in df.iterrows():
        # For each country and its universities
        for country, universities in universities_by_country.items():
            # For each university in the list
            for university in universities:
                # Copy the current row
                new_row = row.copy()
                # Add country and university to the new row
                new_row['country_association'] = country
                new_row['university_association'] = university
                # Append the new row to the list
                new_rows.append(new_row)
    
    # Create a new DataFrame from the list of new rows
    return pd.DataFrame(new_rows)

# Example usage:
# Assuming 'df' is your original DataFrame and 'universities_by_country' is your dictionary
# new_df = expand_papers_by_universities(df, universities_by_country)
# print(new_df.head())


In [28]:
expanded_papers_df = expand_papers_by_universities(filtered_papers_df, config.universities_by_country)
print(expanded_papers_df.head())

expanded_papers_df[['title', 'field', 'country_association', 'university_association']].to_json("expanded_papers.json", orient="records")

         title       field country_association uni_association  \
0  AP_paper_15  Statistics         Switzerland            None   
0  AP_paper_15  Statistics         Switzerland            None   
0  AP_paper_15  Statistics         Switzerland            None   
0  AP_paper_15  Statistics             England            None   
0  AP_paper_15  Statistics             England            None   

                                           paper_ltx  input_length  \
0  \begin{document}\n\affiliation{$$_affiliation_...          6918   
0  \begin{document}\n\affiliation{$$_affiliation_...          6918   
0  \begin{document}\n\affiliation{$$_affiliation_...          6918   
0  \begin{document}\n\affiliation{$$_affiliation_...          6918   
0  \begin{document}\n\affiliation{$$_affiliation_...          6918   

                                    detection_prompt  \
0  Tell me which researchers worked on the paper....   
0  Tell me which researchers worked on the paper....   
0  Tell me whi

KeyError: ('title', 'field', 'country_association', 'university_association')