In [1]:
import os
import json
import random
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

In [2]:
random.seed(42)  # For reproducibility

In [4]:
def filter_sample_save(df, path):
    
    # find users who interacted a few times
    user_counts = df['user_id'].value_counts()
    users_with_few_interactions = user_counts[user_counts > 2].index
    
    # sample 100 of these users
    sampled_users = random.sample(list(users_with_few_interactions), 100)
    
    # Filter the DataFrame to include only those users
    df_filtered = df[df['user_id'].isin(sampled_users)].copy()
    
    if 'query_id' in df_filtered.columns:
        df_filtered.sort_values(by=['user_id', 'query_id'], inplace=True)
    else:
        df_filtered.sort_values(by=['user_id'], inplace=True)
    df_filtered.to_csv(os.path.join('data', path), index=False)
    

## Open Scholar

In [None]:
# load Open Scholar data
_df = pd.read_csv(os.path.join('data', 'OS_queries_0225.csv'))
_df['text'] = [json.loads(r)['text'] for r in _df['response']]
_df.drop(columns=['response'], inplace=True)

In [None]:
filter_sample_save(_df, 'OS_queries_0225_filtered_sampled.csv')

## ScholarQA

In [4]:
with open(os.path.join('data', 'scholarqa_queries_and_answers_0227.jsonl'), 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f]

In [5]:
new_data = []
for item in data:
        
    answer = ''
    for section in item['sections']:
        answer += section['text']
        
    new_item = {
        'user_id': item['user_id'],
        'query_id': item['query_id'],
        'query': item['query'],
        'answer': answer,
        'feedback': item['feedback'],
    }
    new_data.append(new_item)

In [6]:
# load ScholarQA data
_df = pd.DataFrame(new_data)

In [9]:
with open(os.path.join('data', 'scholarqa_queries_and_answers_0227_flat.jsonl'), 'w', encoding='utf-8') as out_f:
    for entry in new_data:
        out_f.write(json.dumps(entry, ensure_ascii=False) + '\n')

In [7]:
filter_sample_save(_df, 'scholarqa_queries_and_answers_0227_filtered_sampled.csv')

## Attempt at finding queries with incorrect terminology

In [27]:
with open(os.path.join('output', 'scholarqa_llm_annotations2.jsonl'), 'r', encoding='utf-8') as f:
    for line in f:
        d = json.loads(line)
        if d['label'] == 'TRUE':
            print('query:', d['query'], 'label:', d['label'])

query: intersection between identity, ideology, and power label: TRUE
query: positive education for technostress mitgation  in school education
 label: TRUE
query: What is binder with codex label: TRUE
query: in reinforcement leanring, when using actor critic on discrete action space, the actor often learns and gets stuck at a very concetrated softmax distribution and can't recover due to vanishing gradients, how to address that? label: TRUE
query: which approach is better for implicit depth deep equilibrium models or universal transformer? label: TRUE
query: I'm studying the challenges with long-form writing for blind academics. My background is a PhD in human-computer interaction and I'm primarily a technical HCI researcher who builds systems. Provide me papers and theories about long-form writing and the challenges for blind workers, and systems that are designed to help  label: TRUE
query: I'm a researcher trying to understand the process and challenges of long-form writing, like a

## From CSV to JSONL

In [15]:
os_df = pd.read_csv(os.path.join('data', 'OS_queries_0225_annotated.csv')).dropna()
os_df['label'] = os_df['specificity'].replace({3.0: '1', 4.0: '1', 2.0: '0'}).astype(int)
os_df['query_id'] = 'os_query_' + os_df.index.astype(str)
print(len(os_df))
os_df.head()

64


Unnamed: 0,date,query,other_field,specificity,user_id,text,label,query_id
0,2025-01-10,Using machine learning to predict patients wit...,no,3.0,0165e8ab-ea86-4fc1-aed8-9025fbc25ca4,\n\nMachine learning has been widely used in p...,1,os_query_0
1,2025-01-10,Using machine learning to predict patients wit...,no,4.0,0165e8ab-ea86-4fc1-aed8-9025fbc25ca4,\nMachine learning algorithms have been increa...,1,os_query_1
2,2025-01-10,“Cervical intraepithelial neoplasia lesions v...,no,2.0,0165e8ab-ea86-4fc1-aed8-9025fbc25ca4,\nCervical intraepithelial neoplasia (CIN) is ...,0,os_query_2
3,2025-01-10,machine learning polycystic ovary disease,no,2.0,0165e8ab-ea86-4fc1-aed8-9025fbc25ca4,\nMachine learning algorithms have shown great...,0,os_query_3
4,2024-12-22,"Design a biological aphid control strategy, kn...",no,4.0,02e875f6-3b68-45a0-b8ca-561a71139f4d,Response: \n\nConsidering the information prov...,1,os_query_4


In [16]:
sqa_df = pd.read_csv(os.path.join('data', 'scholarqa_queries_and_answers_0227_annotated.csv')).drop(columns=['other domain']).dropna()
sqa_df['label'] = sqa_df['specificity'].replace({3.0: '1', 4.0: '1', 2.0: '0'}).astype(int)
sqa_df['query_id'] = 'sqa_query_' + sqa_df.index.astype(str)
print(len(sqa_df))
sqa_df.head()

22


Unnamed: 0,user_id,query_id,query,answer,feedback,specificity,label
0,02551141-be85-58c2-880f-e9cb1d0a1d51,sqa_query_0,how does chatgpt handle memory of past convers...,\nChatGPT's memory system presents an interest...,[],3.0,1
1,02551141-be85-58c2-880f-e9cb1d0a1d51,sqa_query_1,deepseek training specs,\nThe DeepSeek model family began with traditi...,[],2.0,0
2,02551141-be85-58c2-880f-e9cb1d0a1d51,sqa_query_2,best techniques on LLM context windows,\nThe context window of an LLM represents the ...,[],2.0,0
3,02931874-e395-5e28-9589-2306ba48f392,sqa_query_3,the three modes of calibration? marginal exce...,"\nIn machine learning and predictive modeling,...",[],4.0,1
4,02931874-e395-5e28-9589-2306ba48f392,sqa_query_4,experiment people get better with time in diff...,\nThe relationship between practice and perfor...,[],2.0,0


In [17]:
# Concatenate os_df and sqa_df
annotated_df = pd.concat([os_df[['query_id', 'query', 'label']], sqa_df[['query_id', 'query', 'label']]], ignore_index=True)
print(len(annotated_df))
annotated_df

86


Unnamed: 0,query_id,query,label
0,os_query_0,Using machine learning to predict patients wit...,1
1,os_query_1,Using machine learning to predict patients wit...,1
2,os_query_2,“Cervical intraepithelial neoplasia lesions v...,0
3,os_query_3,machine learning polycystic ovary disease,0
4,os_query_4,"Design a biological aphid control strategy, kn...",1
...,...,...,...
81,sqa_query_17,Discuss the role of SHIP1 (Src homology 2 doma...,1
82,sqa_query_18,"why ""self-derived protein scaffold,"" in the co...",1
83,sqa_query_19,"In the context of vaccine development, how do ...",1
84,sqa_query_20,Why are homomeric proteins considered suitable...,1


In [18]:
annotated_df.to_json(os.path.join('data', 'os_sqa_annotated.jsonl'), orient='records', lines=True, force_ascii=False)

In [6]:
# df.to_json(os.path.join('data', 'OS_queries_0225_annotated.jsonl'), orient='records', lines=True, force_ascii=False)

In [21]:
# calculate prevalence of each label
label_counts = annotated_df['label'].value_counts().sort_index()
print(label_counts)
label_percentages = label_counts / label_counts.sum() * 100
label_percentages

label
0    39
1    47
Name: count, dtype: int64


label
0    45.348837
1    54.651163
Name: count, dtype: float64

## Can LLM do this annotation

In [22]:
# Read the JSONL file into a DataFrame
jsonl_df = pd.read_json(os.path.join('output', 'openscholar_gpt4-1_specificity2.jsonl'), lines=True)
jsonl_df['label'] = jsonl_df['label'].astype(str).replace('Label: ', '', regex=False)
jsonl_df.dropna(inplace=True)
len(jsonl_df)

81

In [23]:
jsonl_df['label'].value_counts()

label
1    44
0    37
Name: count, dtype: int64

In [15]:
jsonl_df2 = pd.read_json(os.path.join('output', 'openscholar_ollama_specificity.jsonl'), lines=True).dropna()
jsonl_df2['label'] = jsonl_df2['label'].astype(str).replace('Label: ', '', regex=False).replace('Label:  ', '', regex=False)
jsonl_df2.dropna(inplace=True)
len(jsonl_df2)

59

In [16]:
jsonl_df2['label'].value_counts()

label
2           38
Label: 2    11
1            5
Label: 1     4
3            1
Name: count, dtype: int64

In [13]:
# # Merge with the other DataFrame on 'query_id'
merged_df = pd.merge(annotated_df[5:], jsonl_df, on='query_id', suffixes=('_human', '_llm'))
print(len(merged_df))

merged_df_clean = merged_df.dropna(subset=['label_human', 'label_llm'])
print(len(merged_df_clean))

merged_df_clean['label_human'] = merged_df_clean['label_human'].astype(int)
merged_df_clean['label_llm'] = merged_df_clean['label_llm'].astype(int)

# Calculate F1 score
f1 = f1_score(merged_df_clean['label_human'], merged_df_clean['label_llm'], average='macro')
print("F1 score:", f1)

115
115
F1 score: 0.73816029143898


In [14]:
merged_df_clean

Unnamed: 0,query_id,query_human,label_human,query_llm,label_llm
0,query_5,Mizus persicae and Macrosiphum euphorbiae in b...,0,Mizus persicae and Macrosiphum euphorbiae in b...,1
1,query_5,Mizus persicae and Macrosiphum euphorbiae in b...,0,human LLM experiment on political ideas,0
2,query_6,"Design a biological aphid control strategy, kn...",1,"Design a biological aphid control strategy, kn...",1
3,query_6,"Design a biological aphid control strategy, kn...",1,give me papers related to human LLM collaborat...,1
4,query_7,radiocommunication capex and opex,0,radiocommunication capex and opex,0
...,...,...,...,...,...
110,query_19,"In the context of vaccine development, how do ...",1,"In the context of vaccine development, how do ...",1
111,query_20,Why are homomeric proteins considered suitable...,1,Can you let me know the latest research trend ...,0
112,query_20,Why are homomeric proteins considered suitable...,1,Why are homomeric proteins considered suitable...,1
113,query_35,the social Impact of moroccan Land Reform Pol...,1,how to detect out of distribution samples,0


In [None]:
len

In [25]:
from sklearn.metrics import classification_report

print(classification_report(annotated_df[5:]['label'].astype(str), jsonl_df['label'].astype(str)))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84        37
           1       0.86      0.86      0.86        44

    accuracy                           0.85        81
   macro avg       0.85      0.85      0.85        81
weighted avg       0.85      0.85      0.85        81



In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

y_true = merged_df_clean['label_human']
y_pred = merged_df_clean['label_llm']

metrics = {
    'accuracy': accuracy_score(y_true, y_pred),
    'macro_precision': precision_score(y_true, y_pred, average='macro'),
    'macro_recall': recall_score(y_true, y_pred, average='macro'),
    'macro_f1': f1_score(y_true, y_pred, average='macro'),
}

display(pd.DataFrame([metrics]))
print("\nConfusion Matrix:\n", confusion_matrix(y_true, y_pred))
