In [1]:
from datetime import datetime, timedelta, timezone
import os
import io
import hydra
from omegaconf import DictConfig, OmegaConf
from hydra import initialize, compose

import dr_util.file_utils as fu
import bytom.author_profiles as ap

from IPython.display import Markdown, display

%load_ext autoreload
%autoreload 2

In [2]:
# Load the `paper_data.yaml` config through Hydra's compose API.
conf_path = "../configs/"
with initialize(config_path=conf_path, version_base=None):
    # NOTE(review): this creates the config directory if it is missing, but it runs
    # after `initialize` and resolves against the current working directory —
    # confirm it is still needed.
    os.makedirs(os.path.dirname(conf_path), exist_ok=True)
    cfg = compose(config_name="paper_data.yaml")

In [3]:
# CHANGE the data dir path so it points at the data folder on your machine.
cfg.data_dir = "/Users/danielapintoveizaga/Documents/github/by_tomorrow/data/"
# Resolve interpolations into a plain container so the printout shows final values.
cfg_resolved = OmegaConf.to_container(cfg, resolve=True)
print(f"Configuration: \n\n{OmegaConf.to_yaml(cfg_resolved)}")

Configuration: 

data_dir: /Users/danielapintoveizaga/Documents/github/by_tomorrow/data/
raw_pdf_dir: /Users/danielapintoveizaga/Documents/github/by_tomorrow/data/raw_pdfs/
parsed_pdf_dir: /Users/danielapintoveizaga/Documents/github/by_tomorrow/data/parsed_pdfs/
metadata_dir: /Users/danielapintoveizaga/Documents/github/by_tomorrow/data/parsed_pdfs/
author_data_dir: /Users/danielapintoveizaga/Documents/github/by_tomorrow/data/author_data/
author_summaries_dir: /Users/danielapintoveizaga/Documents/github/by_tomorrow/data/author_data/summaries/
author_info_file: /Users/danielapintoveizaga/Documents/github/by_tomorrow/data/author_data/manual_profiles.json
prof_pattern: (?P<professor_name>[\w_]+)
file_type_pattern: (?P<file_type>\w+)
version_pattern: v(?P<version>\d+)
author_summary_file_pattern: (?P<professor_name>[\w_]+)\.(?P<file_type>\w+)\.v(?P<version>\d+)



In [4]:
# CHANGE this: the authors to profile, keyed by full name.
# Each entry holds a free-text "bio" (left empty here).

names_info = {
    "Aviad Levis": {
        "bio": ""
    },
    "Doug Downey": {
        "bio": ""
    },
    "Heng Ji": {
        "bio": ""
    },
    "Jeff Clune": {
        "bio": ""
    }
}

# Persist the profiles with the project helper.
# presumably written to cfg.author_info_file, which the next cell loads — TODO confirm
ap.save_info_json(cfg, names_info)

In [5]:
# Load the author profiles from the file configured as `author_info_file`.
AUTHOR_INFO = fu.load_file(cfg.author_info_file)

In [6]:
# List the author names found in the loaded profiles.
print("Authors:")
for author_name in AUTHOR_INFO:
    print(f" - {author_name}")

Authors:
 - Aviad Levis
 - Doug Downey
 - Heng Ji
 - Jeff Clune


In [7]:
# NOTE(review): the variable is named `aviad` but holds the papers fetched for 'Heng Ji';
# it is reused under this name in the cells below.
aviad = ap.get_author_papers('Heng Ji')

In [8]:
# Inspect the raw paper records returned for the author.
aviad

[{'title': 'Scaling Laws for Predicting Downstream Performance in LLMs',
  'abstract': 'Precise estimation of downstream performance in large language models (LLMs) prior to training is essential for guiding their development process. Scaling laws analysis utilizes the statistics of a series of significantly smaller sampling language models (LMs) to predict the performance of the target LLM. For downstream performance prediction, the critical challenge lies in the emergent abilities in LLMs that occur beyond task-specific computational thresholds. In this work, we focus on the pre-training loss as a more computation-efficient metric for performance estimation. Our two-stage approach consists of first estimating a function that maps computational resources (e.g., FLOPs) to the pre-training Loss using a series of sampling models, followed by mapping the pre-training loss to downstream task Performance after the critical "emergent phase". In preliminary experiments, this FLP solution accu

In [170]:
# Render the second paper record (index 1) as formatted Markdown.
display(Markdown(ap.format_response_abstract_to_markdown(aviad[1])))

### **Title:** MentalArena: Self-play Training of Language Models for Diagnosis and
  Treatment of Mental Health Disorders

**Publish Date:** 2024-10-09

**First Author:** Cheng Li

**Last Author:** Heng Ji

**Middle Authors:** May Fung, Qingyun Wang, Chi Han, Manling Li, Jindong Wang

**Abstract:** Mental health disorders are one of the most serious diseases in the world. Most people with such a disease lack access to adequate care, which highlights the importance of training models for the diagnosis and treatment of mental health disorders. However, in the mental health domain, privacy concerns limit the accessibility of personalized treatment data, making it challenging to build powerful models. In this paper, we introduce MentalArena, a self-play framework to train language models by generating domain-specific personalized data, where we obtain a better model capable of making a personalized diagnosis and treatment (as a therapist) and providing information (as a patient). To accurately model human-like mental health patients, we devise Symptom Encoder, which simulates a real patient from both cognition and behavior perspectives. To address intent bias during patient-therapist interactions, we propose Symptom Decoder to compare diagnosed symptoms with encoded symptoms, and dynamically manage the dialogue between patient and therapist according to the identified deviations. We evaluated MentalArena against 6 benchmarks, including biomedicalQA and mental health tasks, compared to 6 advanced models. Our models, fine-tuned on both GPT-3.5 and Llama-3-8b, significantly outperform their counterparts, including GPT-4o. We hope that our work can inspire future research on personalized care. Code is available in https://github.com/Scarelette/MentalArena/tree/main

---------------




In [9]:
# Write the author page for "Heng Ji" as version '1'.
# presumably saved under cfg.author_summaries_dir — TODO confirm in bytom.author_profiles
ap.write_author_page(
    cfg, "Heng Ji", version='1',
    max_papers=100,
    max_years=5,
    first_last_only=True,
)

In [10]:
# Preview the author page inline, limited to 3 papers.
display(Markdown(
    ap.make_author_page(
        cfg, "Heng Ji", max_papers=3, max_years=5, first_last_only=True,
    )
))
# NOTE(review): leftover commented-out save; `author_page` is not defined in this
# notebook and the "he_he" file name does not match the author shown above.
#fu.dump_file(author_page, f"{cfg.author_summaries_dir}he_he.markdown.v1.txt")

# Research Summary for **Heng Ji**

## Heng Ji Bio

{'bio': ''}

## Recent Papers

### **Title:** Scaling Laws for Predicting Downstream Performance in LLMs

**Publish Date:** 2024-10-11

**First Author:** Yangyi Chen

**Last Author:** Heng Ji

**Middle Authors:** Binxuan Huang, Yifan Gao, Zhengyang Wang, Jingfeng Yang

**Abstract:** Precise estimation of downstream performance in large language models (LLMs) prior to training is essential for guiding their development process. Scaling laws analysis utilizes the statistics of a series of significantly smaller sampling language models (LMs) to predict the performance of the target LLM. For downstream performance prediction, the critical challenge lies in the emergent abilities in LLMs that occur beyond task-specific computational thresholds. In this work, we focus on the pre-training loss as a more computation-efficient metric for performance estimation. Our two-stage approach consists of first estimating a function that maps computational resources (e.g., FLOPs) to the pre-training Loss using a series of sampling models, followed by mapping the pre-training loss to downstream task Performance after the critical "emergent phase". In preliminary experiments, this FLP solution accurately predicts the performance of LLMs with 7B and 13B parameters using a series of sampling LMs up to 3B, achieving error margins of 5% and 10%, respectively, and significantly outperforming the FLOPs-to-Performance approach. This motivates FLP-M, a fundamental approach for performance prediction that addresses the practical need to integrate datasets from multiple sources during pre-training, specifically blending general corpora with code data to accurately represent the common necessity. FLP-M extends the power law analytical function to predict domain-specific pre-training loss based on FLOPs across data sources, and employs a two-layer neural network to model the non-linear relationship between multiple domain-specific loss and downstream performance. By utilizing a 3B LLM trained on a specific ratio and a series of smaller sampling LMs, FLP-M can effectively forecast the performance of 3B and 7B LLMs across various data mixtures for most benchmarks within 10% error margins.

---------------


### **Title:** MentalArena: Self-play Training of Language Models for Diagnosis and
  Treatment of Mental Health Disorders

**Publish Date:** 2024-10-09

**First Author:** Cheng Li

**Last Author:** Heng Ji

**Middle Authors:** May Fung, Qingyun Wang, Chi Han, Manling Li, Jindong Wang

**Abstract:** Mental health disorders are one of the most serious diseases in the world. Most people with such a disease lack access to adequate care, which highlights the importance of training models for the diagnosis and treatment of mental health disorders. However, in the mental health domain, privacy concerns limit the accessibility of personalized treatment data, making it challenging to build powerful models. In this paper, we introduce MentalArena, a self-play framework to train language models by generating domain-specific personalized data, where we obtain a better model capable of making a personalized diagnosis and treatment (as a therapist) and providing information (as a patient). To accurately model human-like mental health patients, we devise Symptom Encoder, which simulates a real patient from both cognition and behavior perspectives. To address intent bias during patient-therapist interactions, we propose Symptom Decoder to compare diagnosed symptoms with encoded symptoms, and dynamically manage the dialogue between patient and therapist according to the identified deviations. We evaluated MentalArena against 6 benchmarks, including biomedicalQA and mental health tasks, compared to 6 advanced models. Our models, fine-tuned on both GPT-3.5 and Llama-3-8b, significantly outperform their counterparts, including GPT-4o. We hope that our work can inspire future research on personalized care. Code is available in https://github.com/Scarelette/MentalArena/tree/main

---------------


### **Title:** Self-Correction is More than Refinement: A Learning Framework for Visual
  and Language Reasoning Tasks

**Publish Date:** 2024-10-05

**First Author:** Jiayi He

**Last Author:** Heng Ji

**Middle Authors:** Hehai Lin, Qingyun Wang, Yi Fung

**Abstract:** While Vision-Language Models (VLMs) have shown remarkable abilities in visual and language reasoning tasks, they invariably generate flawed responses. Self-correction that instructs models to refine their outputs presents a promising solution to this issue. Previous studies have mainly concentrated on Large Language Models (LLMs), while the self-correction abilities of VLMs, particularly concerning both visual and linguistic information, remain largely unexamined. This study investigates the self-correction capabilities of VLMs during both inference and fine-tuning stages. We introduce a Self-Correction Learning (SCL) approach that enables VLMs to learn from their self-generated self-correction data through Direct Preference Optimization (DPO) without relying on external feedback, facilitating self-improvement. Specifically, we collect preferred and disfavored samples based on the correctness of initial and refined responses, which are obtained by two-turn self-correction with VLMs during the inference stage. Experimental results demonstrate that although VLMs struggle to self-correct effectively during iterative inference without additional fine-tuning and external feedback, they can enhance their performance and avoid previous mistakes through preference fine-tuning when their self-generated self-correction data are categorized into preferred and disfavored samples. This study emphasizes that self-correction is not merely a refinement process; rather, it should enhance the reasoning abilities of models through additional training, enabling them to generate high-quality responses directly without further refinement.

---------------




## Utils

In [None]:
def get_author_metadata_path(author):
    """Return the path of the query-metadata JSON file for ``author``.

    The file name is the author's name with spaces replaced by underscores,
    followed by ``_query_metadata.json``, located in ``cfg.metadata_dir``.

    Args:
        author: Author name; must be a key of ``AUTHOR_INFO``.

    Returns:
        The metadata file path as a string.

    Raises:
        AssertionError: If ``author`` is not a known author.
    """
    # Rely on the config and profiles loaded at the top of the notebook: the
    # AUTHORS / METADATA_DIR globals used previously are never defined here, so
    # the function failed with NameError on a fresh kernel.
    assert author in AUTHOR_INFO, f"Unknown author: {author!r}"
    return f'{cfg.metadata_dir}{author.replace(" ", "_")}_query_metadata.json'

def get_author_metadata(author):
    """Load and return the query metadata stored for ``author``."""
    return fu.load_file(get_author_metadata_path(author))

In [None]:
def get_parsed_pdf_path(pdf_name):
    """Return the path of the pickled parse result for ``pdf_name``.

    Args:
        pdf_name: Name of the PDF, without extension.

    Returns:
        ``<cfg.parsed_pdf_dir><pdf_name>.pkl`` as a string.
    """
    # cfg.parsed_pdf_dir replaces the PARSED_PDF_DIR global, which is never
    # defined in this notebook (NameError on a fresh kernel).
    return f'{cfg.parsed_pdf_dir}{pdf_name}.pkl'

def get_parsed_pdf(pdf_name):
    """Load the parsed PDF for ``pdf_name``; return None when no parse file exists."""
    pkl_path = get_parsed_pdf_path(pdf_name)
    return fu.load_file(pkl_path) if os.path.exists(pkl_path) else None

In [None]:
def get_author_parsed_papers(author):
    """Collect the author's papers for which a parsed PDF is available.

    Returns:
        A list of dicts. Each one is a copy of the paper's entry under
        'pdfs_metadata' with the loaded parse added under 'parsed_pdf'.
        Papers without a parse file are skipped.
    """
    papers = []
    for name, meta in get_author_metadata(author)['pdfs_metadata'].items():
        parsed = get_parsed_pdf(name)
        if parsed is not None:
            papers.append({**meta, 'parsed_pdf': parsed})
    return papers

## Load Parsed, Extract Structure

In [None]:
# Use the first author of the loaded profiles as the running example.
# AUTHORS is never defined in this notebook; AUTHOR_INFO is keyed by author name.
first_author = next(iter(AUTHOR_INFO))
parsed_pdfs_pavel = get_author_parsed_papers(first_author)
print(f">> Number of parsed papers for {first_author}: {len(parsed_pdfs_pavel)}")

In [None]:
# Take the first parsed paper as the example for the cells below.
test_ppdf = parsed_pdfs_pavel[0]

In [None]:
# Show the main metadata fields of the example paper, then its block count.
for field in ('title', 'published', 'authors', 'pdf_link'):
    print(test_ppdf[field])
print(f">> Num blocks in parsed pdf: {len(test_ppdf['parsed_pdf'])}")

### Utils

In [None]:
def reconstruct_split_text(split_text, verbose=False):
    """Join sections back into one text.

    Args:
        split_text: Iterable of section dicts with a 'lines' list (and a
            'heading', read only when ``verbose`` is set).
        verbose: When True, emit a heading banner before each section.

    Returns:
        The sections' lines joined by blank lines, each section followed by a
        blank line.
    """
    chunks = []
    for sect in split_text:
        if verbose:
            chunks.append(f"\n\n ===== Heading: {sect['heading']} \n\n")
        chunks.append("\n\n".join(sect['lines']) + "\n\n")
    return "".join(chunks)

In [None]:
def split_by_heading(text, title):
    """Split one block of markdown-style text into sections keyed by heading.

    A line starting with ``#`` opens a new section. Its heading is the line
    without the leading ``#`` markers, and the heading line itself is kept as
    the section's first line. Blank lines and lines whose first character is a
    digit are dropped. Heading lines that contain ``# {title}`` are dropped.
    Content that precedes the first heading is collected in a section headed
    "From Previous Block".

    Args:
        text: Text of one parsed block, lines separated by ``\\n``.
        title: Paper title, used to recognise and drop title headings.

    Returns:
        A non-empty list of ``{"heading": str, "lines": list[str]}`` dicts in
        document order; all lines are stripped.
    """
    def heading_of(line):
        # Works for any heading depth ("#", "##", ...), with or without a space
        # after the markers; slicing with [2:] only handled "# Heading".
        return line.lstrip("#").strip()

    tls = text.split("\n")
    title_str = f"# {title}"

    first = tls[0].strip()
    # startswith() is safe when the first line is blank, whereas indexing
    # first[0] raised IndexError.
    if title_str in first or not first.startswith("#"):
        # NOTE(review): a non-heading first line is discarded here (unchanged
        # behaviour) — confirm that dropping it is intended.
        curr_section = {"heading": "From Previous Block", "lines": []}
    else:
        curr_section = {"heading": heading_of(first), "lines": [first]}

    sections = []
    for tl in tls[1:]:
        line = tl.strip()
        if not line or line[0].isdigit():
            continue

        if line.startswith("#"):
            # Drop all header mentions of the title, we'll add it back in
            if title_str in line:
                continue
            # Otherwise start a new section
            sections.append(curr_section)
            curr_section = {"heading": heading_of(line), "lines": []}
        curr_section["lines"].append(line)

    sections.append(curr_section)
    return sections

In [None]:
def get_all_sects(input_ppdf, input_title):
    """Split every block of a parsed PDF into sections and concatenate them.

    Each block's ``text`` is split with ``split_by_heading``. The first section
    of the first block is discarded, since it is the title section.
    """
    collected = []
    for idx, blk in enumerate(input_ppdf):
        block_sects = split_by_heading(blk.text, input_title)
        # Drop the title section, which leads the first block
        collected += block_sects[1:] if idx == 0 else block_sects
    return collected

In [None]:
def group_sections(sections):
    """Merge consecutive sections into groups and pull figure sections aside.

    Sections without lines are ignored. A section whose heading starts with
    "Figure" is collected separately and never opens a group. Any other
    section is appended to the current group when its heading is
    "From Previous Block" or equals the current group's heading; otherwise it
    opens a new group. A section that directly follows a figure and whose
    first line starts with a lowercase letter is treated as a continuation: it
    is appended to the current group with its heading prefixed to its first
    line.

    Args:
        sections: Iterable of ``{"heading": str, "lines": list[str]}`` dicts.

    Returns:
        ``(grouped_sections, figs)``: the list of merged groups (same dict
        shape) and the list of figure sections, both in input order.

    Raises:
        AssertionError: If a section following a figure starts with an empty
            line.
    """
    grouped_sections = []
    figs = []
    last_was_fig = False
    for section in sections:
        lines = section['lines']
        if len(lines) == 0:
            continue

        heading = section['heading']

        # Handle figures before seeding the first group: a leading figure used
        # to leave behind an empty group carrying the figure's heading.
        if heading.startswith("Figure"):
            figs.append(section)
            last_was_fig = True
            continue

        # Seed the first group from the first non-figure section, so that
        # grouped_sections[-1] is always valid below.
        if len(grouped_sections) == 0:
            grouped_sections.append({'heading': heading, 'lines': []})

        if last_was_fig:
            last_was_fig = False
            if len(lines[0]) == 0:
                raise AssertionError(
                    f"Section after a figure starts with an empty line: {section}"
                )
            # A lowercase start means the text continues the sentence that the
            # figure interrupted, so fold it into the current group.
            # NOTE(review): the heading is prefixed even when it is the
            # "From Previous Block" placeholder — confirm that is intended.
            if lines[0][0].islower():
                grouped_sections[-1]['lines'].append(f"{heading} {lines[0]}")
                grouped_sections[-1]['lines'].extend(lines[1:])
                continue

        if (heading != "From Previous Block" and
            grouped_sections[-1]['heading'] != heading
        ):
            grouped_sections.append({'heading': heading, 'lines': []})
        grouped_sections[-1]['lines'].extend(lines)
    return grouped_sections, figs

In [None]:
def ppdf_to_body_refs_figs(input_ppdf):
    """Split a parsed paper into body sections, figure sections and references.

    Args:
        input_ppdf: Dict with the parsed blocks under 'parsed_pdf' and the
            paper title under 'title'.

    Returns:
        ``(body, figures, references)``. ``body`` holds the grouped sections
        that precede the first group whose heading contains 'References';
        ``references`` is that group, or None when there is none. Groups after
        the references are not returned.
    """
    sections = get_all_sects(input_ppdf['parsed_pdf'], input_ppdf['title'])
    print(f">> There are {len(sections)} sections total.")

    grouped, figures = group_sections(sections)
    print(f">> There are {len(grouped)} grouped sections and {len(figures)} figures.")

    # Position of the first group whose heading mentions 'References', if any
    refs_idx = next(
        (i for i, grp in enumerate(grouped) if 'References' in grp['heading']),
        None,
    )
    if refs_idx is None:
        return grouped[:], figures, None
    return grouped[:refs_idx], figures, grouped[refs_idx]

## Test Full Flow

In [None]:
# Run the whole pipeline on the example paper: body sections, figure sections, references.
bd_s, fg_s, rfs = ppdf_to_body_refs_figs(test_ppdf)

In [None]:
# print(reconstruct_split_text(bd_s + fg_s))
# rfs

In [None]:
# Number of lines in each body section, next to its heading.
for gt in bd_s:
    print(f"{len(gt['lines']):2} | {gt['heading']}")

### Sub Section Tests

In [None]:
# First step on its own: split every block of the example paper into sections.
all_sections_test = get_all_sects(test_ppdf['parsed_pdf'], test_ppdf['title'])
print(f">> There are {len(all_sections_test)} sections total")

In [None]:
# Second step on its own: merge the sections into groups and separate the figures.
grouped_test, figs_test = group_sections(all_sections_test)
print(f">> There are {len(grouped_test)} grouped sections")

In [None]:
# Number of lines in each grouped section, next to its heading.
for gt in grouped_test:
    print(f"{len(gt['lines']):3} | {gt['heading']}")

## Putting it All Together

In [None]:
def make_author_page(author):
    """Build a plain-text research summary page for ``author``.

    The page holds a title line, the author's bio and, for every paper with a
    parsed PDF, its title, publication date, author list, body sections and
    figure sections. References are left out.

    Args:
        author: Author name; must be a key of ``AUTHOR_INFO``.

    Returns:
        The whole page as one string.
    """
    info = AUTHOR_INFO[author]
    # Profiles are stored as {"bio": "..."}: write the bio text, not the dict repr.
    bio = info.get("bio", "") if isinstance(info, dict) else info

    buff = io.StringIO()
    buff.write(f"# Research Summary for {author}\n\n")
    buff.write(f"## Bio\n{bio}\n\n")

    buff.write("## Recent Papers\n\n")
    for ppdf in get_author_parsed_papers(author):
        buff.write(f"# Title: {ppdf['title']}\n Published: {ppdf['published']}\n")
        buff.write("Authors: " + ", ".join(ppdf['authors']) + "\n\n")

        body_sections, fig_sections, _ = ppdf_to_body_refs_figs(ppdf)
        buff.write(reconstruct_split_text(body_sections + fig_sections))
        # End with a blank line so the next paper's title starts on its own
        # line instead of being glued to this marker.
        buff.write(f"\n\n -------------- End Paper: {ppdf['title']}\n\n")
    return buff.getvalue()

In [None]:
# Build the page for the first author of the loaded profiles.
# AUTHORS is never defined in this notebook; AUTHOR_INFO is keyed by author name.
bio_and_one_paper = make_author_page(next(iter(AUTHOR_INFO)))

In [None]:
# NOTE(review): hard-coded absolute path in a different home directory than
# cfg.data_dir, with a fixed "pavel_izmailov" file name — confirm the target
# before running.
fu.dump_file(bio_and_one_paper, '/Users/daniellerothermel/drotherm/data/pavel_izmailov_summary_markdown.txt')

In [None]:
# Write one summary file per known author into the configured data directory.
# AUTHORS is never defined in this notebook, and the previous target was an
# absolute path in a different home directory than cfg.data_dir.
for author in AUTHOR_INFO:
    bio_and_one_paper = make_author_page(author)
    summary_name = f'{author.replace(" ", "_").lower()}_summary_markdown.txt'
    fu.dump_file(bio_and_one_paper, f'{cfg.data_dir}{summary_name}', verbose=True)
    