In [32]:
import pandas as pd
import glob
from functools import reduce

# NOTE(review): `reprocess` is not read by any cell visible here — presumably a
# switch for regenerating the per-year CSVs; confirm it is still needed.
reprocess = False

def read_year_df_topic(years=(2021, 2022, 2023)):
    """Load the per-year topic CSVs and merge them into one de-duplicated frame.

    Reads ``data/topics/{year}.csv`` for every entry of ``years`` (in the given
    order), stacks the rows, adds a ``google_scholar_link`` column derived from
    each title, and keeps only the first row for every distinct title.

    Parameters
    ----------
    years : iterable
        Years whose CSV files should be loaded. Each value is interpolated
        into the file name as-is.

    Returns
    -------
    pandas.DataFrame
        The combined rows with the extra ``google_scholar_link`` column.
        Rows with a missing title get a missing link.

    Raises
    ------
    ValueError
        If ``years`` is empty.
    """
    # Local import keeps the block self-contained; the notebook's import cell
    # does not bring urllib into scope.
    from urllib.parse import quote_plus

    years = list(years)
    if not years:
        raise ValueError("years must contain at least one year")

    frames = [pd.read_csv(f"data/topics/{year}.csv") for year in years]
    # DataFrame.append was removed in pandas 2.0; a single concat is the
    # supported replacement and avoids re-copying the data once per file.
    df = pd.concat(frames, ignore_index=True)

    # Add google scholar link by title. quote_plus turns spaces into '+' and
    # percent-encodes characters such as '&' or '#' that would otherwise
    # truncate or corrupt the query string. Missing titles are skipped.
    df['google_scholar_link'] = df['title'].map(
        lambda title: 'https://scholar.google.com/scholar?q=' + quote_plus(str(title)),
        na_action='ignore',
    )

    # Remove duplicates by title (the first occurrence wins).
    df = df.drop_duplicates(subset=['title'])
    return df

# Merge the 2021-2023 topic files, persist the combined table, and display
# its row count together with its column names.
df_all = read_year_df_topic(years=[2021, 2022, 2023])
df_all.to_csv(path_or_buf='data/topics/all_topics.csv', index=False)
df_all.shape[0], df_all.columns

(14323,
 Index(['title', 'year', 'source', 'authors', 'class', 'keywords', 'abstract',
        'pdf_link', 'topic', 'google_scholar_link'],
       dtype='object'))

# Multi-modal

In [44]:
def filter_by_topic(df, topic, regex=True):
    """Return a copy of the rows whose ``topic`` column contains ``topic``.

    Matching is case-insensitive; rows with a missing topic never match.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``topic`` column.
    topic : str
        Pattern to search for inside each topic value.
    regex : bool, default True
        When True, ``topic`` is interpreted as a regular expression (the
        original behaviour). Pass False to match it literally, which is
        required for terms containing regex metacharacters such as ``c++``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows, so later edits to the
        result do not touch ``df``.
    """
    matches = df['topic'].str.contains(topic, case=False, na=False, regex=regex)
    return df[matches].copy()

In [49]:
# Load only the 2022 papers and keep those whose topic contains 'multimodal';
# display the size of the subset next to the size of the full year.
df_2022 = pd.read_csv('data/topics/2022.csv')
df_2022_multimodal = filter_by_topic(df=df_2022, topic='multimodal')
df_2022_multimodal.shape[0], df_2022.shape[0]

(262, 6100)

In [51]:
# Convert to bibtex
# Characters that must not appear inside a BibTeX citation key; a stray comma
# or brace taken from a title's first word would corrupt the whole entry.
_BIBTEX_KEY_FORBIDDEN = str.maketrans('', '', ' \t\r\n,{}()"\'#%=~\\')


def _bibtex_text(value):
    """Return ``value`` as text, mapping missing values (None/NaN) to ''."""
    return '' if pd.isna(value) else str(value)


def _bibtex_key_part(value):
    """Return the lower-cased first space-delimited token of ``value`` with key-illegal characters removed."""
    return _bibtex_text(value).split(' ')[0].lower().translate(_BIBTEX_KEY_FORBIDDEN)


def _bibtex_entry(row):
    """Render one record (a mapping of column name to value) as an ``@inproceedings`` entry."""
    key = (
        f"{_bibtex_key_part(row['authors'])}"
        f"{_bibtex_text(row['year'])}"
        f"{_bibtex_key_part(row['title'])}"
    )
    return (
        f"@inproceedings{{{key},\n"
        f"  author = {{{_bibtex_text(row['authors'])}}},\n"
        f"  title = {{{_bibtex_text(row['title'])}}},\n"
        # `booktitle` is the standard field for the venue of an
        # @inproceedings entry; `conference` is not a standard field.
        f"  booktitle = {{{_bibtex_text(row['source'])}}},\n"
        f"  year = {{{_bibtex_text(row['year'])}}},\n"
        f"  url = {{{_bibtex_text(row['pdf_link'])}}},\n"
        f"}}\n"
    )


def save_to_bibtex(df, save_path='data/topics/2022_multimodal.bib'):
    """Write every row of ``df`` to ``save_path`` as a BibTeX ``@inproceedings`` entry.

    The citation key is ``<first author token><year><first title word>``,
    lower-cased and stripped of characters that are illegal in BibTeX keys.
    Keys are not de-duplicated: two rows sharing author token, year and first
    title word produce the same key.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``authors``, ``year``, ``title``, ``source``
        and ``pdf_link``. Missing values are written as empty fields.
        NOTE(review): ``authors`` is written verbatim; BibTeX expects names
        separated by " and " — confirm the format stored in the CSVs.
    save_path : str
        Destination file; it is overwritten. An empty ``df`` yields an empty
        file.
    """
    # Iterate rows explicitly: DataFrame.apply(axis=1) on an empty frame
    # returns a DataFrame, and joining that would write the column names.
    entries = [_bibtex_entry(row) for row in df.to_dict('records')]

    # Explicit UTF-8 so non-ASCII author names and titles are written the
    # same way on every platform.
    with open(save_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(entries))

# Export the 2022 multimodal subset as a BibTeX file.
save_to_bibtex(
    df=df_2022_multimodal,
    save_path='data/topics/2022_multimodal.bib',
)


# Diffusion models

In [15]:
# Select the papers whose topic mentions diffusion models, save them, and
# display how many were found.
df_diffm = filter_by_topic(df=df_all, topic='diffusion models')
df_diffm.to_csv(path_or_buf='data/topics/diffm.csv', index=False)
df_diffm.shape[0]

113

# LLM

In [16]:
# Select the papers whose topic mentions LLMs, save them, and display how
# many were found.
df_llm = filter_by_topic(df=df_all, topic='llm')
df_llm.to_csv(path_or_buf='data/topics/llm.csv', index=False)
df_llm.shape[0]

979