In [3]:
import pandas as pd
import glob
from functools import reduce

# NOTE(review): not referenced by any cell visible here — presumably a switch
# for re-running expensive steps; confirm its use or remove it.
reprocess = False

def read_year_df_topic(years=(2021, 2022, 2023), data_dir="data/topics"):
    """Load the per-year topic CSVs into one de-duplicated DataFrame.

    Each year is read from ``{data_dir}/{year}.csv``. The frames are stacked
    in the order given, a ``google_scholar_link`` column is derived from the
    ``title`` column, and rows repeating an earlier title are dropped (the
    first occurrence wins).

    Args:
        years: Iterable of years to load; one CSV per year must exist.
        data_dir: Directory holding the ``<year>.csv`` files.

    Returns:
        A DataFrame with the columns of the input files plus
        ``google_scholar_link``. Rows with a missing title get a missing link.

    Raises:
        FileNotFoundError: If a year's CSV is absent.
        ValueError: If ``years`` is empty (nothing to concatenate).
    """
    from urllib.parse import quote

    files = [f"{data_dir}/{year}.csv" for year in years]
    df_list = [pd.read_csv(f) for f in files]
    # DataFrame.append was removed in pandas 2.0; a single concat is the
    # supported equivalent and avoids re-copying the data for every file.
    df = pd.concat(df_list, ignore_index=True)

    def _scholar_link(title):
        # Spaces become '+', everything else unsafe is percent-encoded so
        # characters such as '&', '#' or ':' cannot break the query string.
        return 'https://scholar.google.com/scholar?q=' + quote(title.replace(' ', '+'), safe='+')

    # Add google scholar link by title (missing titles are left as missing)
    df['google_scholar_link'] = df['title'].map(_scholar_link, na_action='ignore')

    # Remove duplicates by title
    df = df.drop_duplicates(subset=['title'])
    return df

# Merge the 2021-2023 topic files and keep a combined copy on disk.
df_all = read_year_df_topic(years=[2021, 2022, 2023])
df_all.to_csv('data/topics/all_topics.csv', index=False)
(df_all.shape[0], df_all.columns)

(14323,
 Index(['title', 'year', 'source', 'authors', 'class', 'keywords', 'abstract',
        'pdf_link', 'topic', 'google_scholar_link'],
       dtype='object'))

# Multi-modal

In [4]:
def filter_by_topic(df, topic):
    """Return the rows of ``df`` whose ``topic`` column contains ``topic``.

    Matching ignores case, and rows with a missing topic never match.
    The pattern is handed to ``Series.str.contains`` unchanged, so it is
    interpreted as a regular expression.
    """
    topic_matches = df['topic'].str.contains(topic, case=False, na=False)
    return df.loc[topic_matches]

In [5]:
# Papers whose topic mentions "multimodal", saved as their own CSV.
df_multimodal = filter_by_topic(df=df_all, topic='multimodal')
df_multimodal.to_csv('data/topics/multimodal.csv', index=False)
df_multimodal.shape[0]

591

In [27]:
# download google scholar citations
import requests
from bs4 import BeautifulSoup
import re
import time
import numpy as np

def get_citations(title):
    """Scrape Google Scholar for ``title`` and return its "Cited by" count.

    The first link inside the results container whose ``href`` contains
    ``cites=`` is taken as the citation link, and the number in its text is
    returned.

    Args:
        title: Paper title. Spaces are turned into ``+``; a title whose words
            are already joined with ``+`` is passed through as-is.

    Returns:
        The citation count as an int, or 0 when the title is not a non-empty
        string, the results container is missing, no citation link is found,
        or the link text holds no number.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx response (e.g. when Scholar throttles the client), so a
            blocked request is not silently recorded as "0 citations".
    """
    from urllib.parse import quote

    # Missing titles come through pandas as NaN (a float); nothing to look up.
    if not isinstance(title, str) or not title.strip():
        return 0

    # Random 1-4 s pause before every request to space out the scraping.
    time.sleep(np.random.randint(1, 5))

    # Percent-encode everything except '+', so '&', '#', ':' etc. in a title
    # cannot truncate or corrupt the query string.
    query = quote(title.replace(' ', '+'), safe='+')
    url = 'https://scholar.google.com/scholar?hl=en&q=' + query

    # Without a timeout a stalled connection would block the whole run.
    r = requests.get(url, timeout=30)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, 'html.parser')
    div = soup.find('div', {'id': 'gs_res_ccl_mid'})
    if div is None:
        return 0
    a = div.find('a', {'href': re.compile('cites=')})
    if a is None:
        return 0

    # Pull the digits out of the link text (e.g. "Cited by 123"), tolerating
    # thousands separators or trailing whitespace.
    match = re.search(r'\d[\d,]*', a.text)
    if match is None:
        return 0
    return int(match.group().replace(',', ''))

In [30]:
# Spot-check on a single title; its words are already joined with '+'.
get_citations('Cross-Modal+Retrieval+Augmentation+for+Multi-Modal+Classification')

0

In [28]:
from tqdm import tqdm

# Scrape a citation count for every title, checkpointing batches to CSV.
title2cites = {}  # every result gathered in this run (read by later cells)
pending = {}      # results gathered since the last checkpoint file
counter = 1


def _write_batch(batch, batch_no):
    """Write one {title: citations} batch to its own numbered CSV and return the frame."""
    frame = pd.DataFrame.from_dict(batch, orient='index', columns=['citations'])
    frame.to_csv(f'data/topics/multimodal_title2cites_{batch_no}.csv', index=True)
    return frame


try:
    for title in tqdm(df_all['title']):
        n_cite = get_citations(title)
        title2cites[title] = n_cite
        pending[title] = n_cite
        # randint(1, 10) draws from 1..9, so a checkpoint (plus an extra
        # 1-4 s pause) happens on roughly one title in nine.
        if np.random.randint(1, 10) >= 9:
            time.sleep(np.random.randint(1, 5))
            counter += 1
            df_title2cites = _write_batch(pending, counter)
            pending = {}
finally:
    # Persist whatever was collected since the last checkpoint, including
    # when the loop is interrupted or a request fails; previously this tail
    # was never written to disk.
    if pending:
        counter += 1
        df_title2cites = _write_batch(pending, counter)
        pending = {}

  0%|          | 33/14323 [02:14<16:12:22,  4.08s/it]


KeyboardInterrupt: 

In [20]:
# Number of scraped titles with at least one citation
sum(n_cite > 0 for n_cite in title2cites.values())

61

In [9]:
# Lift the column-width cap so titles print without truncation
pd.set_option('display.max_colwidth', None)
df_multimodal['title'].head()

24                                                                          Pixel-Aligned Volumetric Avatars
25                                 UC2: Universal Cross-Lingual Cross-Modal Vision-and-Language Pre-Training
97     DRANet: Disentangling Representation and Adaptation Networks for Unsupervised Cross-Domain Adaptation
114                             Can Audio-Visual Integration Strengthen Robustness Under Multimodal Attacks?
213     Informative and Consistent Correspondence Mining for Cross-Domain Weakly Supervised Object Detection
Name: title, dtype: object

# Diffusion models

In [15]:
# Papers whose topic mentions "diffusion models", saved as their own CSV.
df_diffm = filter_by_topic(df=df_all, topic='diffusion models')
df_diffm.to_csv('data/topics/diffm.csv', index=False)
df_diffm.shape[0]

113

# LLM

In [16]:
# Papers whose topic mentions "llm", saved as their own CSV.
df_llm = filter_by_topic(df=df_all, topic='llm')
df_llm.to_csv('data/topics/llm.csv', index=False)
df_llm.shape[0]

979