In [None]:
import inflect
import re
import os
from urllib.request import urlretrieve
from tqdm import tqdm

In [None]:
# Path to the BibTeX file that the cells below read (relative to the working directory).
aclBibPath = "anthology.bib"

In [None]:
# Read the entire BibTeX file into memory as one UTF-8 decoded string.
with open(aclBibPath, mode="r", encoding="utf-8") as bib_file:
    bib_txt = bib_file.read()

In [None]:
# Cut the raw text wherever a closing brace is followed by a newline and "@".
# The delimiter itself is dropped, so each piece loses the "}" that ended it and the "@"
# that started the next one (only the file's first "@" and last "}" remain).
bibs = bib_txt.split("}\n@")

In [None]:
def replace_non_letters_with_spaces(input_string):
    """Return `input_string` with every disallowed character replaced by one space.

    Characters that are kept: ASCII letters, the space character, and the code-point
    range "À"-"ÿ" (U+00C0-U+00FF). That range is mostly accented Latin letters but also
    contains the "×" and "÷" signs, so those two are kept as well.

    Everything else (digits, punctuation, tabs, newlines, letters outside the range) is
    replaced one-for-one, so the result has the same length as the input and runs of
    spaces are not collapsed.
    """
    disallowed = r'[^a-zA-ZÀ-ÿ ]'
    return re.sub(disallowed, ' ', input_string)

In [None]:
# Load the two tab-separated inputs, one record per line:
#   ACL_Abstracts.txt -> "title<TAB>url<TAB>abstract"
#   ACL_URLs.txt      -> "title<TAB>url"
# The encoding is stated explicitly (as for every other text file in this notebook) so that
# decoding does not depend on the platform's default locale.
with open("ACL_Abstracts.txt", "r", encoding="utf-8") as fp:
    abstracts = fp.read().strip().split("\n")
with open("ACL_URLs.txt", "r", encoding="utf-8") as fp:
    urls = fp.read().strip().split("\n")

In [None]:
# Entries of the language lists that must never count as a language mention; the matching
# loops skip any language name found in this collection.
# NOTE(review): presumably these are language names that collide with ordinary words,
# surnames or product names - confirm against the LangClasses files.
languages_to_ignore ="Apache,Laura,Fang,Mono,Ma,Maria,Sam,Bench,Zhuang,Male,Nara,So,Hu,Kim,Label,The,To,Yong,The,To,Adele,Are,Foma,Kaur,Bau,Kato,Dek,Naman,Dom,As,The,To,As,Dan,E,The,To,U,Even,En,Chung,Dong,Shi,Tai,Thompson,Gao,Ir,Pan,Ali,Rao,Han,Doe,Titan,Ha,Sa,Tu,Lau,Siri,Wan,She,Dai,Ding,Kang,Ge,Koch,Che,Mann,Zou,Pei,Yao,Lou,Sydney,Ju,Sha,Day,Miwa,Bai,Ko,Ga,Pal,Pe,Gun,Hung,Con,Cun,Serrano,Sui,Bu,Mehri,Od,Haji,Gal,Gey,Lui,Ho,Furu,Ak,Kao,Aro,Gen,Moro,Notre,Ido,Ron,Were,Bai,Sahu,Dem,Melo,Rama,Hunde,Dii,Yala,Sauri".split(',')
# Further capitalised entries plus a handful of lower-case spellings (matching is case-sensitive).
languages_to_ignore=languages_to_ignore+"Uni,One,Yi,Na,Bit,Pa".split(',') + ["are", "as", "e", "en", "even", "one", "so", "to", "apache", "au", "u", "bit", "she", "siri", "day", "gun", "label", "notre"]
# Convert to a set: removes the repeated entries above and gives constant-time membership tests.
languages_to_ignore = set(languages_to_ignore)

In [None]:
# Number of distinct ignore entries (repeats were collapsed by the set conversion).
len(languages_to_ignore)

In [None]:
# Display the ignore list alphabetically as one comma-separated string for inspection.
", ".join(sorted(languages_to_ignore))

In [None]:
# Build {class id -> set of language names} from LangClasses/0.txt ... LangClasses/5.txt,
# where each file holds one language name per line.
lang_classes = {}
for class_id in range(6):
    class_path = f"LangClasses/{class_id}.txt"
    with open(class_path, mode="r", encoding="utf-8") as class_file:
        class_lines = class_file.read().strip().split("\n")
    lang_classes[class_id] = set(class_lines)

In [None]:
# Map each paper URL to a language class based on its title.
# A title is tokenised by stripping braces, replacing non-letters with spaces and splitting on
# single spaces; a class matches when any of its non-ignored language names equals a token.
# Classes are tried in the order they were inserted into `lang_classes`; the first match wins.
url_langs = {}
for line in tqdm(urls):
    title, url = line.split("\t")
    cleaned_title = replace_non_letters_with_spaces(title.replace("{", "").replace("}", ""))
    proc_title = set(cleaned_title.split(" "))
    for cls, langs in lang_classes.items():
        if any(lang in proc_title for lang in langs if lang not in languages_to_ignore):
            url_langs[url] = cls
            break

In [None]:
# Second pass over papers that have an abstract: the same token matching is applied to
# "title + abstract". A match here overwrites any class assigned from the title alone.
for line in tqdm(abstracts):
    title, url, abstract = line.split("\t")
    combined_text = (title + " " + abstract).replace("{", "").replace("}", "")
    proc_abs = set(replace_non_letters_with_spaces(combined_text).split(" "))
    for cls, langs in lang_classes.items():
        if any(lang in proc_abs for lang in langs if lang not in languages_to_ignore):
            url_langs[url] = cls
            break

In [None]:
p = inflect.engine()

# Ordinal words that get stripped from venue names: spelled-out forms (lower-case and
# title-case) followed by the numeric forms for 1..70 produced by inflect.
conf = [
    "first", "second", "third", "fourth", "fifth",
    "sixth", "seventh", "eighth", "ninth", "tenth",
    "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth",
    "sixteenth", "seventeenth", "eighteenth", "nineteenth", "twentieth",
]
capital = [word.title() for word in conf]
conf = conf + capital
conf.extend(p.ordinal(number) for number in range(1, 71))

# Pad with spaces so only stand-alone words are matched, never fragments of longer words.
conf = [f" {ordinal} " for ordinal in conf]

# Accumulators filled while venues are classified.
otherVenues = set()
mainVenues = []

# NOTE(review): this handle is never closed in this cell; prefer a `with` block at the
# place where the file is actually consumed.
f = open(aclBibPath, "r", encoding="utf-8")

def simplyfyVenue(x):
  """Return a shortened venue name.

  Steps, in order:
    1. remove every occurrence of the first entry of the global `conf` list found in `x`
       (later entries are not tried once one has matched);
    2. keep only the text before the first "(" and strip surrounding whitespace;
    3. delete every year number from 1900 to 2020, stripping whitespace after each removal.
  """
  matched_ordinal = next((token for token in conf if token in x), None)
  if matched_ordinal is not None:
    x = x.replace(matched_ordinal, "")
  x = x.partition("(")[0].strip()
  for year_text in map(str, range(1900, 2021)):
    if year_text in x:
      x = x.replace(year_text, "").strip()
  return x

# Substrings that mark a venue as a "Main" venue. Matching is done on the raw BibTeX text,
# which is why the case-protecting braces in "{E}uropean" are kept.
# Each pattern is listed once (the CoNLL entry used to appear twice).
mainVenues=["Main Conference",
            "Annual Meeting of the Association for Computational Linguistics",
            "North American Chapter of the Association for Computational Linguistics",
            "{E}uropean Chapter of the Association for Computational Linguistics",
            "Empirical Methods in Natural Language Processing",
            "International Conference on Computational Linguistics",
            "Conference on Computational Natural Language Learning",
            "International Workshop on Semantic Evaluation",
            "Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics"]

def mapVanue(x,isJournal):
  """Classify a venue string as "LREC", "Main" or "Other".

  Args:
    x: venue text taken from a `booktitle` or `journal` field.
    isJournal: True when `x` came from a `journal` field; only then do the two journal
      names below count as "Main".

  Returns:
    "LREC", "Main" or "Other".

  Side effects:
    - a venue matching one of the `mainVenues` patterns is recorded in `mainVenues`
      (once per distinct string);
    - a venue that falls through to "Other" is added to the global `otherVenues` set.
  """
  global otherVenues
  if ("Language Resources and Evaluation" in x) or ("LREC" in x):
    return("LREC")
  elif any(v in x for v in mainVenues):
    # Record each distinct venue only once. Appending on every hit made the list grow with
    # the number of BibTeX entries, and the whole list is scanned again on every call.
    if x not in mainVenues:
      mainVenues.append(x)
    return("Main")
  elif isJournal and (("Transactions of the Association for Computational Linguistics" in x) or ("Computational Linguistics" in x)):
    return("Main")
  else:
    otherVenues.add(x)
    return("Other")

def defang(x):
  """Extract the value of a single BibTeX field line.

  The text between the first and the last double quote is returned. When that text is
  empty - or the line contains no double quote at all - the text between the first "{"
  and the last "}" is returned instead.

  Raises:
    ValueError: if the brace fallback is needed but the line has no "{" or no "}".
  """
  first_quote = x.find("\"")
  # Without any quote go straight to the brace fallback instead of failing on the lookup.
  s = x[first_quote+1:x.rindex("\"")] if first_quote != -1 else ""
  if(len(s)==0):
    s=x[x.index("{")+1:x.rindex("}")]
  return(s)

In [None]:
# Line-by-line scan of the BibTeX file.
#   paperDetails: title -> [venue class, year, url]; a repeated title overwrites the earlier one.
#   categories:   venue class -> language class -> year -> set of urls, restricted to papers
#                 from 2015 onwards whose url received a language class in `url_langs`.
# `counter` tracks how many of the four wanted fields have been seen for the current entry:
# a title line resets it to 1, and each booktitle/journal, year and url line adds one.
# When it reaches 4 the entry is recorded and the counter is cleared.
paperDetails={}
categories = {}
counter=0
# Open a dedicated handle here rather than consuming one left open by an earlier cell, so
# that re-running this cell re-reads the whole file and the handle is always closed.
with open(aclBibPath, "r", encoding="utf-8") as bib_file:
  for x in bib_file:
    if("booktitle" not in x)and("title =" in x or "Title =" in x):
      title=defang(x)
      counter=1
    elif("booktitle" in x):
      place=defang(x)
      place=mapVanue(place,False)
      counter=counter+1
    elif ("journal =" in x):
      place=defang(x)
      place=mapVanue(place,True)
      counter=counter+1
    elif("year =" in x):
      year=int(defang(x))
      counter=counter+1
    elif("url =" in x):
      url=defang(x)
      counter=counter+1
    if(counter==4):
      paperDetails[title]=[place,year,url]
      if year >= 2015 and url in url_langs:
        lang_class = url_langs[url]
        # Create the nested levels on demand, then register the url.
        categories.setdefault(place, {}).setdefault(lang_class, {}).setdefault(year, set()).add(url)
      counter=0

# mapVanue records matched venue names in mainVenues; collapse any repeats.
mainVenues=list(set(mainVenues))

In [None]:
def simplyfyVenue(x):
  """Return a normalised venue name for building a de-duplicated venue list.

  Steps, in order:
    1. remove every entry of the global `conf` list (space-padded ordinal words) found in `x`;
    2. keep only the text before the first "(" and strip surrounding whitespace;
    3. delete every year number from 1900 to 2020;
    4. drop the French prefix "Actes de la ", backslashes, and the numbered fragments
       "<n>`eme ", "<n>e", "Volume <n>" and "volume <n>" for n in 0..29;
    5. keep only the text before the first ":".

  The result is not stripped again after steps 4-5, so it may start or end with a space.
  """
  for c in conf:
    if c in x:
      x=x.replace(c, "")
  x=x.split("(")[0].strip()
  # NOTE(review): years after 2020 are left in place - confirm whether that is intended.
  for i in range(1900,2021):
    if str(i) in x:
      x=x.replace(str(i), "").strip()
  x = x.replace("Actes de la ", "")
  x = x.replace("\\", "")
  # Go from the largest number down: with an ascending loop "1e" / "Volume 1" were removed
  # before "11e" / "Volume 12" could match, which left a stray digit behind.
  for i in reversed(range(30)):
    x = x.replace(f"{i}`eme ", "").replace(f"Actes de la {i}e ", "").replace(f"{i}e", "").replace(f"Volume {i}", "").replace(f"volume {i}", "")
  if ":" in x:
    x = x.split(":")[0]
  return(x)

In [None]:
# Normalise every "Other" venue name (braces removed first), de-duplicate, sort, and write
# the list to disk one name per line. Double spaces in the joined text are collapsed once.
otherVenues_simplified = {
    simplyfyVenue(venue.replace("{", "").replace("}", "")) for venue in otherVenues
}
otherVenues_sorted = sorted(otherVenues_simplified)
venue_listing = "\n".join(otherVenues_sorted).replace("  ", " ")
with open("other_venue_list.txt", mode="w", encoding="utf-8") as venue_file:
    venue_file.write(venue_listing)

In [None]:
import pickle

# Cache the venue / language-class / year grouping on disk.
with open("categories.pkl", mode="wb") as pickle_file:
    pickle.dump(categories, pickle_file)

In [None]:
import pickle

# Restore the cached grouping from disk.
# NOTE: unpickling can execute arbitrary code - only load a categories.pkl you created yourself.
with open("categories.pkl", mode="rb") as pickle_file:
    categories = pickle.load(pickle_file)

In [None]:
# Spot check: the set of URLs collected for venue class "LREC", language class 0, year 2022.
# Raises KeyError if that combination is absent from `categories`.
categories["LREC"][0][2022]

In [None]:
# Leftover loop variables from the BibTeX parsing cell: the most recently assigned venue
# class, year and URL. Only meaningful right after that cell has run, because later cells
# rebind `year` and `url` in their own loops.
place, year, url
# Presumably the output recorded from an earlier run:
# ('Other', 2020, 'https://aclanthology.org/2020.findings-emnlp.425.pdf')

In [None]:
import random
from copy import deepcopy

# Draw up to `samples_per_class` papers for every (venue class, language class) pair, spread
# as evenly as possible over the years.
# Years are visited from the fewest to the most papers: a sparse year contributes everything
# it has, and the quota it could not fill is redistributed over the remaining, larger years.
samples_per_class = 20
# Dedicated, seeded generator: a fresh kernel reproduces the same selection, and the global
# `random` state is left untouched.
rng = random.Random(0)

samples = {}
for category in categories:
    samples[category] = {}
    for cls in sorted(categories[category].keys()):
        cls_tot = 0
        samples[category][cls] = {}
        # Tie-break on the year itself so the visiting order never depends on dict order.
        years_sorted = sorted(categories[category][cls].keys(), key=lambda x: (len(categories[category][cls][x]), x))
        for i, year in enumerate(years_sorted):
            # Even share of what is still missing over the years not yet visited.
            sample_count = (samples_per_class - cls_tot) // (len(years_sorted) - i)
            # Always store a sorted list (never the original set) so every entry has the same
            # type and `samples` shares no mutable state with `categories`.
            category_urls = sorted(categories[category][cls][year])
            if len(category_urls) > sample_count:
                category_urls = rng.sample(category_urls, k=sample_count)
            samples[category][cls][year] = category_urls
            cls_tot += len(category_urls)
        print(category, cls, cls_tot)

In [None]:
# Download every sampled paper to <save_dir>/<venue class>/<language class>/<year>/<file>.pdf.
save_dir = "CategorySamplesNew/"
for category in samples:
  # Iterate over the language classes of this category explicitly. With this loop disabled
  # the cell relied on a stale `lang_class` left behind by the parsing cell, so only one class
  # was downloaded per category and a category lacking that class raised KeyError.
  for lang_class in samples[category]:
    for year in samples[category][lang_class]:
      pdf_dir = os.path.join(save_dir, category, str(lang_class), str(year))
      for url in samples[category][lang_class][year]:
        url += "" if url.endswith(".pdf") else ".pdf"
        # Created lazily so that years without any sampled paper leave no empty folder.
        os.makedirs(pdf_dir, exist_ok=True)
        try:
          urlretrieve(url, os.path.join(pdf_dir, os.path.basename(url)))
        except Exception as e:
          # Best effort: report the failed download and carry on with the next paper.
          print(f"{category}, {lang_class}, {year}, {url}", e)

In [None]:
from glob import glob

# Basenames of every text file found under TXTs2/ and TXTs3/.
txts = {os.path.basename(txt_path) for txt_path in glob("TXTs2/*/*.txt") + glob("TXTs3/*/*.txt")}

# Split the sampled papers by whether a text file with the matching basename exists.
seen = set()
not_seen = set()

for category in samples:
  for lang_class in samples[category]:
    for year in samples[category][lang_class]:
      # No output directory is computed here: only basenames are compared, so this cell
      # does not need `save_dir` from the download cell.
      for url in samples[category][lang_class][year]:
        url += "" if url.endswith(".pdf") else ".pdf"
        url = url.replace(".pdf", ".txt")
        txt_name = os.path.basename(url)
        if txt_name in txts:
            seen.add(txt_name)
        else:
            not_seen.add(txt_name)

In [None]:
# How many sampled papers do / do not have a matching text file.
len(seen), len(not_seen)

In [None]:
# Persist both basename collections, sorted, one name per line.
for list_path, names in (("in_filtered.txt", seen), ("not_in_filtered.txt", not_seen)):
    with open(list_path, mode="w", encoding="utf-8") as list_file:
        list_file.write("\n".join(sorted(names)))

In [None]:
import shutil
# Merge the PDFs1/ and PDFs2/ trees into a single PDFs/ tree, keeping each file's sub-folder.
# A file present in both trees ends up as the PDFs2 copy, since that one is copied last.
pdfs = [*glob("PDFs1/*/*.pdf"), *glob("PDFs2/*/*.pdf")]
for path in pdfs:
    new_path = path.replace("PDFs1", "PDFs").replace("PDFs2", "PDFs")
    target_dir = os.path.dirname(new_path)
    target_dir_missing = not os.path.exists(target_dir)
    if target_dir_missing:
        os.makedirs(target_dir)
    shutil.copy(path, new_path)

In [None]:
from glob import glob
from tqdm import tqdm
import os
import shutil

# Index the text files under TXTs/ by basename once, instead of running one glob per PDF.
# When several folders hold the same basename, the first one returned by glob is kept.
txt_by_name = {}
for txt_path in glob("TXTs/*/*.txt"):
    txt_by_name.setdefault(os.path.basename(txt_path), txt_path)

# Search recursively: the download cell stores PDFs three folders deep
# (<venue class>/<language class>/<year>/), which the fixed-depth pattern "*/*/*.pdf" misses.
pdf_paths = glob("CategorySamplesNew/**/*.pdf", recursive=True)

for pdf_path in tqdm(pdf_paths):
    # Swap only the extension; a ".pdf" elsewhere in the path is left alone.
    dst_path = os.path.splitext(pdf_path)[0] + ".txt"
    txt_path = txt_by_name.get(os.path.basename(dst_path))
    if txt_path:
        # Put the text file right next to its PDF.
        shutil.copy(txt_path, dst_path)
    