## Towards Climate Awareness in NLP Research

Literature survey notebook

### Part I: Fetch papers

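Only run this part if you need to fetch and convert the papers yourself; otherwise skip to Part II, which can download the already-converted text files.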

In [None]:
!git clone https://github.com/coastalcph/acl-citations
!cd acl-citations/ && pip install -r requirements.txt
!apt-get install xpdf

Pattern for before 2020:

```{C,D,E,N,P}{16,17,18,19}-```

Pattern for 2020 onwards:

```202?.{acl,emnlp,coling,naacl,aacl,eacl,findings,sustainlp,nlp4posimpact}-```

In [None]:
# Run the `fetch` subcommand of acl_anthology.py from acl-citations/bin with the
# 2020-onwards ID pattern; substitute the pre-2020 pattern to fetch older papers.
# NOTE(review): the {a,b,...} list relies on brace expansion by the shell behind `!` — confirm it is bash-compatible.
!cd acl-citations/bin && python acl_anthology.py fetch 202?.{acl,emnlp,coling,naacl,aacl,eacl,findings,sustainlp,nlp4posimpact}-

Convert pdf files to txt files

In [None]:
# For every *.pdf below acl-citations/bin/pdf/, run pdftotext with -raw and write
# the result next to the PDF as <name>.pdf.txt.
!find acl-citations/bin/pdf/ -name '*.pdf' -exec pdftotext -raw "{}" "{}.txt" \;

Zip the text files (they are unzipped again in Part II)

In [None]:
# Add every .txt file one directory level below acl-citations/bin/pdf/ to papers.zip.
!zip papers.zip acl-citations/bin/pdf/*/*.txt

### Part II: Analysis

In [None]:
!pip install --upgrade --no-cache-dir gdown

Import relevant packages

In [None]:
import glob
import os
import re
from collections import defaultdict, Counter

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Download papers from Google Drive and unzip them

In [None]:
# Placeholder: put the Google Drive file ID after `gdown`. As written, everything
# after `#` is a shell comment, so gdown is invoked without an ID.
!gdown #Insert Google Drive ID here (optional)
# Extract papers.zip; the read step below expects the texts under acl-citations/bin/pdf/*/.
!unzip papers.zip

Read text files

In [None]:
PAPER_GLOB = "acl-citations/bin/pdf/*/*.txt"
# Suffix carried by the converted files (<paper id>.pdf.txt).
TEXT_SUFFIX = ".pdf.txt"


def read_papers(filenames):
  """Read converted paper texts into a dict mapping paper ID to cleaned text.

  Args:
    filenames: iterable of paths to text files named <paper id>.pdf.txt.

  Returns:
    dict of paper ID -> text, where the lines of each file are joined with
    single spaces, every "- " sequence is removed (re-joining words that were
    hyphenated across line breaks) and surrounding spaces are stripped.
  """
  papers = {}

  for filename in filenames:
    # Derive the ID from the file name itself instead of fixed character
    # offsets, so it does not depend on the directory prefix length.
    paper_id = os.path.basename(filename)
    if paper_id.endswith(TEXT_SUFFIX):
      paper_id = paper_id[:-len(TEXT_SUFFIX)]

    # Decode explicitly rather than with the platform default; undecodable
    # bytes are replaced so a single bad file does not abort the whole run.
    # NOTE(review): assumes the converted files are UTF-8 — confirm.
    with open(filename, encoding="utf-8", errors="replace") as file:
      text = " ".join(line.rstrip("\n") for line in file)

    # Merge words
    papers[paper_id] = re.sub("- ", "", text).strip(" ")

  return papers


# Glob once so the reported count and the files actually read cannot diverge.
paper_files = sorted(glob.glob(PAPER_GLOB))
number_of_papers = len(paper_files)
print("Number of papers: " + str(number_of_papers))

paper_dict = read_papers(paper_files)

Define regex patterns

In [None]:
# Regular expressions for the survey, keyed by the aspect a paper reports.
# get_counts applies them with re.IGNORECASE, so all alternatives are lowercase.
pattern_dict = {
    # Model(s)/weight(s) that are or will be public/available/uploaded/provided, or
    # "publish/upload/make/provide ... model(s)/weight(s)" with up to 20 characters in between.
    "public": r"(((model|weight) (will be|is)?|(models|weights) (will be|are)?) (public|available|upload|made available|made public|provided (at|under|on)))|((publish|upload) [a-zA-Z0-9, ]{0,20}(model(s)?|weight(s)?))|(make [a-zA-Z0-9, ]{0,20}(model(s)?|weight(s)?) (available|public))|(provide [a-zA-Z0-9, ]{0,20}(model(s)?|weight(s)?) (at|under|on))",
    # A training/optimization/fine-tuning word, then for/took/take(s), then a time unit
    # (each gap at most 20 characters), or the literal phrase "hours of computation".
    "duration": r"(((pre(-)?)?train(ing|ed)?|optimize|optimization|(fine(-)?)?tun(e|ed|ing)) ([a-zA-Z0-9, ]{0,20})(for|took|take(s)?) ([a-zA-Z0-9, ]{0,20})(seconds|minute|hour|day|week|month)+)|hours of computation",
    # "energy/power/electricity consumption/usage", or is/of/at followed by a 3-6 digit
    # number and watt(s)/w/kw plus a space, or the space-delimited token "pue".
    "energy": r"(energy|power|electricity) (consumption|usage)|(is|of|at) [1-9]{1}[0-9]{2,5} (watt(s)?|(k)?w) | pue ",
    # A data center/cloud/machine/cluster/hpc term followed by "at"/"in", or
    # cloud/azure/google/aws followed within 20 characters by "region".
    "location": r"((data ?center|(a|the) cloud|(virtual|gpu) machine|computer cluster|hpc) (is )?(at|in) )|(cloud|azure|google|aws)([a-zA-Z0-9, ]{0,20})region",
    # co2/co2e/co2eq, ghg or carbon followed by footprint/emission(s)/emitted/offset(ting);
    # the trailing space is part of the pattern.
    "emission": r"(co2(e|eq)?|ghg|carbon) (footprint|emission(s)?|emitted|offset(ting)?) "
}

In [None]:
def get_counts(paper_dict, pattern, print_matches=False):
  """Count, per publication year, deep-learning papers and those matching pattern.

  A paper counts as deep-learning-related when its text contains "model" and at
  least one of the deep-learning keywords below (both case-insensitive). Papers
  that do not qualify are ignored entirely.

  Args:
    paper_dict: mapping of paper ID to paper text. IDs starting with a letter
      carry a two-digit year at positions 1-2; other IDs start with a
      four-digit year.
    pattern: regular expression searched case-insensitively in each text.
    print_matches: if True, print "<paper ID>: <first match>" for every hit.

  Returns:
    defaultdict mapping year (str) to a Counter with key "general" (number of
    deep-learning papers) and key "pattern" (how many of those match pattern).
  """
  tallies = defaultdict(Counter)

  for paper_id, body in paper_dict.items():
    # Guard clauses: skip anything that is not deep-learning-related
    if re.search("model", body, re.IGNORECASE) is None:
      continue
    if re.search(r"deep learning|neural network|lstm|recurrent neural network|rnn|transformer|mlp|convolutional neural network|cnn|gpt", body, re.IGNORECASE) is None:
      continue

    # Publication year: letter-prefixed IDs hold two digits, the rest four
    if paper_id[0].isalpha():
      year = "20" + paper_id[1:3]
    else:
      year = paper_id[:4]

    # Every deep-learning paper counts towards the denominator
    tallies[year]["general"] += 1

    hit = re.search(pattern, body, re.IGNORECASE)
    if hit is None:
      continue

    if print_matches:
      print(paper_id + ": " + hit.group(0))

    # Paper matches the pattern
    tallies[year]["pattern"] += 1

  return tallies

Apply function for all patterns

In [None]:
# results[pattern name][year] = share of that year's deep-learning papers matching the pattern
results = defaultdict(dict)

for name, regex in pattern_dict.items():
  yearly_counts = get_counts(paper_dict, regex)

  for year, tally in yearly_counts.items():
    results[name][year] = tally["pattern"] / tally["general"]

Convert to DataFrame

In [None]:
# One column per pattern, one row per year (sorted); shares become percentages with two decimals.
plot_data = (
    pd.DataFrame.from_dict(results)
    .sort_index()
    .mul(100)
    .round(2)
)

Plot figure

In [None]:
# Use an explicit figure/axes pair so nothing below depends on pyplot's
# implicit "current figure" state.
fig, ax = plt.subplots(figsize=(8, 5))

p = sns.lineplot(data=plot_data, ax=ax)

p.set_xlabel("Year", fontsize=12)
p.set_ylabel("% of deep-learning-related papers", fontsize=12)
p.tick_params(labelsize=12)
p.legend(fontsize=12)

# Format y ticks with one decimal through a formatter instead of
# set_yticklabels: fixed label lists are not tied to tick positions and go
# stale if the ticks are recomputed. A format string is accepted by
# Matplotlib >= 3.3.
p.yaxis.set_major_formatter('{x:,.1f}')

sns.despine(ax=ax, left=True, bottom=True, right=True)

fig.savefig('survey_proportions.png', dpi=300, bbox_inches='tight')