# Climate Dictionary Creation


This notebook creates a dictionary of climate terms which can be used to filter Hansard and the Congressional Record to create a climate corpus. The notebook uses the IPCC Sixth Assessment Report Glossary as the dictionary's basis.


## Setup


In [70]:
import fitz
import re
import pandas as pd

# Directory (relative to the working directory) holding the input files,
# e.g. the IPCC glossary PDF. The trailing slash is required because paths
# are built by plain string concatenation.
data_path = 'data/'
# Directory (relative to the working directory) where generated outputs
# (cleaned glossary text, dictionary CSV) are written. Trailing slash
# required for the same reason.
dist_path = 'dist/'

## Extracting climate terms and their definitions from the IPCC Sixth Assessment Report Glossary


### Extracting the text from the IPCC Sixth Assessment Report Glossary PDF


In [22]:
# Locate the glossary PDF inside the data directory and open it with
# PyMuPDF (imported as `fitz`); `doc` is iterated page by page further down.
glossary_filename = 'IPCC Sixth Assessment Report Glossary.pdf'
glossary_path = data_path + glossary_filename
doc = fitz.open(glossary_path)

### Glossary cleaning


In [73]:
# Extract the text of every page, then drop the first page and the final
# three pages of the glossary
glossary_text = []
for page in doc:
    glossary_text.append(page.get_text())
del glossary_text[0]
glossary_text = glossary_text[:-3]

# Join the remaining pages into one string, separated by single spaces
glossary_text_string = ' '.join(glossary_text)

# Strip superfluous text: running headers/footers, single-word "See X."
# cross-references and "1-<digits>" tokens, then trim surrounding whitespace
pattern = r"(Approval Session|Glossary|IPCC SR1\.5|Do Not Cite, Quote or Distribute|Total pages: \d+|See [A-Za-z]+\.|1-\d+)"
superfluous_text = re.compile(pattern)
cleaned_glossary_text = superfluous_text.sub('', glossary_text_string).strip()

In [68]:
# Writing the cleaned glossary text to a file.
# The encoding is pinned to UTF-8: without it, open() falls back to the
# platform's locale encoding, and PDF-extracted text containing non-ASCII
# characters can then fail to encode or produce a non-portable file.
with open(dist_path + 'glossary.txt', 'w', encoding='utf-8') as f:
    f.write(cleaned_glossary_text)

### Creating a rough dataframe of climate terms and their definitions


In [85]:
# Glossary entries are separated by runs of five or more whitespace characters
chunks = re.split(r'\s\s\s\s\s+', cleaned_glossary_text)

# Within an entry, pieces are separated by double whitespace: the first piece
# is taken as the term, the remaining pieces are re-joined with single spaces
# to form the definition. Each entry is split only once.
entry_parts = [re.split(r'\s\s', chunk) for chunk in chunks]
terms = [parts[0] for parts in entry_parts]
definitions = [' '.join(parts[1:]) for parts in entry_parts]

# Assemble the rough dictionary, keeping only the first occurrence of each term
climate_dictionary = (
    pd.DataFrame({'term': terms, 'definition': definitions})
    .drop_duplicates(subset='term', keep='first')
)

# Persist the rough dictionary without the index column
raw_dictionary_path = dist_path + 'raw_climate_dictionary.csv'
climate_dictionary.to_csv(raw_dictionary_path, index=False)