# Climate Stems Creation


## Setup


In [2]:
import ssl
import fitz
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import pandas as pd

DATA_PATH = 'data/'
DIST_PATH = 'dist/'

# Ignore SSL certificate errors
ssl._create_default_https_context = ssl._create_unverified_context

nltk.download('stopwords')
nltk.download('punkt')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/felixwallis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/felixwallis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Extracting climate terms and their definitions from the IPCC Sixth Assessment Report Glossary


### Extracting the text from the IPCC Sixth Assessment Report Glossary PDF


In [4]:
glossary_path = DATA_PATH + 'IPCC Sixth Assessment Report Glossary.pdf'
doc = fitz.open(glossary_path)

# Remomving the first and final three pages of the glossary
glossary_text = [page.get_text() for page in doc]
glossary_text.pop(0)
glossary_text = glossary_text[:-3]

# Concatenating the glossary text
glossary_text_string = ' '.join(glossary_text)

# Removing superfluous text
pattern = r"(Approval Session|Glossary|IPCC SR1\.5|Do Not Cite, Quote or Distribute|Total pages: \d+|See [A-Za-z]+\.|1-\d+)"
cleaned_glossary_text = re.sub(pattern, '', glossary_text_string).strip()

### Creating a rough dictionary of climate terms and their definitions


In [5]:
chunks = re.split(r'\s{5,}', cleaned_glossary_text)

terms = []
definitions = []

for chunk in chunks:
    split_chunk = re.split(r'\s{2,}', chunk)
    term = split_chunk[0]
    definition = ' '.join(split_chunk[1:])

    terms.append(term)
    definitions.append(definition)

climate_dictionary = pd.DataFrame({'term': terms, 'definition': definitions})
climate_dictionary = climate_dictionary.drop_duplicates(
    subset='term', keep='first')

climate_dictionary.to_csv(
    DATA_PATH + 'rough_climate_dictionary.csv', index=False)

The dictionary is manually cleaned at this point to create the [`cleaned_climate_dictionary.csv` file](https://docs.google.com/spreadsheets/d/1a1rvYR6gQWmUY9fYlmm2lxK9xqiRB0I6LqsUNNZ6eUw/edit#gid=1267656563).


## Turning the climate terms into a dictionary of stemmed unique unigrams


### Preprocessing function for the climate terms


In [6]:
stopwords = set(stopwords.words('english'))
stemmer = PorterStemmer()


def clean_tokenize(text):
    # Text should almost always be a string, but we check
    # just in case
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    filtered_tokens = [token for token in tokens if token not in stopwords]
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]

    return stemmed_tokens

### Tokenizing and stemming the climate terms


In [7]:
cleaned_climate_dictionary = pd.read_csv(
    DATA_PATH + 'cleaned_climate_dictionary.csv')

cleaned_climate_dictionary['cleaned_term'] = cleaned_climate_dictionary['term'].apply(
    clean_tokenize)
exploded_climate_terms_df = cleaned_climate_dictionary.explode('cleaned_term')
unique_climate_terms = exploded_climate_terms_df['cleaned_term'].unique()
unique_climate_terms_df = pd.DataFrame(unique_climate_terms, columns=[
                                       'stem']).sort_values(by='stem').reset_index(drop=True)

unique_climate_terms_df.to_csv(
    DIST_PATH + 'climate_stems.csv', index=False)