## Libraries

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import string
import os
from collections import Counter

### Predefined Bigrams

In [2]:
# Define the predefined bigrams related to climate change (C₀).
#
# The literal below lists the seed bigrams followed by the three panels of
# Table IA. IV exactly as they were collected, so the origin of every entry
# stays visible. The panels repeat many seed entries; dict.fromkeys() removes
# those repeats while keeping the first occurrence and the original order, so
# membership tests see exactly the same bigrams as before.
predefined_bigrams_C0 = list(dict.fromkeys([
    ("air", "pollution"),
    ("air", "quality"),
    ("air", "temperature"),
    ("biomass", "energy"),
    ("carbon", "dioxide"),
    ("carbon", "emission"),
    ("carbon", "energy"),
    ("carbon", "neutral"),
    ("carbon", "price"),
    ("carbon", "sink"),
    ("carbon", "tax"),
    ("clean", "air"),
    ("clean", "energy"),
    ("clean", "water"),
    ("climate", "change"),
    ("coastal", "area"),
    ("coastal", "region"),
    ("electric", "vehicle"),
    ("energy", "climate"),
    ("energy", "conversion"),
    ("energy", "efficient"),
    ("energy", "environment"),
    ("environmental", "sustainability"),
    ("extreme", "weather"),
    ("flue", "gas"),
    ("forest", "land"),
    ("gas", "emission"),
    ("ghg", "emission"),
    ("global", "decarbonization"),
    ("global", "warm"),
    ("greenhouse", "gas"),
    ("heat", "power"),
    ("Kyoto", "protocol"),
    ("natural", "hazard"),
    ("new", "energy"),
    ("ozone", "layer"),
    ("renewable", "energy"),
    ("sea", "level"),
    ("sea", "water"),
    ("snow", "ice"),
    ("solar", "energy"),
    ("solar", "thermal"),
    ("sustainable", "energy"),
    ("water", "resource"),
    ("water", "resources"),
    ("wave", "energy"),
    ("weather", "climate"),
    ("wind", "energy"),
    ("wind", "power"),
    ("wind", "resource"),

    # Bigrams from Table IA. IV - Panel A (Opportunity Bigrams)
    ("heat", "power"),
    ("new", "energy"),
    ("plug", "hybrid"),
    ("rooftop", "solar"),
    ("renewable", "electricity"),
    ("renewable", "energy"),
    ("wind", "power"),
    ("renewable", "resource"),
    ("solar", "farm"),
    ("sustainable", "energy"),
    ("electric", "vehicle"),
    ("wind", "energy"),
    ("solar", "energy"),
    ("hybrid", "car"),
    ("clean", "energy"),
    ("electric", "hybrid"),
    ("geothermal", "power"),

    # Bigrams from Table IA. IV - Panel B (Regulatory Bigrams)
    ("greenhouse", "gas"),
    ("gas", "emission"),
    ("carbon", "tax"),
    ("emission", "trade"),
    ("carbon", "reduction"),
    ("reduce", "emission"),
    ("air", "pollution"),
    ("carbon", "price"),
    ("dioxide", "emission"),
    ("carbon", "market"),
    ("carbon", "emission"),
    ("reduce", "carbon"),
    ("environmental", "standard"),
    ("epa", "regulation"),
    ("mercury", "emission"),
    ("carbon", "dioxide"),
    ("energy", "regulatory"),
    ("nox", "emission"),
    ("energy", "independence"),

    # Bigrams from Table IA. IV - Panel C (Physical Bigrams)
    ("coastal", "area"),
    ("forest", "land"),
    ("storm", "water"),
    ("natural", "hazard"),
    ("water", "discharge"),
    ("global", "warm"),
    ("sea", "level"),
    ("heavy", "snow"),
    ("sea", "water"),
    ("ice", "product"),
    ("snow", "ice"),
    ("nickel", "metal"),
    ("air", "water"),
    ("warm", "climate"),
]))

### Loading IPCC Dataset

In [3]:
import os

# Directory containing the IPCC report text files.
# The IPCC_TEXT_DIR environment variable overrides the machine-specific
# default, so the notebook can run elsewhere without editing this cell.
directory = os.environ.get(
    "IPCC_TEXT_DIR",
    r'C:\Users\fariz\Documents\Graduate Data\Climate Change\Fix Data\IPCC\IPCC\raw_txt\IPCC',
)

# Read every .txt report. The filenames are sorted because os.listdir returns
# entries in arbitrary order, and the concatenation order decides which
# tokens end up next to each other at file boundaries.
report_texts = []
for filename in sorted(os.listdir(directory)):
    if filename.endswith(".txt"):
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            report_texts.append(file.read())

# Single string holding all reports, each one followed by a newline
ipcc_text = "".join(text + "\n" for text in report_texts)

print("Loaded all IPCC reports into a single text string.")


Loaded all IPCC reports into a single text string.


In [4]:
# Preview the first 500 characters of the combined IPCC text as a quick sanity check
print(ipcc_text[:500])

Summary for  
Policymakers
SPM
3
Summary 
for Policymakers
Drafting Authors: 
Nerilie Abram (Australia), Carolina Adler (Switzerland/Australia), Nathaniel L. Bindoff (Australia), 
Lijing Cheng (China), So-Min Cheong (Republic of Korea), William  W.  L. Cheung (Canada), 
Matthew Collins (UK), Chris Derksen (Canada), Alexey Ekaykin (Russian Federation), Thomas 
Frölicher (Switzerland), Matthias Garschagen (Germany), Jean-Pierre Gattuso (France), Bruce 
Glavovic (New Zealand), Stephan Gruber (Canad


## Lemmatize and stem the IPCC text, removing digits, punctuation, and stop words

In [5]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer and stemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Build the stop-word lookup once. Calling stopwords.words('english') inside
# the comprehension would rebuild the list and scan it linearly for every
# single token; a set gives the same answers with constant-time lookups.
english_stopwords = set(stopwords.words('english'))

# Tokenize the lower-cased text into words
tokens = word_tokenize(ipcc_text.lower())

# Drop stop words, punctuation tokens and all-digit tokens, then lemmatize and
# stem whatever is left.
# NOTE(review): `word not in string.punctuation` is a substring test, so a
# multi-character token such as "--" is kept - confirm that this is intended.
cleaned_words = [
    stemmer.stem(lemmatizer.lemmatize(word))
    for word in tokens
    if word not in english_stopwords
    and word not in string.punctuation
    and not word.isdigit()
]

print("Text preprocessing completed.")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fariz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fariz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fariz\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Text preprocessing completed.


#### Filtering Bigrams 

In [6]:
## Library Bigrams
from nltk import bigrams
from collections import Counter

In [7]:
# Pair every cleaned token with its successor
ipcc_bigrams = list(bigrams(cleaned_words))

# Tally how often each pair occurs
bigram_freq = Counter()
bigram_freq.update(ipcc_bigrams)

# Retain only the pairs that occur more than 10 times
filtered_bigrams = dict(
    (pair, occurrences)
    for pair, occurrences in bigram_freq.items()
    if occurrences > 10
)

print(f"Filtered Bigrams: {len(filtered_bigrams)} bigrams with frequency > 10")

Filtered Bigrams: 181168 bigrams with frequency > 10


### Preprocessing the Non-climate-change texts

In [8]:
# Set the directory where your BBC data is stored.
# The BBC_TEXT_DIR environment variable overrides the machine-specific
# default, so the notebook can run elsewhere without editing this cell.
directory = os.environ.get(
    "BBC_TEXT_DIR",
    r'C:\Users\fariz\Documents\Graduate Data\Climate Change\Model\bbc-fulltext\bbc',
)

# Collect the text of every article. Both directory listings are sorted
# because os.listdir returns entries in arbitrary order, and the
# concatenation order decides which tokens meet at file boundaries.
article_texts = []
for subdir in sorted(os.listdir(directory)):
    subdir_path = os.path.join(directory, subdir)
    if os.path.isdir(subdir_path):  # Skip plain files next to the category folders
        # Loop through each text file in the subdirectory
        for filename in sorted(os.listdir(subdir_path)):
            if filename.endswith(".txt"):
                file_path = os.path.join(subdir_path, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    article_texts.append(file.read())

# Single string holding all non-climate text, each article followed by a newline
non_climate_text = "".join(text + "\n" for text in article_texts)

print("Loaded all non-climate text into a single string.")

# Preprocess the non-climate text
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Build the stop-word lookup once instead of rebuilding the list per token
english_stopwords = set(stopwords.words('english'))

# Tokenize the lower-cased non-climate-change text
non_climate_tokens = word_tokenize(non_climate_text.lower())

# Drop stop words, punctuation tokens and all-digit tokens, then lemmatize and
# stem whatever is left
cleaned_non_climate_words = [
    stemmer.stem(lemmatizer.lemmatize(word))
    for word in non_climate_tokens
    if word not in english_stopwords
    and word not in string.punctuation
    and not word.isdigit()
]

# Generate bigrams for non-climate-change text
non_climate_bigrams = list(nltk.bigrams(cleaned_non_climate_words))

# Count the frequency of each bigram
non_climate_bigram_freq = Counter(non_climate_bigrams)

print(f"Generated {len(non_climate_bigrams)} bigrams for non-climate-change text.")

Loaded all non-climate text into a single string.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fariz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fariz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fariz\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Generated 517633 bigrams for non-climate-change text.


### Identify the climate-change bigrams that are not contaminated by non-climate-change topics

In [9]:
# Keep the frequent IPCC bigrams (Cᵀ) that never occur in the non-climate corpus (N)
unique_climate_bigrams = dict(
    (pair, occurrences)
    for pair, occurrences in filtered_bigrams.items()
    if pair not in non_climate_bigram_freq
)

print(f"Identified {len(unique_climate_bigrams)} unique climate-change bigrams.")

Identified 166328 unique climate-change bigrams.


### Create sentence set M

In [10]:
# Split the combined IPCC text (original casing, uncleaned) into sentences
ipcc_sentences = nltk.sent_tokenize(ipcc_text)

In [None]:
# Filter sentences to create set M: a sentence is kept when at least one of
# its bigrams is a key of unique_climate_bigrams.
#
# Those keys were built from lower-cased, stop-word-free, lemmatized and
# stemmed tokens, so every sentence is cleaned with the same rule before its
# bigrams are looked up. Raw `sentence.split()` tokens (original case,
# unstemmed, punctuation attached) do not line up with the keys.
english_stopwords = set(stopwords.words('english'))

M = []
for sentence in ipcc_sentences:
    sentence_tokens = [
        stemmer.stem(lemmatizer.lemmatize(word))
        for word in word_tokenize(sentence.lower())
        if word not in english_stopwords
        and word not in string.punctuation
        and not word.isdigit()
    ]
    # One dictionary lookup per bigram of the sentence, instead of rebuilding
    # the sentence's bigram list for every known climate bigram
    if any(pair in unique_climate_bigrams for pair in bigrams(sentence_tokens)):
        M.append(sentence)

print(f"Filtered set M contains {len(M)} sentences likely discussing climate change.")

### Reference and search set

In [None]:
# Define the reference set R: the sentences of M that contain at least one
# predefined C₀ bigram.
english_stopwords = set(stopwords.words('english'))

# Bring the predefined bigrams to the same lower-cased, lemmatized and stemmed
# form as the sentence tokens below, so that inflected forms in the text
# (plurals, "-ing" forms, capitalised words) still match their bigram
reference_bigrams = {
    tuple(stemmer.stem(lemmatizer.lemmatize(word.lower())) for word in pair)
    for pair in predefined_bigrams_C0
}

R = []
for sentence in M:
    sentence_tokens = [
        stemmer.stem(lemmatizer.lemmatize(word))
        for word in word_tokenize(sentence.lower())
        if word not in english_stopwords
        and word not in string.punctuation
        and not word.isdigit()
    ]
    # Set lookup per sentence bigram instead of rebuilding the sentence's
    # bigram list once for every predefined bigram
    if not reference_bigrams.isdisjoint(bigrams(sentence_tokens)):
        R.append(sentence)

print(f"Reference set R contains {len(R)} sentences.")

In [None]:
## Set S (search set): the sentences of M that are not in the reference set R
# Membership is tested against a set, so building S stays linear in len(M)
# instead of scanning the whole list R for every sentence
reference_sentences = set(R)
S = [sentence for sentence in M if sentence not in reference_sentences]

### Model Training

In [None]:
# Library
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

### Training Data Preparation

In [None]:
# Sample from S to create balanced training data.
# - A dedicated, seeded generator makes the draw identical on every run.
# - The sample size is capped at len(S): random.sample raises ValueError when
#   asked for more items than the population contains.
sample_size = min(len(R), len(S))
sampled_S = random.Random(42).sample(S, sample_size)

# Combine R (label 1) and the sampled S sentences (label 0) into a training set
training_sentences = R + sampled_S
labels = [1] * len(R) + [0] * len(sampled_S)

# Convert text data into numerical features
vectorizer = CountVectorizer(ngram_range=(1, 2))  # Use unigrams and bigrams
X = vectorizer.fit_transform(training_sentences)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

In [None]:
# Train classifiers
# SVC(probability=True) and RandomForestClassifier are both randomised, so
# they get a fixed random_state to make the reported accuracies reproducible.

# Naive Bayes
nb = MultinomialNB()
nb.fit(X_train, y_train)
nb_preds = nb.predict(X_test)

# SVM (probability=True enables predict_proba on the fitted model)
svm = SVC(probability=True, random_state=42)
svm.fit(X_train, y_train)
svm_preds = svm.predict(X_test)

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

# Evaluate models on the held-out split
print("Naive Bayes Accuracy:", accuracy_score(y_test, nb_preds))
print("SVM Accuracy:", accuracy_score(y_test, svm_preds))
print("Random Forest Accuracy:", accuracy_score(y_test, rf_preds))

In [None]:
### Classification and prediction
# Predict, for every sentence of the search set S, the probability of
# belonging to the reference class (label 1)
S_vectorized = vectorizer.transform(S)
nb_probs = nb.predict_proba(S_vectorized)[:, 1]  # Probability of belonging to R
# The SVM was fitted with probability=True, so predict_proba is available.
# decision_function returns unbounded signed margins, which cannot be
# compared with a probability threshold.
svm_probs = svm.predict_proba(S_vectorized)[:, 1]  # SVM probability
rf_probs = rf.predict_proba(S_vectorized)[:, 1]  # Random Forest probability

# Combine predictions and create target set T: a sentence is kept when at
# least one classifier assigns it a probability above the threshold
threshold = 0.8
T = [
    sentence
    for sentence, p_nb, p_svm, p_rf in zip(S, nb_probs, svm_probs, rf_probs)
    if p_nb > threshold or p_svm > threshold or p_rf > threshold
]

print(f"Target set T contains {len(T)} sentences likely discussing climate change.")

### Final Bigrams

In [None]:
# Extract and compare bigrams from T and S \ T.
#
# `preprocess` is not defined in the cells shown above, so the cleaning rule
# used for cleaned_words (lower-case, drop stop words / punctuation / digits,
# lemmatize, stem) is applied inline here.
english_stopwords = set(stopwords.words('english'))
target_sentences = set(T)

# T is drawn from S, so one pass over S fills both counters. Every occurrence
# of a sentence is counted on both sides; the previous set(S) - set(T)
# de-duplicated only the S \ T side, which made the two counts incomparable.
T_bigrams = Counter()
S_minus_T_bigrams = Counter()
for sentence in S:
    sentence_tokens = [
        stemmer.stem(lemmatizer.lemmatize(word))
        for word in word_tokenize(sentence.lower())
        if word not in english_stopwords
        and word not in string.punctuation
        and not word.isdigit()
    ]
    counter = T_bigrams if sentence in target_sentences else S_minus_T_bigrams
    counter.update(bigrams(sentence_tokens))

# Discriminative bigrams: more frequent in T than S \ T
discriminative_bigrams = {
    pair: occurrences
    for pair, occurrences in T_bigrams.items()
    if occurrences > S_minus_T_bigrams.get(pair, 0)
}

# Rank and finalize the bigram set (most frequent in T first)
final_bigrams = sorted(discriminative_bigrams, key=discriminative_bigrams.get, reverse=True)

print(f"Final bigram library contains {len(final_bigrams)} bigrams.")