In [1]:
### Data dictionaries

# donations_df data dictionary
# Variable                    Type       Description
# Grantmaker_name             String     Corporation/foundation that gave grant
# Year                        Int        Year grant was given
# Recipient_name              String     Organization that recived grant
# NTEE_code                   String     NTEE code of organization given grant
# NTEE_category               String     Broader category of organization according to IRS
# Grant Amount                Float      Grant amount adjusted for inflation to 2020 dollars
# Recipient_city              String     City of recipient organization
# Recipient_state             String     State of recipient organization


# lasso_df (1) data dictionary
# Variable                    Type       Description
# Individualism               Int        Measure of prevalence of this discourse of delay (DoD) in the text
# The 'free rider' excuse     Int        Measure of prevalence of this DoD in the text in given year
# Whataboutism                Int        Measure of prevalence of this DoD in the text in given year
# All talk, little action     Int        Measure of prevalence of this DoD in the text in given year
# Fossil fuel solutionism     Int        Measure of prevalence of this DoD in the text in given year
# No sticks, just carrots     Int        Measure of prevalence of this DoD in the text in given year
# Technological optimism      Int        Measure of prevalence of this DoD in the text in given year
# Appeal to well-being        Int        Measure of prevalence of this DoD in the text in given year
# Policy perfectionism        Int        Measure of prevalence of this DoD in the text in given year
# Appeal to social justice    Int        Measure of prevalence of this DoD in the text in given year
# Change is impossible        Int        Measure of prevalence of this DoD in the text in given year
# Doomism                     Int        Measure of prevalence of this DoD in the text in given year
# Grant Amount                Float      Grant amount adjusted for inflation to 2020 dollars


# lasso_df (2) data dictionary
# Variable                    Type       Description
# Word w                      Int        Measure of prevalence of word w in the text (all DoD words)
# Grant Amount                Float      Grant amount adjusted for inflation to 2020 dollars


In [2]:
### Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col
from statsmodels.api import OLS

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

import csv
import nltk
import statistics
from nltk.corpus import stopwords # Importing stop words (e.g., the, and, a, of, etc.)
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/finnianlowden/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
### Importing data

# (1)
# Corporate giving dataset - local
complete_donations_df = pd.read_excel("Oil_corporations_NTEE_Data_MASTER_SHEET.xlsx", sheet_name = "Individual_donations")

# Dropping irrelevant columns (those not in data dictionary) from corporate philanthropy dataframe
donations_df = complete_donations_df[["grantmaker_name", "year", "recipient_name", "NTEE_code",
                                      "NTEE_category", "Grant Amount (2020 Dollars)",
                                      "recipient_city", "recipient_state"]]

# Renaming Grant Amount (2020 Dollars) to not include spaces & converting to int
donations_df = donations_df.rename(columns = {"Grant Amount (2020 Dollars)": "grant_amount"})
donations_df["grant_amount"] = donations_df["grant_amount"]


# (2)
# Text data - local
text_df = pd.read_excel("ENVS_documents_for_text_analysis.xlsx")
text_df.drop(columns={'Researcher', 'Word_counts'}, inplace = True)


# (3)
# Discourses of Delay - online
spreadsheet_url = "https://docs.google.com/spreadsheets/d/1MhB60vzde7KT9Ti6eQtimmWvYAEersI4zK3L_gwDNA8/edit#gid=0"
spreadsheet_url = spreadsheet_url.replace("/edit#gid=", "/export?format=csv&gid=")

delay_df = pd.read_csv(spreadsheet_url, header=0)

simple_delay_names = {"Individualism": "Individualism", "Whataboutism":
             "Whataboutism", "Doomism": "Doomism",
             "The 'free rider' excuse": 'Free_rider',
             "All talk, little action": 'Talk_no_action',
             "Fossil fuel solutionism": 'FF_solutionism',
             "No sticks, just carrots": 'Carrots',
             "Technological optimism": 'Tech_optimism',
             "Appeal to well-being": 'Well_being',
             "Policy perfectionism": 'Perfect_policy',
             "Appeal to social justice": 'Social_justice',
             "Change is impossible": 'Change_impossible'}

# Converting dataframe to dictionary with DoD words in list format
complete_discourse_dict = {}
for row in delay_df.iterrows():
    delay_method = row[1]["Sub-category"]
    dict_words = row[1]["Current_dict"].split(", ")
    complete_discourse_dict[simple_delay_names[delay_method]] = dict_words

# Making copy of complete_text_df
discourse_dict = complete_discourse_dict.copy()

# (4)
# Importing annualized_donations - local
annualized_donations_df = pd.read_excel('annualized_donations.xlsx')


In [4]:
### Data wrangling

# (1)
# Text cleaning
# Importing punctuation and stopwords
stopWords = set(stopwords.words('english'))
table_punctuation = str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~’') 

# Cleaning text data
textCleaned = [] # Creating an empty list to store list of cleaned words
for row in text_df["Document_text"]: # Looping through each Tweet in ukraineRussia_df
    rowCleaned = [] # Creating an empty list to store cleaned words from each Tweet
    row_as_list = str(row).split() # Splitting row into a list of words at ' '
    for word in row_as_list: # Looping through each word in row_as_list
        if word not in stopWords and word != "nan":
            text = word.translate(table_punctuation) # Translating punctuation into ''
            textLower = text.lower() # Converting text to lowercase
            rowCleaned.append(textLower) # Appending cleaned word to rowCleaned list
    textCleaned.append(rowCleaned)  # Appending rowCleaned to textCleaned list
text_df["cleaned_text"] = textCleaned


# (2)
# Creating document-term matrix
# Getting list of Discourses of Delay
best_dicts = ["FF_solutionism", "Well_being", "Social_justice", "Carrots"]
top6_dicts = ["FF_solutionism", "Well_being", "Social_justice", "Carrots", "Free_rider", "Whataboutism"]
all_dicts = list(discourse_dict)
delay_types = all_dicts

# Getting all words in DoD dictionaries
delay_vocabulary = set()
for delay in delay_types:
    delay_vocabulary.update(discourse_dict[delay])

# Creating copy of text_df to work with
regression_df = text_df.copy()
    
# Creating DoD_results dict
DoD_results = {}

# Looping through each DoD word
for col in delay_vocabulary:
    wordAppearance = []
    # Looping through each entry in text_df["cleaned_text"]
    for text in text_df["cleaned_text"]:
        mySum = 0
        prevWord = ""
        for word in text:
            # Incrementing if word in DoD vocabulary
            if word == col:
                mySum += 1
            bigram = prevWord + " " + word
            # Incrementing if bigram in DoD vocabulary
            if bigram == col:
                mySum += 1
            # Creating DoD results dict
            if word == col or bigram == col:
                for delay in discourse_dict:
                    og_words = [x.lower() for x in discourse_dict[delay]]
                    if word in og_words:                
                        if delay not in set(DoD_results):
                            DoD_results[delay] = {word}
                        else:
                            DoD_results[delay].add(word)
                    if bigram in og_words:                
                        if delay not in set(DoD_results):
                            DoD_results[delay] = {bigram}
                        else:
                            DoD_results[delay].add(bigram)    
            prevWord = word  
        wordAppearance.append(mySum)
    # Adding word to regression_df if it appears in corpus
    if (sum(wordAppearance) > 0):
        regression_df[col] = wordAppearance
        regression_df = regression_df.copy()

# Creating dict of words with their associated dictionaries      
word_to_DoD = {}
for delay in DoD_results:
    wordSet = DoD_results[delay]
    for word in wordSet:
        if word not in word_to_DoD:
            word_to_DoD[word] = {delay}
        else:
            word_to_DoD[word].add(delay)

# (3)
# Adding donation information to regression_df
annualized_donations_df = annualized_donations_df.rename(
    columns = {"recipient_name": "Organization_name", "year": "Document_year"}) # Renaming group column
regression_df["Organization_name"] = regression_df["Organization_name"].str.lower() # Converting name to lowercase
regression_df = pd.merge(regression_df, annualized_donations_df, on = [
    'Organization_name', 'Document_year'], how = 'outer') # Mergining on organization name and document year
regression_df["grant_amount"] = regression_df["grant_amount"].fillna(0)


# (4)
# Computing word counts for each entry
wordCounts = []
for entry in regression_df["cleaned_text"]:
    wordCounts.append(len(str(entry).split()))
regression_df["Word_counts"] = wordCounts


In [6]:
### Regression with dictionary counts as independent variables

# Making frequency matrix
wordCounts = regression_df["Word_counts"]

# Formatting data for regression
lasso_df = regression_df.drop(columns={'Organization_name', 'Document_title', 'Document_type', 'Reference',
                                      'Word_counts', 'cleaned_text', 'Document_text'})

# Converting intensity to frequency measures
for col in list(lasso_df):
    lasso_df[col] = lasso_df[col] / wordCounts

# Dropping null values
lasso_df.dropna(inplace = True)

# Splitting data into X and Y
y = lasso_df['grant_amount']
X = lasso_df.drop(columns = {'grant_amount'})

# Combining word counts into dictionary counts
dict_sums = {} # Creating dict to store results

# Removing words from discourse_dict
wordsInText = list(lasso_df.drop(columns = {'grant_amount'}))
discourse_dict_reg = discourse_dict.copy()

for delay in discourse_dict_reg:
    tempList = discourse_dict_reg[delay].copy() # Creating copy to not modify original list
    for word in discourse_dict_reg[delay]:
        if word not in wordsInText:
            tempList.remove(word)
    discourse_dict_reg[delay] = tempList # Adjusting pointer to copy
    dict_sums[delay] = list(pd.DataFrame.sum(X[discourse_dict_reg[delay]], axis=1))
    
# Converting dictionary to DataFrame
X = pd.DataFrame.from_dict(dict_sums)

# Lasso Regression (ends up being the same as a linear regression)
# # Setting test size to 0.2 means that 80% of my data will be used to train and 20% will be used for testing
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1680)

# # Creating variables to store alphas
# dictErrs = {}
# cv_errs = []
# coefs = []

# # Running a LASSO regression here
# potentialAlphas = np.linspace(1e-6, 1, num = 50).tolist()
# for alpha in potentialAlphas:
#     lassoReg = Lasso(alpha = alpha, normalize = True)
#     lassoReg.fit(X_train, y_train)
#     y_pred = lassoReg.predict(X_test)
#     cv_err = np.mean((y_pred - y_test)**2)
#     cv_errs.append(cv_err)
#     coefs.append(lassoReg.coef_)
#     dictErrs[cv_err] = alpha

# # Running LASSO with optimal alpha
# lassoReg = Lasso(alpha = dictErrs[min(cv_errs)],
#                  normalize = True) # running LASSO with best alpha (when 0 -> OLS regression)
# lassoReg = lassoReg.fit(X_train, y_train)

# # Printing resuls from LASSOreg
# lasso_coef = pd.DataFrame({'var':X.columns, 'val_lasso':lassoReg.coef_})
# lasso_coef.sort_values(by='val_lasso', ascending=False)

# Creating linear regression
olsReg = LinearRegression()
olsReg = olsReg.fit(X, y)

# Printing resuls from olsReg
ols_coef = pd.DataFrame({'var':X.columns, 'val_ols':olsReg.coef_})
ols_coef.sort_values(by='val_ols', ascending=False)


Unnamed: 0,var,val_ols
6,Tech_optimism,85023.242408
0,Individualism,11042.172551
3,Talk_no_action,3216.137578
1,Free_rider,-40609.990091
2,Whataboutism,-44315.521828
7,Well_being,-47608.25816
4,FF_solutionism,-50981.46956
11,Doomism,-52389.154298
5,Carrots,-62002.457146
10,Change_impossible,-78427.801659


In [7]:
### Lasso reg with words as independent variables

# Making frequency matrix
wordCounts = regression_df["Word_counts"]

# Formatting data for Lasso regression
lasso_df = regression_df.drop(columns={'Organization_name', 'Document_title', 'Document_type', 'Reference',
                                      'Word_counts', 'cleaned_text', 'Document_year', 'Document_text'})

# Converting intensity to frequency measures
for col in list(lasso_df):
    lasso_df[col] = lasso_df[col] / wordCounts

# Dropping null values
lasso_df.dropna(inplace = True)

# Splitting data into X and Y
y = lasso_df['grant_amount']
X = lasso_df.drop(columns = {'grant_amount'})

# Setting test size to 0.2 means that 80% of my data will be used to train and 20% will be used for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1680)

# Creating variables to store alphas
dictErrs = {}
cv_errs = []
coefs = []

# Running a LASSO regression here
potentialAlphas = np.linspace(1e-6, 1, num = 50).tolist()
for alpha in potentialAlphas:
    lassoReg = Lasso(alpha = alpha, normalize = True)
    lassoReg.fit(X_train, y_train)
    y_pred = lassoReg.predict(X_test)
    cv_err = np.mean((y_pred - y_test)**2)
    cv_errs.append(cv_err)
    coefs.append(lassoReg.coef_)
    dictErrs[cv_err] = alpha

# Running LASSO with optimal alpha
lassoReg = Lasso(alpha = dictErrs[min(cv_errs)], normalize = True) # running LASSO with best alpha
lassoReg = lassoReg.fit(X_train, y_train)

# Printing resuls from LASSOreg
lasso_coef = pd.DataFrame({'var':X.columns, 'val_lasso':lassoReg.coef_})
dict_list = []
for word in X.columns:
    dict_list.append(str(word_to_DoD[word]).replace("{", "").replace("}", "").replace("'", ""))
lasso_coef['associated_dicts'] = dict_list
lasso_coef.sort_values(by='val_lasso', ascending=False)


Unnamed: 0,var,val_lasso,associated_dicts
97,commitments,286647.093938,Talk_no_action
55,horizon,265381.309986,Tech_optimism
7,near future,88276.683161,Tech_optimism
47,investment,58699.936016,Tech_optimism
18,lost,50324.493124,Well_being
...,...,...,...
33,fate,-52038.246408,Doomism
88,promised,-62772.375796,Talk_no_action
66,impossible,-68368.789865,Doomism
20,threat,-68904.588314,Well_being


In [8]:
### Pulling out text excerpts with most delay rhetoric

temp_df = lasso_coef[lasso_coef['val_lasso'] > 0].sort_values(by='val_lasso', ascending=False)

for word in list(temp_df['var']):
    print(word)
    for row in regression_df['Document_text']:
        delay_sentences = row.split(".")
        for sentence in delay_sentences:
            if word in sentence:
                 print("\n" + sentence)
    print("####################################################################")


commitments

 • Negotiated commitments from private landowners adjacent to Red Rocks Lakes NWR, to develop conservation strategies for their lands

 Several important triumphs marked the year, including: • Launching the Los Angeles Nature Center project by creating an expansive coalition of supporters and by securing early and significant financial commitments from the Packard Foundation, Irvine Foundation, and the state

 These plans allow the "take" of threatened or endangered species in return for commitments to long-term habitat conservation

 Many leading companies doubled down on their commitments to carbon neutrality and sustainability, showing a remarkable willingness to lead

 And we worked with governments so they could further their commitments to protecting the health of their nations and their people

 Caring for the Earth is a unifying platform that has the power to activate young people, transform the behavior of companies and activate the commitments of governments

To 

AttributeError: 'float' object has no attribute 'split'

In [9]:
list(temp_df['var'])

['commitments', 'horizon', 'near future', 'investment', 'lost']

In [11]:
### Robustness checks

# (1)
# Removing all positive coefficient dicts to see if signs change on others -- they do not!

# Making frequency matrix
wordCounts = regression_df["Word_counts"]

# Formatting data for regression
lasso_df = regression_df.drop(columns={'Organization_name', 'Document_title', 'Document_type', 'Reference',
                                      'Word_counts', 'cleaned_text', 'Document_text'})

# Converting intensity to frequency measures
for col in list(lasso_df):
    lasso_df[col] = lasso_df[col] / wordCounts

# Dropping null values
lasso_df.dropna(inplace = True)

# Splitting data into X and Y
y = lasso_df['grant_amount']
X = lasso_df.drop(columns = {'grant_amount'})

# Combining word counts into dictionary counts
dict_sums = {} # Creating dict to store results

# Removing words from discourse_dict
wordsInText = list(lasso_df.drop(columns = {'grant_amount'}))
discourse_dict_reg = discourse_dict.copy()

for delay in discourse_dict_reg:
    tempList = discourse_dict_reg[delay].copy() # Creating copy to not modify original list
    for word in discourse_dict_reg[delay]:
        if word not in wordsInText:
            tempList.remove(word)
    discourse_dict_reg[delay] = tempList # Adjusting pointer to copy
    dict_sums[delay] = list(pd.DataFrame.sum(X[discourse_dict_reg[delay]], axis=1))
    
# Converting dictionary to DataFrame & dropping all positive columns
X = pd.DataFrame.from_dict(dict_sums).drop(columns={"Tech_optimism", "Individualism", "Talk_no_action"})

# Creating linear regression
olsReg = LinearRegression()
olsReg = olsReg.fit(X, y)

# Printing resuls from olsReg
ols_coef = pd.DataFrame({'var':X.columns, 'val_ols':olsReg.coef_})
ols_coef.sort_values(by='val_ols', ascending=False)


Unnamed: 0,var,val_ols
1,Whataboutism,-35154.827099
0,Free_rider,-35761.242755
4,Well_being,-49023.99034
2,FF_solutionism,-51170.305826
8,Doomism,-52275.566931
3,Carrots,-62293.048372
7,Change_impossible,-75920.461365
6,Social_justice,-87322.590603
5,Perfect_policy,-101470.785556
