In [141]:
### Data dictionaries

# donations_df data dictionary
# Variable                    Type       Description
# Grantmaker_name             String     Corporation/foundation that gave grant
# Year                        Int        Year grant was given
# Recipient_name              String     Organization that recived grant
# NTEE_code                   String     NTEE code of organization given grant
# NTEE_category               String     Broader category of organization according to IRS
# Grant Amount                Float      Grant amount adjusted for inflation to 2020 dollars
# Recipient_city              String     City of recipient organization
# Recipient_state             String     State of recipient organization


# text_df data dictionary
# Variable                    Type       Description
# Group                       String     Name of environmental nonprofit
# Individualism               Int        Measure of prevalence of this discourse of delay (DoD) in the text
# The 'free rider' excuse     Int        Measure of prevalence of this DoD in the text in given year
# Whataboutism                Int        Measure of prevalence of this DoD in the text in given year
# All talk, little action     Int        Measure of prevalence of this DoD in the text in given year
# Fossil fuel solutionism     Int        Measure of prevalence of this DoD in the text in given year
# No sticks, just carrots     Int        Measure of prevalence of this DoD in the text in given year
# Technological optimism      Int        Measure of prevalence of this DoD in the text in given year
# Appeal to well-being        Int        Measure of prevalence of this DoD in the text in given year
# Policy perfectionism        Int        Measure of prevalence of this DoD in the text in given year
# Appeal to social justice    Int        Measure of prevalence of this DoD in the text in given year
# Change is impossible        Int        Measure of prevalence of this DoD in the text in given year
# Doomism                     Int        Measure of prevalence of this DoD in the text in given year
# Year                        Int        Year associated with prevalence measure


In [142]:
### Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

import csv
import nltk
import statistics
from nltk.corpus import stopwords # Importing stop words (e.g., the, and, a, of, etc.)
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/finnianlowden/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [143]:
### Importing data

# (1)
# Corporate giving dataset - local
complete_donations_df = pd.read_excel("Oil_corporations_NTEE_Data_MASTER_SHEET.xlsx", sheet_name = "Individual_donations")

# Dropping irrelevant columns (those not in data dictionary) from corporate philanthropy dataframe
donations_df = complete_donations_df[["grantmaker_name", "year", "recipient_name", "NTEE_code",
                                      "NTEE_category", "Grant Amount (2020 Dollars)",
                                      "recipient_city", "recipient_state"]]

# Renaming Grant Amount (2020 Dollars) to not include spaces & converting to int
donations_df = donations_df.rename(columns = {"Grant Amount (2020 Dollars)": "grant_amount"})
donations_df["grant_amount"] = donations_df["grant_amount"]

# (2)
# Text data
# Adding experiment text data - online
spreadsheet_url = "https://docs.google.com/spreadsheets/d/1Zr4SQFxq8u3FnQwRyIHCoQ1Az_lb1PXd45Dhkni7Uok/edit#gid=570879331"
spreadsheet_url = spreadsheet_url.replace("/edit#gid=", "/export?format=csv&gid=") # Online
experiment_text_df = pd.read_csv(spreadsheet_url, header=0) # Online
# Adding control text data
spreadsheet_url = "https://docs.google.com/spreadsheets/d/1Zr4SQFxq8u3FnQwRyIHCoQ1Az_lb1PXd45Dhkni7Uok/edit#gid=1393581184"
spreadsheet_url = spreadsheet_url.replace("/edit#gid=", "/export?format=csv&gid=") # Online
control_text_df = pd.read_csv(spreadsheet_url, header=0) # Online
# Joining control and experiment
complete_text_df = pd.concat([experiment_text_df, control_text_df])

# Making copy of complete_text_df
text_df = complete_text_df.copy()
text_df.drop(columns={'Researcher'}, inplace = True)


# (3)
# Discourses of Delay - online
spreadsheet_url = "https://docs.google.com/spreadsheets/d/1MhB60vzde7KT9Ti6eQtimmWvYAEersI4zK3L_gwDNA8/edit#gid=0"
spreadsheet_url = spreadsheet_url.replace("/edit#gid=", "/export?format=csv&gid=")

delay_df = pd.read_csv(spreadsheet_url, header=0)

simple_delay_names = {"Individualism": "Individualism", "Whataboutism":
             "Whataboutism", "Doomism": "Doomism",
             "The 'free rider' excuse": 'Free_rider',
             "All talk, little action": 'Talk_no_action',
             "Fossil fuel solutionism": 'FF_solutionism',
             "No sticks, just carrots": 'Carrots',
             "Technological optimism": 'Tech_optimism',
             "Appeal to well-being": 'Well_being',
             "Policy perfectionism": 'Perfect_policy',
             "Appeal to social justice": 'Social_justice',
             "Change is impossible": 'Change_impossible'}

complete_discourse_dict = {}
for row in delay_df.iterrows():
    delay_method = row[1]["Sub-category"]
    dict_words = row[1]["Current_dict"].split(", ")
    complete_discourse_dict[simple_delay_names[delay_method]] = dict_words

# Making copy of complete_text_df
discourse_dict = complete_discourse_dict.copy()

# (4)
# Importing annualized_donations
annualized_donations_df = pd.read_excel('annualized_donations.xlsx')


In [144]:
### Data wrangling

# (1)
# Text cleaning
# Importing punctuation and stopwords
stopWords = set(stopwords.words('english'))
table_punctuation = str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~’') 

# Cleaning text data
textCleaned = [] # Creating an empty list to store list of cleaned words
for row in text_df["Document_text"]: # Looping through each Tweet in ukraineRussia_df
    rowCleaned = [] # Creating an empty list to store cleaned words from each Tweet
    row_as_list = str(row).split() # Splitting row into a list of words at ' '
    for word in row_as_list: # Looping through each word in row_as_list
        if word not in stopWords and word != "nan":
            text = word.translate(table_punctuation) # Translating punctuation into ''
            textLower = text.lower() # Converting text to lowercase
            rowCleaned.append(textLower) # Appending cleaned word to rowCleaned list
    textCleaned.append(rowCleaned)  # Appending rowCleaned to textCleaned list

text_df["cleaned_text"] = textCleaned


# (2)
# Creating document-term matrix
# Getting list of Discourses of Delay
best_dicts = ["FF_solutionism", "Well_being", "Social_justice", "Carrots"]
top6_dicts = ["FF_solutionism", "Well_being", "Social_justice", "Carrots", "Free_rider", "Whataboutism"]
all_dicts = list(discourse_dict)
delay_types = all_dicts

# Getting all words in DoD dictionaries
delay_vocabulary = set()
for delay in delay_types:
    delay_vocabulary.update(discourse_dict[delay])
    
regression_df = text_df.copy()
    

# Creating DoD_results dict
DoD_results = {}
    
for col in delay_vocabulary:
    wordAppearance = []
    for text in text_df["cleaned_text"]:
        mySum = 0
        prevWord = ""
        for word in text:
            if word == col:
                mySum += 1
            bigram = prevWord + " " + word
            if bigram == col:
                mySum += 1
            # Creating DoD results dict
            if word == col or bigram == col:
                for delay in discourse_dict:
                    og_words = [x.lower() for x in discourse_dict[delay]]
                    if word in og_words:                
                        if delay not in set(DoD_results):
                            DoD_results[delay] = {word}
                        else:
                            DoD_results[delay].add(word)
                    if bigram in og_words:                
                        if delay not in set(DoD_results):
                            DoD_results[delay] = {bigram}
                        else:
                            DoD_results[delay].add(bigram)    
            prevWord = word  
        wordAppearance.append(mySum)
    if (sum(wordAppearance) > 0):
        regression_df[col] = wordAppearance
        regression_df = regression_df.copy()

# Creating dict of words with their associated dictionaries      
word_to_DoD = {}
for delay in DoD_results:
    wordSet = DoD_results[delay]
    for word in wordSet:
        if word not in word_to_DoD:
            word_to_DoD[word] = {delay}
        else:
            word_to_DoD[word].add(delay)

# (3)
# Adding donation information
# Adding donations to text_df
annualized_donations_df = annualized_donations_df.rename(
    columns = {"recipient_name": "Organization_name", "year": "Document_year"}) # Renaming group column
regression_df["Organization_name"] = regression_df["Organization_name"].str.lower()
annualized_donations_df['Document_year'] -= 0
regression_df = pd.merge(regression_df, annualized_donations_df, on = ['Organization_name', 'Document_year'], how = 'outer')
regression_df["grant_amount"] = regression_df["grant_amount"].fillna(0)


In [None]:
### Lasso reg with dictionary counts as independent variables

# Formatting data for Lasso regression
lasso_df = regression_df.drop(columns={'Organization_name', 'Document_title', 'Document_type', 'Reference',
                                      'Word_counts', 'cleaned_text', 'Document_year', 'Document_text'})

# Dropping null values
lasso_df.dropna(inplace = True)

# Splitting data into X and Y
y = lasso_df['grant_amount']
X = lasso_df.drop(columns = {'grant_amount'})

# Combining word counts into dictionary counts
dict_sums = {} # Creating dict to store results

# Removing words from discourse_dict
wordsInText = list(lasso_df.drop(columns = {'grant_amount'}))
discourse_dict_reg = discourse_dict.copy()

for delay in discourse_dict_reg:
    tempList = discourse_dict_reg[delay].copy() # Creating copy to not modify original list
    for word in discourse_dict_reg[delay]:
        if word not in wordsInText:
            tempList.remove(word)
    discourse_dict_reg[delay] = tempList # Adjusting pointer to copy
    dict_sums[delay] = list(pd.DataFrame.sum(X[discourse_dict_reg[delay]], axis=1))
    
# Converting dictionary to DataFrame
X = pd.DataFrame.from_dict(dict_sums)

# Setting test size to 0.2 means that 80% of my data will be used to train and 20% will be used for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1680)

# Creating variables to store alphas
dictErrs = {}
cv_errs = []
coefs = []

# Running a LASSO regression here
potentialAlphas = np.linspace(1e-6, 1, num = 50).tolist()
for alpha in potentialAlphas:
    lassoReg = Lasso(alpha = alpha, normalize = True)
    lassoReg.fit(X_train, y_train)
    y_pred = lassoReg.predict(X_test)
    cv_err = np.mean((y_pred - y_test)**2)
    cv_errs.append(cv_err)
    coefs.append(lassoReg.coef_)
    dictErrs[cv_err] = alpha

# Running LASSO with optimal alpha
lassoReg = Lasso(alpha = dictErrs[min(cv_errs)], normalize = True) # running LASSO with best alpha
lassoReg = lassoReg.fit(X_train, y_train)

# Printing resuls from LASSOreg
lasso_coef = pd.DataFrame({'var':X.columns, 'val_lasso':lassoReg.coef_})
lasso_coef.sort_values(by='val_lasso', ascending=False)


In [155]:
### Lasso reg with words as independent variables

# Formatting data for Lasso regression
lasso_df = regression_df.drop(columns={'Organization_name', 'Document_title', 'Document_type', 'Reference',
                                      'Word_counts', 'cleaned_text', 'Document_year', 'Document_text'})

# Dropping null values
lasso_df.dropna(inplace = True)

# Splitting data into X and Y
y = lasso_df['grant_amount']
X = lasso_df.drop(columns = {'grant_amount'})

# Setting test size to 0.2 means that 80% of my data will be used to train and 20% will be used for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1680)

# Creating variables to store alphas
dictErrs = {}
cv_errs = []
coefs = []

# Running a LASSO regression here
potentialAlphas = np.linspace(1e-6, 1, num = 50).tolist()
for alpha in potentialAlphas:
    lassoReg = Lasso(alpha = alpha, normalize = True)
    lassoReg.fit(X_train, y_train)
    y_pred = lassoReg.predict(X_test)
    cv_err = np.mean((y_pred - y_test)**2)
    cv_errs.append(cv_err)
    coefs.append(lassoReg.coef_)
    dictErrs[cv_err] = alpha

# Running LASSO with optimal alpha
lassoReg = Lasso(alpha = dictErrs[min(cv_errs)], normalize = True) # running LASSO with best alpha
lassoReg = lassoReg.fit(X_train, y_train)

# Printing resuls from LASSOreg
lasso_coef = pd.DataFrame({'var':X.columns, 'val_lasso':lassoReg.coef_})
dict_list = []
for word in X.columns:
    dict_list.append(str(word_to_DoD[word]).replace("{", "").replace("}", "").replace("'", ""))
lasso_coef['associated_dicts'] = dict_list
lasso_coef.sort_values(by='val_lasso', ascending=False)


Unnamed: 0,var,val_lasso,associated_dicts
44,best interest,452095.859928,"Social_justice, Well_being"
15,near future,353123.280562,Tech_optimism
98,exploiting,229907.837625,Free_rider
24,prescribe,146623.525644,Carrots
59,imminent,126918.426686,Tech_optimism
...,...,...,...
64,lower carbon,-120635.522876,FF_solutionism
65,fusion,-121346.252227,Tech_optimism
80,carbon footprint,-160759.916169,Whataboutism
39,mutually beneficial,-172851.847702,Carrots
