In [14]:
### Imports

import pandas as pd
import numpy as np
import random
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col
from statsmodels.api import OLS
from math import sqrt

import csv
import nltk
import statistics
from nltk.corpus import stopwords # Importing stop words (e.g., the, and, a, of, etc.)
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
pd.options.mode.chained_assignment = None  # default='warn'


In [2]:
### Importing data

# (1)
# Corporate giving dataset - local
complete_donations_df = pd.read_excel("Oil_corporations_NTEE_Data_MASTER_SHEET.xlsx", sheet_name = "Individual_donations")

# Dropping irrelevant columns (those not in data dictionary) from corporate philanthropy dataframe
donations_df = complete_donations_df[["grantmaker_name", "year", "recipient_name", "NTEE_code",
                                      "NTEE_category", "Grant Amount (2020 Dollars)",
                                      "recipient_city", "recipient_state"]]

# Renaming Grant Amount (2020 Dollars) to not include spaces & converting to int
donations_df = donations_df.rename(columns = {"Grant Amount (2020 Dollars)": "grant_amount"})
donations_df["grant_amount"] = donations_df["grant_amount"]


# (2)
# Text data - local
text_df = pd.read_excel("ENVS_documents_for_text_analysis.xlsx")
text_df.drop(columns={'Researcher', 'Word_counts'}, inplace = True)


# (3)
# Discourses of Delay - online
spreadsheet_url = "https://docs.google.com/spreadsheets/d/1MhB60vzde7KT9Ti6eQtimmWvYAEersI4zK3L_gwDNA8/edit#gid=0"
spreadsheet_url = spreadsheet_url.replace("/edit#gid=", "/export?format=csv&gid=")

delay_df = pd.read_csv(spreadsheet_url, header=0)

simple_delay_names = {"Individualism": "Individualism", "Whataboutism":
             "Whataboutism", "Doomism": "Doomism",
             "The 'free rider' excuse": 'Free_rider',
             "All talk, little action": 'Talk_no_action',
             "Fossil fuel solutionism": 'FF_solutionism',
             "No sticks, just carrots": 'Carrots',
             "Technological optimism": 'Tech_optimism',
             "Appeal to well-being": 'Well_being',
             "Policy perfectionism": 'Perfect_policy',
             "Appeal to social justice": 'Social_justice',
             "Change is impossible": 'Change_impossible'}

# Converting dataframe to dictionary with DoD words in list format
complete_discourse_dict = {}
for row in delay_df.iterrows():
    delay_method = row[1]["Sub-category"]
    dict_words = row[1]["Current_dict"].split(", ")
    complete_discourse_dict[simple_delay_names[delay_method]] = dict_words

# Making copy of complete_text_df
discourse_dict = complete_discourse_dict.copy()

# Pulling tech_optimism
techOptimism = discourse_dict['Tech_optimism']


# (4)
# Importing annualized_donations - local
annualized_donations_df = pd.read_excel('annualized_donations.xlsx')


In [15]:
### Data wrangling

# (1)
# Text cleaning
# Importing punctuation and stopwords
stopWords = set(stopwords.words('english'))
table_punctuation = str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~’') 

# Cleaning text data
textCleaned = [] # Creating an empty list to store list of cleaned words
for row in text_df["Document_text"]: # Looping through each Tweet in ukraineRussia_df
    rowCleaned = [] # Creating an empty list to store cleaned words from each Tweet
    row_as_list = str(row).split() # Splitting row into a list of words at ' '
    for word in row_as_list: # Looping through each word in row_as_list
        if word not in stopWords and word != "nan":
            text = word.translate(table_punctuation) # Translating punctuation into ''
            textLower = text.lower() # Converting text to lowercase
            rowCleaned.append(textLower) # Appending cleaned word to rowCleaned list
    textCleaned.append(rowCleaned)  # Appending rowCleaned to textCleaned list
text_df["cleaned_text"] = textCleaned


# (2)
# Creating document-term matrix
delay_types = ['Tech_optimism']

# Creating copy of text_df to work with
regression_df = text_df.copy()
    
# Creating DoD_results dict
DoD_results = {}

wordsInText = []
# Looping through each DoD word
for col in techOptimism:
    # Stemming col
    col = stemmer.stem(col)
    
    wordAppearance = []
    # Looping through each entry in text_df["cleaned_text"]
    for text in text_df["cleaned_text"]:
        mySum = 0
        prevWord = ""
        for word in text:
            # Stemming word
            word = stemmer.stem(word)
            
            # Incrementing if word in DoD vocabulary
            if word == col:
                mySum += 1
            bigram = prevWord + " " + word
            # Incrementing if bigram in DoD vocabulary
            if bigram == col:
                mySum += 1
            prevWord = word  
        wordAppearance.append(mySum)
    # Adding word to regression_df if it appears in corpus
    if (sum(wordAppearance) > 0):
        regression_df[col] = wordAppearance
        regression_df = regression_df.copy()
        wordsInText.append(col)

# (3)
# Adding donation information to regression_df
annualized_donations_df = annualized_donations_df.rename(
    columns = {"recipient_name": "Organization_name", "year": "Document_year"}) # Renaming group column
regression_df["Organization_name"] = regression_df["Organization_name"].str.lower() # Converting name to lowercase
regression_df = pd.merge(regression_df, annualized_donations_df, on = [
    'Organization_name', 'Document_year'], how = 'outer') # Mergining on organization name and document year
regression_df["grant_amount"] = regression_df["grant_amount"].fillna(0)


# (4)
# Computing word counts for each entry
wordCounts = []
for entry in regression_df["cleaned_text"]:
    wordCounts.append(len(str(entry).split()))
regression_df["Word_counts"] = wordCounts


In [19]:
### Formatting regression dataframe

# (1)
# Making frequency matrix
wordCounts = regression_df["Word_counts"]

# Creating variables data
variables = wordsInText.copy()
variables = set(variables + ['grant_amount', 'Word_counts', 'Document_year', 'Organization_name'])


# (2)
# Creating techOptimsm frequency measure
reg_df = regression_df[variables]
reg_df['techOpt_frequency'] = [0] * len(reg_df['grant_amount'])
for word in wordsInText:
    reg_df['techOpt_frequency'] += reg_df[word]
reg_df['techOpt_intensity'] =   reg_df['techOpt_frequency']
reg_df['techOpt_frequency'] = reg_df['techOpt_frequency'] / wordCounts * 100
  
# Dropping null values
reg_df.dropna(inplace = True)


# (3)
# Creating control data
control_df = reg_df[reg_df['Organization_name'].isin(['greenpeace', 'earthjustice'])]
controlGroupDict = {}
for year in list(range(1980, 2021)):
    controlGroupDict[year] = {'greenpeace': {"count": 0, "length": 0}, 'earthjustice': {"count": 0, "length": 0}}

for row in control_df.iterrows():
    group = row[1]["Organization_name"]
    year = row[1]["Document_year"]
    
    # Updating dictionary
    controlGroupDict[year][group]["count"] += row[1]["techOpt_intensity"]
    controlGroupDict[year][group]["length"] += row[1]["Word_counts"]
    
# Calculating techOpt prevelance in control data
controlDict = {}
for year in controlGroupDict:
    totalCount = 0
    totalLength = 0
    for group in controlGroupDict[year]:
        totalCount += controlGroupDict[year][group]["count"]
        totalLength += controlGroupDict[year][group]["length"]
    if totalLength > 0:
        controlDict[year] = totalCount / totalLength * 100
    else:
        controlDict[year] = 0

# Using linear interpolation to fill in the gaps
# Still significant even if removed
controlDict[2001] = (controlDict[2000] + controlDict[2002])/2
controlDict[2004] = (controlDict[2003]*2 + controlDict[2006])/3
controlDict[2005] = (controlDict[2003] + controlDict[2006]*2)/3


# (4)
# Dropping greenpeace and earthjustice from reg_df
reg_df = reg_df[~reg_df['Organization_name'].isin(['greenpeace', 'earthjustice'])]


In [20]:
### Running regression

# Splitting data into X and Y
y = reg_df['grant_amount']
X = reg_df[['Document_year', 'techOpt_frequency']]

# Renaming columns
X = X.rename(columns={'techOpt_frequency': 'exp_techOpt_freq', 'Document_year': 'year'})

# Adding control data to dataframe
correspondingControlList = []
for year in X["year"]:
    correspondingControlList.append(controlDict[year])
X['control_techOpt_freq'] = correspondingControlList

# Creating linear regression
olsReg = sm.OLS(y, X).fit()
print(olsReg.summary())


                                 OLS Regression Results                                
Dep. Variable:           grant_amount   R-squared (uncentered):                   0.135
Model:                            OLS   Adj. R-squared (uncentered):              0.135
Method:                 Least Squares   F-statistic:                              246.3
Date:                Mon, 16 May 2022   Prob (F-statistic):                   1.80e-148
Time:                        10:55:10   Log-Likelihood:                         -67374.
No. Observations:                4718   AIC:                                  1.348e+05
Df Residuals:                    4715   BIC:                                  1.348e+05
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                           coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------