In [158]:
### Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col
from statsmodels.api import OLS

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

import csv
import nltk
import statistics
from nltk.corpus import stopwords # Importing stop words (e.g., the, and, a, of, etc.)
pd.options.mode.chained_assignment = None  # default='warn'


In [203]:
### Importing data

# (1)
# Corporate giving dataset - local
complete_donations_df = pd.read_excel("Oil_corporations_NTEE_Data_MASTER_SHEET.xlsx", sheet_name = "Individual_donations")

# Dropping irrelevant columns (those not in data dictionary) from corporate philanthropy dataframe
donations_df = complete_donations_df[["grantmaker_name", "year", "recipient_name", "NTEE_code",
                                      "NTEE_category", "Grant Amount (2020 Dollars)",
                                      "recipient_city", "recipient_state"]]

# Renaming Grant Amount (2020 Dollars) to not include spaces & converting to int
donations_df = donations_df.rename(columns = {"Grant Amount (2020 Dollars)": "grant_amount"})
donations_df["grant_amount"] = donations_df["grant_amount"]


# (2)
# Text data - local
text_df = pd.read_excel("ENVS_documents_for_text_analysis.xlsx")
text_df.drop(columns={'Researcher', 'Word_counts'}, inplace = True)


# (3)
# Discourses of Delay - online
spreadsheet_url = "https://docs.google.com/spreadsheets/d/1MhB60vzde7KT9Ti6eQtimmWvYAEersI4zK3L_gwDNA8/edit#gid=0"
spreadsheet_url = spreadsheet_url.replace("/edit#gid=", "/export?format=csv&gid=")

delay_df = pd.read_csv(spreadsheet_url, header=0)

simple_delay_names = {"Individualism": "Individualism", "Whataboutism":
             "Whataboutism", "Doomism": "Doomism",
             "The 'free rider' excuse": 'Free_rider',
             "All talk, little action": 'Talk_no_action',
             "Fossil fuel solutionism": 'FF_solutionism',
             "No sticks, just carrots": 'Carrots',
             "Technological optimism": 'Tech_optimism',
             "Appeal to well-being": 'Well_being',
             "Policy perfectionism": 'Perfect_policy',
             "Appeal to social justice": 'Social_justice',
             "Change is impossible": 'Change_impossible'}

# Converting dataframe to dictionary with DoD words in list format
complete_discourse_dict = {}
for row in delay_df.iterrows():
    delay_method = row[1]["Sub-category"]
    dict_words = row[1]["Current_dict"].split(", ")
    complete_discourse_dict[simple_delay_names[delay_method]] = dict_words

# Making copy of complete_text_df
discourse_dict = complete_discourse_dict.copy()

# Pulling tech_optimism
techOptimism = discourse_dict['Tech_optimism']

# (4)
# Importing annualized_donations - local
annualized_donations_df = pd.read_excel('annualized_donations.xlsx')

# (5)
# Making simple group names
simpleGroupNames = {"american forests": "af", "world wildlife fund": "wwf",
                     "sierra club": 'sc', "nature conservancy": "tnc",
                     "natural resources defense council": "nrdc",
                     "national fish and wildlife foundation": "nfwf",
                     "environmental defense fund": "edf",
                     "conservation international": "ci",
                     "audubon society": "nas", "ocean conservancy": "oc",
                     "earthjustice": "ej", 'greenpeace': 'gp'}


In [204]:
### Data wrangling

# (1)
# Text cleaning
# Importing punctuation and stopwords
stopWords = set(stopwords.words('english'))
table_punctuation = str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~’') 

# Cleaning text data
textCleaned = [] # Creating an empty list to store list of cleaned words
for row in text_df["Document_text"]: # Looping through each Tweet in ukraineRussia_df
    rowCleaned = [] # Creating an empty list to store cleaned words from each Tweet
    row_as_list = str(row).split() # Splitting row into a list of words at ' '
    for word in row_as_list: # Looping through each word in row_as_list
        if word not in stopWords and word != "nan":
            text = word.translate(table_punctuation) # Translating punctuation into ''
            textLower = text.lower() # Converting text to lowercase
            rowCleaned.append(textLower) # Appending cleaned word to rowCleaned list
    textCleaned.append(rowCleaned)  # Appending rowCleaned to textCleaned list
text_df["cleaned_text"] = textCleaned


# (2)
# Creating document-term matrix
delay_types = ['Tech_optimism']

# Creating copy of text_df to work with
regression_df = text_df.copy()
    
# Creating DoD_results dict
DoD_results = {}

wordsInText = []
# Looping through each DoD word
for col in techOptimism:
    wordAppearance = []
    # Looping through each entry in text_df["cleaned_text"]
    for text in text_df["cleaned_text"]:
        mySum = 0
        prevWord = ""
        for word in text:
            # Incrementing if word in DoD vocabulary
            if word == col:
                mySum += 1
            bigram = prevWord + " " + word
            # Incrementing if bigram in DoD vocabulary
            if bigram == col:
                mySum += 1
            prevWord = word  
        wordAppearance.append(mySum)
    # Adding word to regression_df if it appears in corpus
    if (sum(wordAppearance) > 0):
        regression_df[col] = wordAppearance
        regression_df = regression_df.copy()
        wordsInText.append(col)

# (3)
# Adding donation information to regression_df
annualized_donations_df = annualized_donations_df.rename(
    columns = {"recipient_name": "Organization_name", "year": "Document_year"}) # Renaming group column
regression_df["Organization_name"] = regression_df["Organization_name"].str.lower() # Converting name to lowercase
regression_df = pd.merge(regression_df, annualized_donations_df, on = [
    'Organization_name', 'Document_year'], how = 'outer') # Mergining on organization name and document year
regression_df["grant_amount"] = regression_df["grant_amount"].fillna(0)


# (4)
# Computing word counts for each entry
wordCounts = []
for entry in regression_df["cleaned_text"]:
    wordCounts.append(len(str(entry).split()))
regression_df["Word_counts"] = wordCounts


In [212]:
### Regression with dictionary counts as independent variables

# Making frequency matrix
wordCounts = regression_df["Word_counts"]

# Creating variables data
variables = wordsInText.copy()
variables = set(variables + ['grant_amount', 'Word_counts', 'Document_year', 'Organization_name'])

# Creating techOptimsm frequency measure
reg_df = regression_df[variables]
reg_df['techOpt_frequency'] = [0] * len(reg_df['grant_amount'])
for word in wordsInText:
    reg_df['techOpt_frequency'] += reg_df[word]
reg_df['techOpt_frequency'] = reg_df['techOpt_frequency'] / wordCounts * 100
    
# Dropping null values
reg_df.dropna(inplace = True)

# # Adding group controls
# groupList = list(reg_df['Organization_name'].unique())
# groupList.remove('nature conservancy')

# ivList = []
# for group in groupList:
#     iv = simpleGroupNames[group] + '_indc'
#     reg_df[iv] = np.where(reg_df['Organization_name'].str.contains(group), 1, 0)
#     ivList.append(iv)

# # Dropping greenpeace and earthjustice
# reg_df = reg_df[~reg_df['Organization_name'].isin(['greenpeace', 'earthjustice'])]
    
# Splitting data into X and Y
y = reg_df['grant_amount']
X = reg_df[['techOpt_frequency', 'Document_year']]

# Creating linear regression
olsReg = sm.OLS(y, X).fit()
print(olsReg.summary())


                                 OLS Regression Results                                
Dep. Variable:           grant_amount   R-squared (uncentered):                   0.088
Model:                            OLS   Adj. R-squared (uncentered):              0.088
Method:                 Least Squares   F-statistic:                              265.5
Date:                Sun, 15 May 2022   Prob (F-statistic):                   8.96e-111
Time:                        10:40:17   Log-Likelihood:                         -77963.
No. Observations:                5478   AIC:                                  1.559e+05
Df Residuals:                    5476   BIC:                                  1.559e+05
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                        coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------