In [227]:
### Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.preprocessing import normalize
import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn import linear_model
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegressionCV

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

import csv
import nltk
import statistics
from nltk.corpus import stopwords # Importing stop words (e.g., the, and, a, of, etc.)
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/finnianlowden/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [146]:
### Importing data

# (1)
# Corporate giving dataset - local
complete_donations_df = pd.read_excel("Oil_corporations_NTEE_Data_MASTER_SHEET.xlsx", sheet_name = "Individual_donations")

# (2)
# Text data
# Adding experiment text data - online
spreadsheet_url = "https://docs.google.com/spreadsheets/d/1Zr4SQFxq8u3FnQwRyIHCoQ1Az_lb1PXd45Dhkni7Uok/edit#gid=570879331"
spreadsheet_url = spreadsheet_url.replace("/edit#gid=", "/export?format=csv&gid=") # Online
experiment_text_df = pd.read_csv(spreadsheet_url, header=0) # Online
# Adding control text data
spreadsheet_url = "https://docs.google.com/spreadsheets/d/1Zr4SQFxq8u3FnQwRyIHCoQ1Az_lb1PXd45Dhkni7Uok/edit#gid=1393581184"
spreadsheet_url = spreadsheet_url.replace("/edit#gid=", "/export?format=csv&gid=") # Online
control_text_df = pd.read_csv(spreadsheet_url, header=0) # Online
# Joining control and experiment
complete_text_df = pd.concat([experiment_text_df, control_text_df])

# (3)
# Discourses of Delay - online
spreadsheet_url = "https://docs.google.com/spreadsheets/d/1MhB60vzde7KT9Ti6eQtimmWvYAEersI4zK3L_gwDNA8/edit#gid=0"
spreadsheet_url = spreadsheet_url.replace("/edit#gid=", "/export?format=csv&gid=")

delay_df = pd.read_csv(spreadsheet_url, header=0)

simple_delay_names = {"Individualism": "Individualism", "Whataboutism":
             "Whataboutism", "Doomism": "Doomism",
             "The 'free rider' excuse": 'Free_rider',
             "All talk, little action": 'Talk_no_action',
             "Fossil fuel solutionism": 'FF_solutionism',
             "No sticks, just carrots": 'Carrots',
             "Technological optimism": 'Tech_optimism',
             "Appeal to well-being": 'Well_being',
             "Policy perfectionism": 'Perfect_policy',
             "Appeal to social justice": 'Social_justice',
             "Change is impossible": 'Change_impossible'}

complete_discourse_dict = {}
for row in delay_df.iterrows():
    delay_method = row[1]["Sub-category"]
    dict_words = row[1]["Current_dict"].split(", ")
    complete_discourse_dict[simple_delay_names[delay_method]] = dict_words


In [239]:
### Formatting dataframes

# Working with corporate philanthropy data
# Dropping irrelevant columns (those not in data dictionary)
donations_df = complete_donations_df[["grantmaker_name", "year", "recipient_name", "NTEE_code",
                                      "NTEE_category", "Grant Amount (2020 Dollars)",
                                      "recipient_city", "recipient_state"]]

# Renaming Grant Amount (2020 Dollars) to not include spaces & converting to int
donations_df = donations_df.rename(columns = {"Grant Amount (2020 Dollars)": "grant_amount"})
donations_df["grant_amount"] = donations_df["grant_amount"]

# Making copy of complete_text_df
text_df = complete_text_df.copy()
text_df.drop(columns={'Researcher'}, inplace = True)

# Making copy of complete_text_df
discourse_dict = complete_discourse_dict.copy()

# Checking to make sure changes were made
# donations_df.head()
# text_df.head()
# discourse_dict


In [240]:
### Data wrangling

# (1)
# Text cleaning
# Importing punctuation and stopwords
stopWords = set(stopwords.words('english'))
table_punctuation = str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~’') 

# Cleaning text data
textCleaned = [] # Creating an empty list to store list of cleaned words
for row in text_df["Document_text"]: # Looping through each Tweet in ukraineRussia_df
    rowCleaned = [] # Creating an empty list to store cleaned words from each Tweet
    row_as_list = str(row).split() # Splitting row into a list of words at ' '
    for word in row_as_list: # Looping through each word in row_as_list
        if word not in stopWords and word != "nan":
            text = word.translate(table_punctuation) # Translating punctuation into ''
            textLower = text.lower() # Converting text to lowercase
            rowCleaned.append(textLower) # Appending cleaned word to rowCleaned list
    textCleaned.append(rowCleaned)  # Appending rowCleaned to textCleaned list

text_df["cleaned_text"] = textCleaned


# (2)
# Creating document-term matrix
# Getting list of Discourses of Delay
delay_types = list(discourse_dict)

# Getting all words in DoD dictionaries
delay_vocabulary = set()
for delay in delay_types:
    delay_vocabulary.update(discourse_dict[delay])
    
regression_df = text_df.copy()
    
for col in delay_vocabulary:
    wordAppearance = []
    for text in text_df["cleaned_text"]:
        mySum = 0
        prevWord = ""
        for word in text:
            if word == col:
                mySum += 1
            bigram = prevWord + " " + word
            if bigram == col:
                mySum += 1
            prevWord = word
        wordAppearance.append(mySum)
    if (sum(wordAppearance) > 0):
        regression_df[col] = wordAppearance

# (3)
# Adding donation information
# Adding amount of donation in given year
reduce_donations_df = donations_df.copy()
group_list = ['nature conservancy', 'american forests', 'national fish and wildlife foundation',
 'natural resources defense council', 'conservation international', 'world wildlife fund',
 'sierra club', 'ocean conservancy', 'environmental defense fund', 'audubon society']
reduce_donations_df["recipient_name"] = reduce_donations_df["recipient_name"].str.lower()
boolean_series = reduce_donations_df["recipient_name"].isin(group_list)
reduce_donations_df = reduce_donations_df[boolean_series]

# Grouping by year and group
annualized_donations_df = reduce_donations_df.groupby(
    ['recipient_name', 'year'], as_index = False).agg({'grant_amount': sum})
annualized_donations_df = pd.DataFrame(annualized_donations_df)
# annualized_donations_df.to_excel("Output.xlsx", index = False) # code to download as XSLX

# Adding donations to text_df
annualized_donations_df = annualized_donations_df.rename(
    columns = {"recipient_name": "Organization_name", "year": "Document_year"}) # Renaming group column
regression_df["Organization_name"] = regression_df["Organization_name"].str.lower()
regression_df = pd.merge(regression_df, annualized_donations_df, on = ['Organization_name', 'Document_year'], how = 'outer')
regression_df["grant_amount"] = regression_df["grant_amount"].fillna(0)

# Adding indicator for recieving a donation
regression_df['recieved_donation'] = np.where(regression_df['grant_amount'] > 0, 1, 0)

print("complete!")

  regression_df[col] = wordAppearance


complete!


  regression_df['recieved_donation'] = np.where(regression_df['grant_amount'] > 0, 1, 0)


In [241]:
### Creating data for regressions
logit_df = regression_df.copy()

# Adding indicator for recieving a donation
logit_df['experiment_group'] = np.where(
    logit_df['Organization_name'].isin({"greenpeace", "earthjustice"}), 0, 1)

logit_df.to_excel("logit_df_to_view.xlsx")

# Dropping null values
logit_df.dropna(inplace = True)

# Creating training and testing splits from logit_df
y = logit_df['experiment_group']
X = logit_df.drop(columns = ["experiment_group", "recieved_donation", "grant_amount", "Organization_name", "Document_title",
                                  "Document_type", "Reference", "Document_text", "Word_counts",
                                  "cleaned_text", "Document_year"])

# Making sure all non-numeric columns and NaN values have been dropped
X.replace([np.inf, -np.inf], np.nan, inplace = True)

# Setting test size to 0.2 means that 80% of my data will be used to train and 20% will be used for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1680)


In [242]:
### Logistic-LASSO Model
logReg = LogisticRegressionCV(Cs = 20, cv = 5, penalty = 'l1', solver = 'liblinear',
                              refit = True, class_weight = 'balanced')
logReg = logReg.fit(X_train, y_train)

# Printing coefficient data
coef = pd.DataFrame({'var':X.columns, 'val_lasso':logReg.coef_[0]})
coef.sort_values(by = ['val_lasso'], inplace = True)
coef.to_excel("logReg_coef.xlsx", index = False)

# Predicted probability for text recieving donation
ypred_train = logReg.predict_proba(X_train)[:, 1]
ypred_test = logReg.predict_proba(X_test)[:, 1]

print("Predictive accuracy with training data: " + str(statistics.mean(ypred_train)))
print("Predictive accuracy with testing data: " + str(statistics.mean(ypred_test)))

Predictive accuracy with training data: 0.5429170914156392
Predictive accuracy with testing data: 0.54557802992392
