In [1]:
### Data dictionaries

# donations_df data dictionary
# Variable                    Type       Description
# Grantmaker_name             String     Corporation/foundation that gave grant
# Year                        Int        Year grant was given
# Recipient_name              String     Organization that recived grant
# NTEE_code                   String     NTEE code of organization given grant
# NTEE_category               String     Broader category of organization according to IRS
# Grant Amount                Float      Grant amount adjusted for inflation to 2020 dollars
# Recipient_city              String     City of recipient organization
# Recipient_state             String     State of recipient organization


# lasso_df (1) data dictionary
# Variable                    Type       Description
# Individualism               Int        Measure of prevalence of this discourse of delay (DoD) in the text
# The 'free rider' excuse     Int        Measure of prevalence of this DoD in the text in given year
# Whataboutism                Int        Measure of prevalence of this DoD in the text in given year
# All talk, little action     Int        Measure of prevalence of this DoD in the text in given year
# Fossil fuel solutionism     Int        Measure of prevalence of this DoD in the text in given year
# No sticks, just carrots     Int        Measure of prevalence of this DoD in the text in given year
# Technological optimism      Int        Measure of prevalence of this DoD in the text in given year
# Appeal to well-being        Int        Measure of prevalence of this DoD in the text in given year
# Policy perfectionism        Int        Measure of prevalence of this DoD in the text in given year
# Appeal to social justice    Int        Measure of prevalence of this DoD in the text in given year
# Change is impossible        Int        Measure of prevalence of this DoD in the text in given year
# Doomism                     Int        Measure of prevalence of this DoD in the text in given year
# Grant Amount                Float      Grant amount adjusted for inflation to 2020 dollars


# lasso_df (2) data dictionary
# Variable                    Type       Description
# Word w                      Int        Measure of prevalence of word w in the text (all DoD words)
# Grant Amount                Float      Grant amount adjusted for inflation to 2020 dollars


In [1]:
### Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col
from statsmodels.api import OLS

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

import csv
import nltk
import statistics
from nltk.corpus import stopwords # Importing stop words (e.g., the, and, a, of, etc.)
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/finnianlowden/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
### Importing data

# (1)
# Corporate giving dataset - local
complete_donations_df = pd.read_excel("Oil_corporations_NTEE_Data_MASTER_SHEET.xlsx", sheet_name = "Individual_donations")

# Dropping irrelevant columns (those not in data dictionary) from corporate philanthropy dataframe
donations_df = complete_donations_df[["grantmaker_name", "year", "recipient_name", "NTEE_code",
                                      "NTEE_category", "Grant Amount (2020 Dollars)",
                                      "recipient_city", "recipient_state"]]

# Renaming Grant Amount (2020 Dollars) to not include spaces & converting to int
donations_df = donations_df.rename(columns = {"Grant Amount (2020 Dollars)": "grant_amount"})
donations_df["grant_amount"] = donations_df["grant_amount"]


# (2)
# Text data - local
text_df = pd.read_excel("ENVS_documents_for_text_analysis.xlsx")
text_df.drop(columns={'Researcher', 'Word_counts'}, inplace = True)


# (3)
# Discourses of Delay - online
spreadsheet_url = "https://docs.google.com/spreadsheets/d/1MhB60vzde7KT9Ti6eQtimmWvYAEersI4zK3L_gwDNA8/edit#gid=0"
spreadsheet_url = spreadsheet_url.replace("/edit#gid=", "/export?format=csv&gid=")

delay_df = pd.read_csv(spreadsheet_url, header=0)

simple_delay_names = {"Individualism": "Individualism", "Whataboutism":
             "Whataboutism", "Doomism": "Doomism",
             "The 'free rider' excuse": 'Free_rider',
             "All talk, little action": 'Talk_no_action',
             "Fossil fuel solutionism": 'FF_solutionism',
             "No sticks, just carrots": 'Carrots',
             "Technological optimism": 'Tech_optimism',
             "Appeal to well-being": 'Well_being',
             "Policy perfectionism": 'Perfect_policy',
             "Appeal to social justice": 'Social_justice',
             "Change is impossible": 'Change_impossible'}

# Converting dataframe to dictionary with DoD words in list format
complete_discourse_dict = {}
for row in delay_df.iterrows():
    delay_method = row[1]["Sub-category"]
    dict_words = row[1]["Current_dict"].split(", ")
    complete_discourse_dict[simple_delay_names[delay_method]] = dict_words

# Making copy of complete_text_df
discourse_dict = complete_discourse_dict.copy()

# (4)
# Importing annualized_donations - local
annualized_donations_df = pd.read_excel('annualized_donations.xlsx')


In [25]:
### Data wrangling

# (1)
# Text cleaning
# Importing punctuation and stopwords
stopWords = set(stopwords.words('english'))
table_punctuation = str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~’') 

# Cleaning text data
textCleaned = [] # Creating an empty list to store list of cleaned words
for row in text_df["Document_text"]: # Looping through each Tweet in ukraineRussia_df
    rowCleaned = [] # Creating an empty list to store cleaned words from each Tweet
    row_as_list = str(row).split() # Splitting row into a list of words at ' '
    for word in row_as_list: # Looping through each word in row_as_list
        if word not in stopWords and word != "nan":
            text = word.translate(table_punctuation) # Translating punctuation into ''
            textLower = text.lower() # Converting text to lowercase
            rowCleaned.append(textLower) # Appending cleaned word to rowCleaned list
    textCleaned.append(rowCleaned)  # Appending rowCleaned to textCleaned list
text_df["cleaned_text"] = textCleaned


# (2)
# Creating document-term matrix
# Getting list of Discourses of Delay
best_dicts = ["FF_solutionism", "Well_being", "Social_justice", "Carrots"]
top6_dicts = ["FF_solutionism", "Well_being", "Social_justice", "Carrots", "Free_rider", "Whataboutism"]
all_dicts = list(discourse_dict)
delay_types = all_dicts

# Getting all words in DoD dictionaries
delay_vocabulary = set()
for delay in delay_types:
    delay_vocabulary.update(discourse_dict[delay])

# Creating copy of text_df to work with
regression_df = text_df.copy()
    
# Creating DoD_results dict
DoD_results = {}

# Looping through each DoD word
for col in delay_vocabulary:
    wordAppearance = []
    # Looping through each entry in text_df["cleaned_text"]
    for text in text_df["cleaned_text"]:
        mySum = 0
        prevWord = ""
        for word in text:
            # Incrementing if word in DoD vocabulary
            if word == col:
                mySum += 1
            bigram = prevWord + " " + word
            # Incrementing if bigram in DoD vocabulary
            if bigram == col:
                mySum += 1
            # Creating DoD results dict
            if word == col or bigram == col:
                for delay in discourse_dict:
                    og_words = [x.lower() for x in discourse_dict[delay]]
                    if word in og_words:                
                        if delay not in set(DoD_results):
                            DoD_results[delay] = {word}
                        else:
                            DoD_results[delay].add(word)
                    if bigram in og_words:                
                        if delay not in set(DoD_results):
                            DoD_results[delay] = {bigram}
                        else:
                            DoD_results[delay].add(bigram)    
            prevWord = word  
        wordAppearance.append(mySum)
    # Adding word to regression_df if it appears in corpus
    if (sum(wordAppearance) > 0):
        regression_df[col] = wordAppearance
        regression_df = regression_df.copy()

# Creating dict of words with their associated dictionaries      
word_to_DoD = {}
for delay in DoD_results:
    wordSet = DoD_results[delay]
    for word in wordSet:
        if word not in word_to_DoD:
            word_to_DoD[word] = {delay}
        else:
            word_to_DoD[word].add(delay)

# (3)
# Adding donation information to regression_df
annualized_donations_df = annualized_donations_df.rename(
    columns = {"recipient_name": "Organization_name", "year": "Document_year"}) # Renaming group column
regression_df["Organization_name"] = regression_df["Organization_name"].str.lower() # Converting name to lowercase
regression_df = pd.merge(regression_df, annualized_donations_df, on = [
    'Organization_name', 'Document_year'], how = 'outer') # Mergining on organization name and document year
regression_df["grant_amount"] = regression_df["grant_amount"].fillna(0)


# (4)
# Computing word counts for each entry
wordCounts = []
for entry in regression_df["cleaned_text"]:
    wordCounts.append(len(entry))
regression_df["Word_counts"] = wordCounts


TypeError: object of type 'float' has no len()

In [10]:
### Regression with dictionary counts as independent variables

# Formatting data for regression
lasso_df = regression_df.drop(columns={'Organization_name', 'Document_title', 'Document_type', 'Reference',
                                      'Word_counts', 'cleaned_text', 'Document_text'})

# Dropping null values
lasso_df.dropna(inplace = True)

# Splitting data into X and Y
y = lasso_df['grant_amount']
X = lasso_df.drop(columns = {'grant_amount'})

# Combining word counts into dictionary counts
dict_sums = {} # Creating dict to store results

# Removing words from discourse_dict
wordsInText = list(lasso_df.drop(columns = {'grant_amount'}))
discourse_dict_reg = discourse_dict.copy()

for delay in discourse_dict_reg:
    tempList = discourse_dict_reg[delay].copy() # Creating copy to not modify original list
    for word in discourse_dict_reg[delay]:
        if word not in wordsInText:
            tempList.remove(word)
    discourse_dict_reg[delay] = tempList # Adjusting pointer to copy
    dict_sums[delay] = list(pd.DataFrame.sum(X[discourse_dict_reg[delay]], axis=1))
    
# Converting dictionary to DataFrame
X = pd.DataFrame.from_dict(dict_sums)
# X['year'] = list(lasso_df['Document_year']) # Adding year to regression

# Lasso Regression (ends up being the same as a linear regression)
# # Setting test size to 0.2 means that 80% of my data will be used to train and 20% will be used for testing
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1680)

# # Creating variables to store alphas
# dictErrs = {}
# cv_errs = []
# coefs = []

# # Running a LASSO regression here
# potentialAlphas = np.linspace(1e-6, 1, num = 50).tolist()
# for alpha in potentialAlphas:
#     lassoReg = Lasso(alpha = alpha, normalize = True)
#     lassoReg.fit(X_train, y_train)
#     y_pred = lassoReg.predict(X_test)
#     cv_err = np.mean((y_pred - y_test)**2)
#     cv_errs.append(cv_err)
#     coefs.append(lassoReg.coef_)
#     dictErrs[cv_err] = alpha

# # Running LASSO with optimal alpha
# lassoReg = Lasso(alpha = dictErrs[min(cv_errs)],
#                  normalize = True) # running LASSO with best alpha (when 0 -> OLS regression)
# lassoReg = lassoReg.fit(X_train, y_train)

# # Printing resuls from LASSOreg
# lasso_coef = pd.DataFrame({'var':X.columns, 'val_lasso':lassoReg.coef_})
# lasso_coef.sort_values(by='val_lasso', ascending=False)

# Creating linear regression
olsReg = LinearRegression()
olsReg = olsReg.fit(X, y)

# Printing resuls from olsReg
ols_coef = pd.DataFrame({'var':X.columns, 'val_ols':olsReg.coef_})
ols_coef.sort_values(by='val_ols', ascending=False)

# Trying with statsModels
# X = sm.add_constant(X)
# result = sm.OLS(y, X.to_numpy()).fit()
# result.summary()


Unnamed: 0,var,val_ols
6,Tech_optimism,34745.642918
2,Whataboutism,-3548.54425
3,Talk_no_action,-5136.307351
0,Individualism,-7108.644318
5,Carrots,-7863.0712
11,Doomism,-27298.280201
1,Free_rider,-28361.186936
7,Well_being,-28661.936172
9,Social_justice,-29888.692507
10,Change_impossible,-41152.360446


In [5]:
### Lasso reg with words as independent variables

# Formatting data for Lasso regression
lasso_df = regression_df.drop(columns={'Organization_name', 'Document_title', 'Document_type', 'Reference',
                                      'Word_counts', 'cleaned_text', 'Document_year', 'Document_text'})

# Dropping null values
lasso_df.dropna(inplace = True)

# Splitting data into X and Y
y = lasso_df['grant_amount']
X = lasso_df.drop(columns = {'grant_amount'})

# Setting test size to 0.2 means that 80% of my data will be used to train and 20% will be used for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1680)

# Creating variables to store alphas
dictErrs = {}
cv_errs = []
coefs = []

# Running a LASSO regression here
potentialAlphas = np.linspace(1e-6, 1, num = 50).tolist()
for alpha in potentialAlphas:
    lassoReg = Lasso(alpha = alpha, normalize = True)
    lassoReg.fit(X_train, y_train)
    y_pred = lassoReg.predict(X_test)
    cv_err = np.mean((y_pred - y_test)**2)
    cv_errs.append(cv_err)
    coefs.append(lassoReg.coef_)
    dictErrs[cv_err] = alpha

# Running LASSO with optimal alpha
lassoReg = Lasso(alpha = dictErrs[min(cv_errs)], normalize = True) # running LASSO with best alpha
lassoReg = lassoReg.fit(X_train, y_train)

# Printing resuls from LASSOreg
lasso_coef = pd.DataFrame({'var':X.columns, 'val_lasso':lassoReg.coef_})
dict_list = []
for word in X.columns:
    dict_list.append(str(word_to_DoD[word]).replace("{", "").replace("}", "").replace("'", ""))
lasso_coef['associated_dicts'] = dict_list
lasso_coef.sort_values(by='val_lasso', ascending=False)


Unnamed: 0,var,val_lasso,associated_dicts
22,best interest,452100.097981,"Well_being, Social_justice"
11,near future,353126.303556,Tech_optimism
17,exploiting,229907.757473,Free_rider
45,prescribe,146623.572675,Carrots
53,imminent,126918.110689,Tech_optimism
...,...,...,...
66,lower carbon,-120636.986416,FF_solutionism
38,fusion,-121346.265334,Tech_optimism
51,carbon footprint,-160758.018198,Whataboutism
40,mutually beneficial,-172858.665181,Carrots


In [54]:
### Pulling out text excerpts with most delay rhetoric

temp_df = lasso_coef[lasso_coef['val_lasso'] > 0].sort_values(by='val_lasso', ascending=False)

for word in list(temp_df['var']):
    print(word)
    for row in regression_df['Document_text']:
        delay_sentences = row.split(".")
        for sentence in delay_sentences:
            if word in sentence:
                 print("\n" + sentence)
    print("####################################################################")


best interest

 It’s in everyone’s best interest

 Every person, business, and government can and must act in the best interest of their environment

3 million metric tons more carbon per year than conventional crude, proving that the project fails the President’s climate test and is not in the nation’s best interests

 “It’s in the best interest of our business and our sustainable development goals to make sure that forests are managed well
####################################################################
near future

 According to WWF’s Living Planet Report,* we have lost one-third of the planet’s natural wealth over the last 30 years, a trend unlikely to improve in the near future given the increasing “footprint” that mankind leaves on the Earth – now visible even in the remotest wilderness areas

 In the near future, we must expand the Conservancy's on-the-ground presence in communities around the world

 While challenges to their long-term survival remain, with continued progre

 As I travel across the country by train, I see the visible signs of new life as women transplant rice; but also the trail of destruction and death that the monsoon rains bring every year in a land where millions are dependent on agriculture for their survival - a grim forecast of climate change in future

 Thanks to the complex suite of software designed by our nuclear staff, NRDC can simulate the grim effects of U

The news we hear about global warming these days is usually grim: rising temperatures, melting ice caps, and burning forests

” This stark frontpage headline in the International Herald Tribune of 26 October 2007 reflected the grim message of the United Nations’ fourth Global Environmental Outlook report: “The human population is living far beyond its means and inflicting damage on the environment that could pass the point of no return

  Despite the grim prognosis, experts agreed that intensive, targeted action could restore a viable population of coho to the Russian Rive

 For many the wolf represents the romantic call of the wild, yet ranchers and farmers fear its impact on their livestock

 Evolving is what I do best — but I fear for your future

 Landowners fearing federal restrictions barred scientists from entering their property to try to save the endangered Houston toad

 Previously, few landowners encouraged nesting by the woodpecker, fearing tighter regulation of their property would result if the birds took up residence

 An EDF approach called "safe harbor" aims to strengthen the Act by giving landowners an incentive to help endangered species without fear of incurring added legal obligations if the species increase in number

 In my heart, I share this weight, this nagging fear

 After nine years at the helm of Greenpeace International, during which time he has made fearless, bold and visionary contributions to Greenpeace globally, Gerd Leipold announced that he would be stepping down as at the end of October 2009

 No parent should have to 

 In China, an EDF competition helped highlight new monitoring technologies

 That said, competition for funds from all sectors, private and public, is fierce and the search for finance has never been more difficult or critical

 By holding a first-ever Artificial Intelligence competition for top data scientists, TNC spurred thousands of teams to develop algorithms designed to identify when a fish is caught and what type of fish it is

 “We’d like to see this plan ignite competition in other cities to reduce their pollution

 Let the competition for cleaner air begin

 Without wildfires to spark new growth and control competition from hardwoods and invasive species, the trees’ ability to regenerate weakened

 We’ve lent our scientific expertise to a competition to identify new preservatives for use in key products and to the creation of online databases where verified safer ingredients can be found

 Habitat loss, increased competition by invasive species, and insufficient cold, clear w

 EDF helped secure a commitment from Walmart to reduce 20 million metric tons of greenhouse gases from its supply chain by 2015

 The Great Egret has since come to symbolize Audubon's commitment to bird conservation and is part of our logo

 It is one of the largest corporate commitments ever to save
an endangered species

 The Micronesia Challenge—launched in 2006—is an ambitious commitment by five Micronesian governments to “effectively conserve at least 30 percent of the near-shore marine resources and 20 percent of the terrestrial resources across Micronesia by 2020

 But new commitments from the US and China offer reason for hope

 Turner was given this award for his extraordinary commitment to conservation

 WWF is using this energy and commitment to ensure that, by the end of the current decade, the numbers of elephants, rhinos, tigers, pandas, great apes, sea turtles, and whales are either stable or increasing and that their habitats are safeguarded

 One thing that remained co

 Fish and Wildlife Service, Michigan Department of Natural Resources, and Canadian and private sector partners

" A few years ago, the foundation helped launch the Conservancy's Japan program, a means of outreach to Japan's private sector

Since 1951, The Nature Conservancy has been the private sector leader in preserving ecosystems and the rare species and communities they shelter

 Meanwhile, we’re working closely with the private sector—architects, planners, and developers—to incorporate smart-growth principles in a large-scale way

 Already, LEED-ND is being picked up in influential ways by both the private sector and government

 These outcomes were made possible by an equally dramatic increase in strategic partnerships with governments, the private sector and nongovernmental organizations sharing complementary objectives

 Working with the International Partnership for Blue Carbon, CI will expand awareness and action by governments and the private sector while building pilot proj

In [55]:
list(temp_df['var'])

['best interest',
 'near future',
 'exploiting',
 'prescribe',
 'imminent',
 'voluntary',
 'way life',
 'grim',
 'sequestration',
 'investment',
 'socioeconomic',
 'inflation',
 'ambition',
 'fear',
 'negligible',
 'invest',
 'footprint',
 'competition',
 'compete',
 'lost',
 'commitment',
 'volunteer',
 'commitments',
 'innovation',
 'private sector',
 'disruptive']

In [18]:
### Robustness checks

# (1)
# Removing 'tech_optimism' to see if coeffients signs change

# Formatting data for regression
lasso_df = regression_df.drop(columns={'Organization_name', 'Document_title', 'Document_type', 'Reference',
                                      'Word_counts', 'cleaned_text', 'Document_text'})

# Dropping null values
lasso_df.dropna(inplace = True)

# Splitting data into X and Y
y = lasso_df['grant_amount']
X = lasso_df.drop(columns = {'grant_amount'})

# Combining word counts into dictionary counts
dict_sums = {} # Creating dict to store results

# Removing words from discourse_dict
wordsInText = list(lasso_df.drop(columns = {'grant_amount'}))
discourse_dict_reg = discourse_dict.copy()

for delay in discourse_dict_reg:
    tempList = discourse_dict_reg[delay].copy() # Creating copy to not modify original list
    for word in discourse_dict_reg[delay]:
        if word not in wordsInText:
            tempList.remove(word)
    discourse_dict_reg[delay] = tempList # Adjusting pointer to copy
    dict_sums[delay] = list(pd.DataFrame.sum(X[discourse_dict_reg[delay]], axis=1))
    
# Converting dictionary to DataFrame
X = pd.DataFrame.from_dict(dict_sums).drop(columns={"Tech_optimism"})

# Creating linear regression
olsReg = LinearRegression()
olsReg = olsReg.fit(X, y)

# Printing resuls from olsReg
ols_coef = pd.DataFrame({'var':X.columns, 'val_ols':olsReg.coef_})
ols_coef.sort_values(by='val_ols', ascending=False)


Unnamed: 0,var,val_ols
2,Whataboutism,469.657339
3,Talk_no_action,-3005.035412
0,Individualism,-3825.988198
5,Carrots,-5364.552786
10,Doomism,-24087.62704
1,Free_rider,-25772.610912
8,Social_justice,-26090.157761
6,Well_being,-28029.478527
9,Change_impossible,-39370.174915
4,FF_solutionism,-44609.982591


In [19]:
# (2)
# Trying dict regression with frequency input (i.e., normalizing for length)

# Making frequency matrix
wordCounts = regression_df["Word_counts"]

# Formatting data for regression
lasso_df = regression_df.drop(columns={'Organization_name', 'Document_title', 'Document_type', 'Reference',
                                      'Word_counts', 'cleaned_text', 'Document_text'})

# Converting intensity to frequency measures
for col in list(lasso_df):
    lasso_df[col] = lasso_df[col] / wordCounts

# Dropping null values
lasso_df.dropna(inplace = True)

# Splitting data into X and Y
y = lasso_df['grant_amount']
X = lasso_df.drop(columns = {'grant_amount'})

# Combining word counts into dictionary counts
dict_sums = {} # Creating dict to store results

# Removing words from discourse_dict
wordsInText = list(lasso_df.drop(columns = {'grant_amount'}))
discourse_dict_reg = discourse_dict.copy()

for delay in discourse_dict_reg:
    tempList = discourse_dict_reg[delay].copy() # Creating copy to not modify original list
    for word in discourse_dict_reg[delay]:
        if word not in wordsInText:
            tempList.remove(word)
    discourse_dict_reg[delay] = tempList # Adjusting pointer to copy
    dict_sums[delay] = list(pd.DataFrame.sum(X[discourse_dict_reg[delay]], axis=1))
    
# Converting dictionary to DataFrame
X = pd.DataFrame.from_dict(dict_sums)

# Creating linear regression
olsReg = LinearRegression()
olsReg = olsReg.fit(X, y)

# Printing resuls from olsReg
ols_coef = pd.DataFrame({'var':X.columns, 'val_ols':olsReg.coef_})
ols_coef.sort_values(by='val_ols', ascending=False)


Unnamed: 0,var,val_ols
6,Tech_optimism,85087.393779
0,Individualism,30333.230565
3,Talk_no_action,59.804779
11,Doomism,-36961.98164
1,Free_rider,-43122.997112
2,Whataboutism,-49087.013311
7,Well_being,-51756.707867
4,FF_solutionism,-53579.611611
5,Carrots,-63389.335806
10,Change_impossible,-83616.366184


In [20]:
# (3)
# Trying word regression with frequency input (i.e., normalizing for length)

# Making frequency matrix
wordCounts = regression_df["Word_counts"]

# Formatting data for Lasso regression
lasso_df = regression_df.drop(columns={'Organization_name', 'Document_title', 'Document_type', 'Reference',
                                      'Word_counts', 'cleaned_text', 'Document_year', 'Document_text'})

# Converting intensity to frequency measures
for col in list(lasso_df):
    lasso_df[col] = lasso_df[col] / wordCounts

# Dropping null values
lasso_df.dropna(inplace = True)

# Splitting data into X and Y
y = lasso_df['grant_amount']
X = lasso_df.drop(columns = {'grant_amount'})

# Setting test size to 0.2 means that 80% of my data will be used to train and 20% will be used for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1680)

# Creating variables to store alphas
dictErrs = {}
cv_errs = []
coefs = []

# Running a LASSO regression here
potentialAlphas = np.linspace(1e-6, 1, num = 50).tolist()
for alpha in potentialAlphas:
    lassoReg = Lasso(alpha = alpha, normalize = True)
    lassoReg.fit(X_train, y_train)
    y_pred = lassoReg.predict(X_test)
    cv_err = np.mean((y_pred - y_test)**2)
    cv_errs.append(cv_err)
    coefs.append(lassoReg.coef_)
    dictErrs[cv_err] = alpha

# Running LASSO with optimal alpha
lassoReg = Lasso(alpha = dictErrs[min(cv_errs)], normalize = True) # running LASSO with best alpha
lassoReg = lassoReg.fit(X_train, y_train)

# Printing resuls from LASSOreg
lasso_coef = pd.DataFrame({'var':X.columns, 'val_lasso':lassoReg.coef_})
dict_list = []
for word in X.columns:
    dict_list.append(str(word_to_DoD[word]).replace("{", "").replace("}", "").replace("'", ""))
lasso_coef['associated_dicts'] = dict_list
lasso_coef.sort_values(by='val_lasso', ascending=False)


Unnamed: 0,var,val_lasso,associated_dicts
21,horizon,308112.333189,Tech_optimism
93,commitments,272296.983086,Talk_no_action
11,near future,252857.837777,Tech_optimism
30,lost,72772.432024,Well_being
104,investment,65253.538790,Tech_optimism
...,...,...,...
47,doubt,-147400.357242,Change_impossible
24,impossible,-149637.449906,Doomism
23,disproportionate,-154469.677597,Social_justice
74,promises,-170127.192873,Talk_no_action


In [23]:
regression_df["Word_counts"].sum()

1753466.0

In [24]:
text_df = pd.read_excel("ENVS_documents_for_text_analysis.xlsx")
text_df["Word_counts"].sum()

1753466