### Applying my DOD Dictionary

In [1]:
### Preparing my corpus for analysis

# Imports
import pandas as pd
import nltk
import string
import csv

# Adding my filenames
# data_file = 'ENVS_documents_for_text_analysis.xlsx' # Local
spreadsheet_url = "https://docs.google.com/spreadsheets/d/1Zr4SQFxq8u3FnQwRyIHCoQ1Az_lb1PXd45Dhkni7Uok/edit#gid=570879331"
spreadsheet_url = spreadsheet_url.replace("/edit#gid=", "/export?format=csv&gid=") # Online

# Reading in data from my spreadsheet
# df = pd.read_excel(data_file, sheet_name='collected_texts', header=0) # Local
df = pd.read_csv(spreadsheet_url, header=0) # Online

# Getting list of all ENVS groups in my database
ENVS_groups = list(df["Organization_name"].unique())

# Loop invariants, built once instead of once per group-year
someStopwords = set(nltk.corpus.stopwords.words('english'))
# Stopwords that are kept anyway because dictionary bigrams rely on them
keptStopwords = {"your", "our", "you", "we", "one's", "too", "not"}
# Translation table that turns every punctuation mark into a space
punctToSpace = str.maketrans(string.punctuation, " " * len(string.punctuation))

# Collecting and storing each group's texts from my dataframe,
# as {group: {year: [cleaned token, ...]}}
ENVS_text_dict = {}

for group in ENVS_groups:
    group_df = df[df.Organization_name == group] # Filtering by group name
    num_years = list(group_df["Document_year"].unique())
    ENVS_year_text = {}
    for year in num_years:
        group_annual_df = group_df[group_df.Document_year == year] # Filtering by year
        text_list = group_annual_df['Document_text'].tolist()
        text_string = " ".join(str(item) for item in text_list)

        # Splitting long string with all text into list of lowercase words
        envsTextList = text_string.lower().split()

        # Removing stopwords, except the ones saved for bigrams
        envsTextCleaned = [
            word for word in envsTextList
            if word in keptStopwords or word not in someStopwords
        ]

        # Replacing punctuation with spaces, then trimming the token.
        # Without the trim, "climate." becomes "climate " and can never equal
        # a dictionary term. Internal spaces (e.g. "well being") are kept, and
        # no token is dropped, so the word count per year is unchanged.
        listNoPunct = [word.translate(punctToSpace).strip() for word in envsTextCleaned]
        ENVS_year_text[int(year)] = listNoPunct
    ENVS_text_dict[group] = ENVS_year_text


In [2]:
### Creating my Dictionaries

spreadsheet_url = "https://docs.google.com/spreadsheets/d/1MhB60vzde7KT9Ti6eQtimmWvYAEersI4zK3L_gwDNA8/edit#gid=0"
spreadsheet_url = spreadsheet_url.replace("/edit#gid=", "/export?format=csv&gid=")

df = pd.read_csv(spreadsheet_url, header=0)

# Building {sub-category: [term, ...]} with one entry per spreadsheet row
discourse_dict = {}
for _, row in df.iterrows():
    delay_method = row["Sub-category"]
    # Splitting on the bare comma and trimming each term tolerates "a,b" and
    # "a,  b" as well as "a, b". Terms are lower-cased because the corpus
    # tokens are lower-cased before matching, and empty terms are dropped.
    dict_words = [term.strip().lower() for term in row["Current_dict"].split(",")]
    discourse_dict[delay_method] = [term for term in dict_words if term]


In [3]:
### Implementing Dictionary Methods on my Corpus

def apply_dict(envs_text: dict, dictionary: list) -> dict:
    """Score every group's yearly text against one dictionary of terms.

    Args:
        envs_text: nested mapping ``{group: {year: [token, ...]}}``.
        dictionary: terms to look for. A term may be a single token or two
            tokens, written either with a space ("climate change") or run
            together ("greenhouse" matches the tokens "green", "house").

    Returns:
        ``{group: row}``. Each row holds the group name under ``"Year"``, one
        key per year from 1980 to 2020 whose value is the proportion of
        matches (float) or ``""`` when the group has no text for that year,
        and ``"Average Prevalence"``, the mean of the float values in the row.
    """
    # Set membership instead of scanning the list for every token
    terms = set(dictionary)

    results = {}
    for group in envs_text:
        text_all_years = {"Year": group}
        # Pre-fill the full year span so every row has the same keys
        for year in range(1980, 2021):
            text_all_years[int(year)] = ""

        for year in envs_text[group]:
            total_words = 0
            matches = 0
            last_word = ""
            for word in envs_text[group][year]:
                total_words += 1
                # Unigrams
                if word in terms:
                    matches += 1
                # Bigrams: previous token + current token, spaced or fused.
                # Skipped when there is no previous token, otherwise the fused
                # form would just repeat the unigram test.
                if last_word:
                    bigram_space = last_word + " " + word
                    bigram_no_space = last_word + word
                    if bigram_space in terms or bigram_no_space in terms:
                        matches += 1
                # Always advance, so the next bigram uses the real predecessor
                last_word = word

            # A year with no tokens scores 0.0 instead of dividing by zero
            proportion_words = matches / total_words if total_words else 0.0
            text_all_years[int(year)] = float(proportion_words)
        results[group] = text_all_years

    # Computing average prevalence over the years that actually have a score
    for group in results:
        total = 0
        count = 0
        for value in results[group].values():
            # Only scored years are floats; the group name and "" are skipped
            if type(value) is float:
                total += value
                count += 1
        # With no scored year the average stays 0 rather than dividing by zero
        results[group]["Average Prevalence"] = total / count if count else total

    # Returning results
    return results


In [8]:
### Importing robustness data

# NOTE: both paths are absolute and specific to one machine
matches_data = "/Users/finn/Documents/GitHub/environmental_text_analysis/MA_CT_opposition_testimony/testimony_data_matches.csv"
prop_match_data = "/Users/finn/Documents/GitHub/environmental_text_analysis/MA_CT_opposition_testimony/testimony_data_prop_match.csv"
data_file = matches_data # Change from matches to prop_match

testimony_df = pd.read_csv(data_file) # Local
all_dods = testimony_df.to_dict('records')

# Formatting nested dictionaries: one row per record, keyed by the value of its
# "Year" column, with the remaining column names converted to integer keys
testimony_dict = {}
for entry in all_dods:
    # The "Year" column is removed from the record itself
    delay = entry.pop("Year")
    entry_int_keys = {int(column): value for column, value in entry.items()}
    testimony_dict[delay] = {
        'Year': "Testimony",
        **entry_int_keys,
        'Average Prevalence': "",
    }


In [39]:
### Saving results to XLSX

# Year span written to the sheet. apply_dict pre-fills every year in 1980-2020,
# so each (group, year) lookup below is guaranteed to exist.
FIRST_YEAR = 1980
LAST_YEAR = 2020
# Value written in place of "" for group-years that have no text
MISSING_VALUE = 999

# Column labels come from the dictionary itself, so a column can never be
# mislabelled when dictionaries are added, removed or reordered
list_delay_type = list(discourse_dict)

# Row labels: one row per (group, year), groups in corpus order
group_list = []
year_list = []
for group_name in ENVS_text_dict:
    for year in range(FIRST_YEAR, LAST_YEAR + 1):
        group_list.append(group_name)
        year_list.append(year)

# One prevalence column per dictionary, aligned with the row labels above
prevalence_by_delay = {}
for delay in list_delay_type:
    function_output = apply_dict(ENVS_text_dict, discourse_dict[delay])
    prevalence_by_delay[delay] = [
        function_output[group_name][year]
        for group_name, year in zip(group_list, year_list)
    ]

# Column order: Group, one column per dictionary, Year
results_df = pd.DataFrame({"Group": group_list, **prevalence_by_delay, "Year": year_list})

# Dropping NaN group names
results_df = results_df[results_df["Group"].notna()]
results_df = results_df.replace('', MISSING_VALUE)

# Saving dataframe as XLSX spreadsheet
results_df.to_excel("DoD_results.xlsx", index=False)


In [None]:
### Old code to format each DoD on its own sheet

#     keys = results_csv[0].keys()
#     path = "/Users/finnianlowden/Documents/GitHub/environmental_text_analysis/Dictionary_methods/*dod_results*/"
#     file_name = path + delay + ".csv"
    
#     with open(file_name, "w") as my_file:
#         dict_writer = csv.DictWriter(my_file, keys)
#         dict_writer.writeheader()
#         dict_writer.writerows(results_csv)

#     # Loading CSV
#     results_df = pd.read_csv(file_name, header=None).drop(11)
#     results_df.drop([42], axis=1, inplace = True)
    
# results_df.head(50)

# # Combining all CSVs into one sheet
# # writer = pd.ExcelWriter('dod_matches.xlsx', engine='xlsxwriter')
# writer = pd.ExcelWriter('dod_prop_match.xlsx', engine='xlsxwriter')
# for delay in discourse_dict:
#     path = "/Users/finnianlowden/Documents/GitHub/environmental_text_analysis/Dictionary_methods/*dod_results*/"
#     file_name = path + delay + ".csv"
#     current_df = pd.read_csv(file_name, header=None)
    
#     current_df.to_excel(writer, sheet_name=delay, header=False, index=False) 
# writer.save()