### Applying my DOD Dictionary

In [91]:
### Preparing my corpus for analysis

# Imports
import pandas as pd
import nltk
import string
import csv

# Adding my filenames
# data_file = 'ENVS_documents_for_text_analysis.xlsx' # Local
spreadsheet_url = "https://docs.google.com/spreadsheets/d/1Zr4SQFxq8u3FnQwRyIHCoQ1Az_lb1PXd45Dhkni7Uok/edit#gid=570879331"
# Rewriting the "edit" link into the CSV export link for the same sheet tab
spreadsheet_url = spreadsheet_url.replace("/edit#gid=", "/export?format=csv&gid=") # Online

# Reading in data from my spreadsheet
# df = pd.read_excel(data_file, sheet_name='collected_texts', header=0) # Local
df = pd.read_csv(spreadsheet_url, header=0) # Online

# Getting list of all ENVS groups in my database (rows with no group name are skipped)
ENVS_groups = list(df["Organization_name"].dropna().unique())

# Building the stopword set and the punctuation-stripping table once,
# rather than once per group and year
someStopwords = set(nltk.corpus.stopwords.words('english'))
punctuation_table = str.maketrans('', '', string.punctuation)

# Collecting and storing each group's texts from my dataframe
# Resulting shape: {group name: {year as int: [cleaned lowercase words]}}
ENVS_text_dict = {}

for group in ENVS_groups:
    group_df = df[df.Organization_name == group] # Filtering by group name
    # A missing year can never satisfy the == filter below and int() would
    # raise on it, so rows without a year are left out
    num_years = list(group_df["Document_year"].dropna().unique())
    ENVS_year_text = {}
    for year in num_years:
        group_annual_df = group_df[group_df.Document_year == year] # Filtering by year
        # Empty cells are dropped so they do not enter the corpus as the word "nan"
        text_list = group_annual_df['Document_text'].dropna().tolist()
        text_string = " ".join(str(item) for item in text_list)

        # Splitting long string with all text into lowercase words, then
        # removing stopwords and punctuation in a single pass
        listNoPunct = []
        for word in text_string.lower().split():
            # First check catches stopwords that contain an apostrophe
            # (NLTK's list includes contractions such as "don't")
            if word in someStopwords:
                continue
            word = word.translate(punctuation_table)
            # Second check catches stopwords that had punctuation attached
            # ("the," -> "the"); tokens that were only punctuation are now
            # empty and must not be counted as words
            if not word or word in someStopwords:
                continue
            listNoPunct.append(word)
        ENVS_year_text[int(year)] = listNoPunct
    ENVS_text_dict[group] = ENVS_year_text


In [92]:
### Creating my Dictionaries

spreadsheet_url = "https://docs.google.com/spreadsheets/d/1MhB60vzde7KT9Ti6eQtimmWvYAEersI4zK3L_gwDNA8/edit#gid=0"
# Rewriting the "edit" link into the CSV export link for the same sheet tab
spreadsheet_url = spreadsheet_url.replace("/edit#gid=", "/export?format=csv&gid=")

df = pd.read_csv(spreadsheet_url, header=0)

# Mapping each sub-category to its list of dictionary words
# NOTE(review): the corpus cell splits text into single lowercase words with
# punctuation removed, so multi-word or hyphenated entries here would never
# match as written - confirm the sheet only holds single plain words
discourse_dict = {}
for _, row in df.iterrows():
    delay_method = row["Sub-category"]
    raw_words = row["Current_dict"]
    # An empty cell is read as NaN, which has no .split(); skip such rows
    if pd.isna(delay_method) or pd.isna(raw_words):
        continue
    # Splitting on "," and stripping tolerates entries typed without a space
    # after the comma; lowercasing matches the lowercased corpus words
    dict_words = [entry.strip().lower() for entry in str(raw_words).split(",")]
    discourse_dict[delay_method] = [entry for entry in dict_words if entry]


In [93]:
### Implementing Dictionary Methods on my Corpus

def apply_dict(envs_text: dict, dictionary: list) -> dict:
    """Compute, per group and year, the share of words found in a dictionary.

    Args:
        envs_text: Mapping of group name -> {year: iterable of words}.
        dictionary: Words to look for; a word matches only on exact equality.

    Returns:
        Mapping of group name -> row dict. Each row holds the group name under
        the key "Year", a "-" placeholder for every year from 1980 through
        2019, and, for each year present in ``envs_text[group]``, the
        proportion of that year's words that appear in ``dictionary``.
        A year with no words keeps the "-" placeholder, because no proportion
        can be computed for it.
    """
    # Converting once to a set: membership is tested for every corpus word
    dictionary_words = set(dictionary)

    results = {}
    for group, texts_by_year in envs_text.items():
        text_all_years = {"Year": group}
        for year in range(1980, 2020):
            text_all_years[year] = "-"

        for year, words in texts_by_year.items():
            # Counting 1) the total words and 2) the words found in the dictionary
            totalWords = 0
            matches = 0
            for word in words:
                totalWords += 1
                if word in dictionary_words:
                    matches += 1

            if totalWords == 0:
                # Avoids dividing by zero for a year with no usable text
                text_all_years[year] = "-"
            else:
                text_all_years[year] = matches / totalWords
        results[group] = text_all_years
    return results

# Saving results: one CSV per delay sub-category, then all of them as sheets
# of a single XLSX workbook
path = "/Users/finnianlowden/Dropbox/Brown_2021-2022/Thesis/Text Analysis/*dod_results*/"

for delay in discourse_dict:
    function_output = apply_dict(ENVS_text_dict, discourse_dict[delay])

    # Creating list of dictionaries (one row per group)
    results_csv = list(function_output.values())

    # Collecting the columns from every row, not just the first one: a group
    # with a year outside 1980-2019 would otherwise make DictWriter raise
    year_keys = set()
    for row in results_csv:
        year_keys.update(key for key in row if key != "Year")
    keys = ["Year"] + sorted(year_keys)

    file_name = path + delay + ".csv"

    # newline="" is what the csv module expects, so no blank rows appear on
    # platforms that translate line endings; utf-8 matches pd.read_csv's default
    with open(file_name, "w", newline="", encoding="utf-8") as my_file:
        # restval fills years a group lacks with the same "-" placeholder
        dict_writer = csv.DictWriter(my_file, keys, restval="-")
        dict_writer.writeheader()
        dict_writer.writerows(results_csv)

    # Transposing CSV output
    pd.read_csv(file_name, header=None).T.to_csv(file_name, header=False, index=False)

# Combining all CSVs into one sheet
# The with-block saves and closes the workbook; ExcelWriter.save() no longer
# exists in pandas 2.x
# NOTE(review): Excel limits sheet names to 31 characters - confirm every
# sub-category name fits
with pd.ExcelWriter('dod_results.xlsx', engine='xlsxwriter') as writer:
    for delay in discourse_dict:
        file_name = path + delay + ".csv"
        current_df = pd.read_csv(file_name, header=None)
        current_df.to_excel(writer, sheet_name=delay)


In [85]:
# ### Finding average prevalence for each group

# # For each group's combined texts, I create a dictionary to store the data I get from each comparison
# def find_sum_group_prevalence(data: dict, dictionary: list) -> dict:
#     """applies given dictionary to data created in cell above"""
#     temp_results = {}
#     for group in data:
#         # Creating results-storing dict, word count, and match count variables
#         textDict = {}
#         totalWords = 0
#         matches = 0

#         # Looping through every word in my text, counting:
#         # 1) the total words and 2) those words that match the words in our dictionary
#         for word in data[group]:
#             totalWords +=1
#             if word in dictionary:
#                 matches += 1
#         proportionWords = matches/totalWords
#         # Adding results to a dictionary for each group's text
#         textDict["Matches"] = matches
#         textDict["Total Words"] = totalWords
#         textDict["Proportion Words Matched"] = proportionWords
#         # Adding each dictionary to temp_results
#         temp_results[group] = textDict
#     return temp_results

# # Finding total delay prevalence by group
# prevalence_dict = {}
# for delay in discourse_dict:
#     current_dict = find_sum_group_prevalence(ENVS_text_dict, discourse_dict[delay])
#     for group in ENVS_groups:
#         if group not in prevalence_dict:
#             prevalence_dict[group] = current_dict[group]["Proportion Words Matched"]
#         else:
#             prevalence_dict[group] += current_dict[group]["Proportion Words Matched"]
            
# # Finding average prevalence by group
# for key in prevalence_dict:
#     prevalence_dict[key] = prevalence_dict[key] / len(discourse_dict)

# for key in prevalence_dict:
#     print(key, prevalence_dict[key])
