### Word Frequency in Discourses of Delay texts

In [6]:
### Importing all of my modules and data

# Imports
import pandas as pd
import nltk
import string
import csv
import os

# Path to my txt files
data_folder = "/Users/finnianlowden/Documents/GitHub/environmental_text_analysis/MA_CT_opposition_testimony/Testimony_as_txts"

# Raw testimony grouped as testimony[state][year] -> list of document strings
testimony = {"MA": {}, "CT": {}}
all_texts = []

# File names are parsed as "<state>_<year>_<rest>"; sorted() makes the load
# order (and therefore the per-document numbering used later) reproducible.
for file in sorted(os.listdir(data_folder)):
    file_name_as_list = file.split("_")
    state = file_name_as_list[0]
    try:
        year = int(file_name_as_list[1])
    except (IndexError, ValueError):
        # No "_" in the name, or the second field is not a number:
        # this is not a testimony file, so skip it.
        continue

    # Only the states declared in `testimony` are analysed; anything else
    # would previously have raised a KeyError.
    if state not in testimony:
        continue

    with open(os.path.join(data_folder, file), "r", encoding="ISO-8859-1") as f:
        text = f.read()
    testimony[state].setdefault(year, []).append(text)


In [7]:
### Importing my Dictionaries

spreadsheet_url = "https://docs.google.com/spreadsheets/d/1MhB60vzde7KT9Ti6eQtimmWvYAEersI4zK3L_gwDNA8/edit#gid=0"
# Turn the browser "edit" link into the sheet's CSV export link so pandas can read it
spreadsheet_url = spreadsheet_url.replace("/edit#gid=", "/export?format=csv&gid=")

df = pd.read_csv(spreadsheet_url, header=0)

# discourse_dict[sub-category] -> list of dictionary terms for that sub-category
discourse_dict = {}
for _, row in df.iterrows():
    delay_method = row["Sub-category"]
    if pd.isna(delay_method):
        # Row without a sub-category name: nothing to key the terms on
        continue

    raw_terms = row["Current_dict"]
    if pd.isna(raw_terms):
        # Empty cell: keep the sub-category with no terms instead of
        # crashing on float.split()
        discourse_dict[delay_method] = []
        continue

    # Split on commas and trim each term so that "a,b" and "a,  b" parse the
    # same way as "a, b"; empty fragments (e.g. trailing comma) are dropped.
    dict_words = [term.strip() for term in str(raw_terms).split(",")]
    discourse_dict[delay_method] = [term for term in dict_words if term]


In [8]:
### Processing my imported data into a dictionary

# Cleaning each document in testimony
# Result: cleaned_testimony[state][year] -> list of documents, each a list of
# lowercase words with stopwords and ASCII punctuation removed.

# Built once, outside the loops: the stopword list does not change per document
someStopwords = set(nltk.corpus.stopwords.words('english'))
# Stopwords that are kept anyway because they are needed for bigrams
bigram_words = {"your", "our", "you", "we", "one's", "too", "not"}
dropped_words = someStopwords - bigram_words
# Translation table that deletes every character in string.punctuation
punct_table = str.maketrans("", "", string.punctuation)

cleaned_testimony = {state: {} for state in testimony}
num_testimony = 0
for state in testimony:
    for year in testimony[state]:
        for item in testimony[state][year]:
            num_testimony += 1

            # Splitting long string with all text into list of lowercase words.
            # split() already treats line breaks as separators; the earlier
            # replace(" \n", "") glued the last word of a line onto the first
            # word of the next one, so it has been dropped.
            words = item.lower().split()

            # Removing stopwords (except the ones saved for bigrams).
            # NOTE(review): this runs before punctuation is stripped, so a
            # stopword with attached punctuation (e.g. "the,") is not removed.
            itemCleaned = [word for word in words if word not in dropped_words]

            # Removing punctuation (may leave empty strings for tokens that
            # were punctuation only)
            itemNoPunct = [word.translate(punct_table) for word in itemCleaned]

            # Appending cleaned data as list to cleaned_testimony
            cleaned_testimony[state].setdefault(year, []).append(itemNoPunct)


In [9]:
### Implementing Dictionary Methods on my Corpus

# Helper function to calculate results
def calculate_results(results_dict: dict):
    """Aggregate per-testimony match counts into per-year totals.

    Args:
        results_dict: Maps keys of the form "<state>_<year>_<n>" to dicts
            holding at least the counts "Matches" and "Total Words".

    Returns:
        A dict with one entry for every year from 1980 through 2020. Each
        entry has the keys "total_matches", "total_words" and
        "Proportion Words Matched". Years without any testimony keep the
        empty-string placeholder "" in all three fields; the proportion is
        also "" when a year's total word count is 0.

    Raises:
        KeyError: If a key's year lies outside 1980-2020.
    """
    # Adding all years in range to total_results
    total_results = {
        year: {"total_matches": "", "total_words": "", "Proportion Words Matched": ""}
        for year in range(1980, 2021)
    }

    # Adding matches and total words to total_results
    for testimony_key, counts in results_dict.items():
        year = int(testimony_key.split("_")[1])
        year_totals = total_results[year]
        # Swap the "" placeholder for 0 before accumulating. Seeding it with
        # the first testimony's value and then adding that value again
        # counted the first testimony of every year twice.
        if year_totals["total_matches"] == "":
            year_totals["total_matches"] = 0
        if year_totals["total_words"] == "":
            year_totals["total_words"] = 0
        year_totals["total_matches"] += counts["Matches"]
        year_totals["total_words"] += counts["Total Words"]

    # Calculating prop_match
    for year_totals in total_results.values():
        total_words = year_totals["total_words"]
        if isinstance(total_words, str):
            # No testimony for this year: leave the placeholders untouched
            continue
        if total_words == 0:
            prop_match = ""
        else:
            prop_match = year_totals["total_matches"] / total_words
        year_totals["Proportion Words Matched"] = prop_match
    return total_results

# For each group's combined texts, I create a dictionary to store the data I get from each comparison
def apply_dict(testimony_dict: dict, DOD_dict: set) -> dict:
    """Count dictionary matches in every cleaned testimony and total them by year.

    Args:
        testimony_dict: Nested mapping state -> year -> list of documents,
            where each document is a list of word strings.
        DOD_dict: Set of dictionary terms. A term may be a single word, two
            words separated by one space, or two words written together.

    Returns:
        The result of calculate_results() applied to the per-document counts,
        which are keyed "<state>_<year>_<running document number>".
    """
    results_dict = {}
    count = 0
    for state in testimony_dict:
        for year in testimony_dict[state]:
            for item in testimony_dict[state][year]:
                # Creating results-storing dict, word count, and match count variables
                temp_results = {}
                totalWords = 0
                matches = 0
                count += 1

                # Looping through every word in my text, counting:
                # 1) the total words and 2) those words that match the words in our dictionary
                last_word = ""
                for word in item:
                    if len(word) <= 1:  # Filtering out short words
                        continue
                    totalWords += 1
                    # Unigrams
                    if word in DOD_dict:
                        matches += 1
                    # Bigrams: only once there is a previous counted word,
                    # otherwise the no-space form would equal the unigram
                    # and be counted twice.
                    if last_word:
                        bigram_space = last_word + " " + word
                        # Previously built with a space too, which made it a
                        # duplicate of bigram_space.
                        bigram_no_space = last_word + word
                        if bigram_space in DOD_dict or bigram_no_space in DOD_dict:
                            matches += 1
                    last_word = word

                # Computed once per document, after the loop. Doing it inside
                # the loop left the name unset (or stale from the previous
                # document) when a document had no countable words. "" is the
                # same placeholder calculate_results uses for "no data".
                proportionWords = matches / totalWords if totalWords else ""

                # Adding results to a dictionary for each document
                temp_results["Matches"] = matches
                temp_results["Total Words"] = totalWords
                temp_results["Proportion Words Matched"] = proportionWords
                results_dict[state + "_" + str(year) + "_" + str(count)] = temp_results
    return calculate_results(results_dict)


In [10]:
# Compiling results
results = {}  # All data
csv_results = {}  # Data needed for csv

# Per-year field of `results` that is written to the CSV. The other fields
# produced by calculate_results are "total_words" and
# "Proportion Words Matched"; change the output file name below to match
# (e.g. "testimony_data_prop_match.csv").
metric = "total_matches"
output_path = "testimony_data_matches.csv"

# Every row covers the same range of years as calculate_results
years = list(range(1980, 2021))

for delay in discourse_dict:
    results[delay] = apply_dict(cleaned_testimony, set(discourse_dict[delay]))
    # Aggregate values: one value per year, "" where there was no testimony
    csv_results[delay] = {year: results[delay][year][metric] for year in years}

# Creating list of dictionaries for DictWriter
output_list = []
for delay in csv_results:
    # The "Year" column holds the name of the delay sub-category for this row
    csv_results[delay]["Year"] = delay
    output_list.append(csv_results[delay])

# Column order is fixed here instead of being read from the last row, so the
# export also works (header only) when discourse_dict is empty.
keys = years + ["Year"]

# newline="" lets the csv module control line endings itself, as its
# documentation requires; otherwise extra blank rows can appear on Windows.
with open(output_path, "w", newline="") as my_file:
    dict_writer = csv.DictWriter(my_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(output_list)