### Converting rows in an XLSX file to individual txt files

In [5]:
### Importing data from spreadsheet into a text file

# Imports
import pandas as pd
import nltk
import string

# Spreadsheet that holds the collected documents
data_file = 'ENVS_documents_for_text_analysis.xlsx'

# Load the 'collected_texts' sheet; row 0 of the sheet supplies the column names
df = pd.read_excel(
    data_file,
    sheet_name='collected_texts',
    header=0,
)

# One entry per distinct value found in the Organization_name column
ENVS_groups = list(df.Organization_name.unique())

In [19]:
### Preparing text files
#
# For every row of `df` whose document type is in `preferred_doc_types`, two
# txt files are written, both named <group><row number within group>.txt:
#   * txt_files_reading/         lowercased, whitespace-normalized text
#   * txt_files_topic_modeling/  same text with organization names and
#                                punctuation removed

import re  # imported in this cell: used for whole-word organization-name matching

# Filtering by document type to make data more homogeneous
preferred_doc_types = ['Formal Letter', 'Future Goals', 'Project Description', 'Executive Summary']
# Options: Formal Letter, Autobiographical Description, Corporate Partner Description, Future Goals,
#    Project Description, Executive Summary

# Output folders (one for human reading, one for topic modeling)
reading_path = '/Users/finnianlowden/Dropbox/Brown_2021-2022/SOC 2961M/Final Project/txt_files_reading/'
topic_path = '/Users/finnianlowden/Dropbox/Brown_2021-2022/SOC 2961M/Final Project/txt_files_topic_modeling/'

# Organization names and abbreviations to remove from the topic-modeling text
someOrgNames = {"national fish and wildlife foundation", "national fish and wildlife foundation's",
    "nfwf", "nfwf's", "the nature conservancy", "the nature conservancy's", "tnc", "tnc's",
    "conservation international", "conservation international's", "ci", "ci's",
    "american forests", "american forests'", "af's", "afs",
    "world wildlife fund", "world wildlife fund's", "wwf", "wwf's",
    "audubon society", "audubon society's",
    "sierra club", "sierra club's", "tsc", "tscs",
    "ocean conservancy", "ocean conservancy's",
    "natural resources defense council", "natural resources defense council's",
    "nrdc", "nrdc's", "environmental defense fund", "environmental defense fund's",
    "edf", "edf's"}


def _org_name_pattern(names):
    """Compile one regex that matches any of `names` as a whole word or phrase.

    Longer names are tried first so a possessive such as "nfwf's" is consumed
    in full instead of leaving "'s" behind after matching "nfwf". Matches must
    not be preceded or followed by a word character, so short abbreviations
    such as "ci" are not cut out of ordinary words like "species".
    """
    alternatives = []
    for name in sorted(names, key=lambda n: (-len(n), n)):
        # Accept a straight or a curly apostrophe wherever the name has one
        pieces = [re.escape(piece) for piece in name.split("'")]
        alternatives.append("['\u2019]".join(pieces))
    return re.compile(r"(?<!\w)(?:" + "|".join(alternatives) + r")(?!\w)")


# Built once, outside the loops, because none of these depend on the row
org_pattern = _org_name_pattern(someOrgNames)
punct_table = str.maketrans('', '', string.punctuation)
preferred_types = set(preferred_doc_types)

# Saving each row of my dataframe as a new txt file
for group in ENVS_groups:
    # Filtering dataframe by group name
    df_filtered = df[df.Organization_name == group]
    count = 0
    for _, row in df_filtered.iterrows():
        # Incremented for every row of the group, including skipped ones, so
        # the number in the file name is the row's position within its group
        count += 1
        if row['Document_type'] not in preferred_types:
            continue

        # Getting text stored in each row
        current_text = row["Document_text"]
        if pd.isna(current_text):
            # An empty cell would otherwise be written out as the word "nan"
            continue

        # Splitting long string with all text into list of lowercase words
        envsTextList = str(current_text).lower().split()
        file_name = group + str(count) + ".txt"

        # Making text files for reading (unedited apart from case and spacing)
        full_text_string = " ".join(envsTextList)
        with open(reading_path + file_name, "w", encoding="utf-8") as f:
            f.write(full_text_string)

        # Making text files for topic modeling
        # Organization names are removed BEFORE punctuation is stripped, so the
        # possessive entries ("tnc's", ...) can still match the text
        topic_text_string = org_pattern.sub(" ", full_text_string)
        # Removing punctuation in a single pass
        topic_text_string = topic_text_string.translate(punct_table)
        # Collapsing the gaps left behind by the removals
        topic_text_string = " ".join(topic_text_string.split())

        # Writing each row's cleaned text to a new txt file
        with open(topic_path + file_name, "w", encoding="utf-8") as f:
            f.write(topic_text_string)
