In [None]:
###  This code runs the light processing routines from American Stories on AS-format article text.
###  The file is loaded and saved locally.

In [None]:
# Installs

!pip install symspellpy

In [None]:
#Imports
import io
import os
import json
import pandas as pd
from tqdm import tqdm
import tqdm as tq

In [None]:
# let's initialize the package
import pkg_resources
from symspellpy import SymSpell, Verbosity
import string

# Build the spell checker and load the English frequency dictionary that ships
# inside the symspellpy package (term in column 0, count in column 1).
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
en_dict = pkg_resources.resource_filename('symspellpy', 'frequency_dictionary_en_82_765.txt')
# load_dictionary reports failure through its return value instead of raising.
# Without this check a failed load would leave the dictionary empty and every
# later correction step would silently do nothing.
if not sym_spell.load_dictionary(en_dict, term_index=0, count_index=1):
    raise FileNotFoundError(f"Could not load the symspellpy frequency dictionary from '{en_dict}'.")

In [None]:
# This is code from AS and has functions for text correction and polishing.
# we now create a few functions that take care of the issues flagged above

# these two functions implement spelling corrections
def check_word(word):
  """Return `word` with its core replaced by the closest dictionary term.

  Leading and trailing punctuation is stripped to obtain the core, the core is
  looked up in `sym_spell` (edit distance at most 1, casing transferred,
  unknown terms returned as-is), and the first suggestion is substituted back
  into the original token so the surrounding punctuation is kept.

  Tokens that are empty or consist only of punctuation are returned unchanged.
  """
  core = word.strip(string.punctuation)
  if not core:
    # Nothing to look up (empty token or punctuation only).
    return word
  # NOTE(review): every non-empty core, including numerals, goes through the
  # lookup - confirm that is intended.
  candidates = sym_spell.lookup(
      core,
      Verbosity.CLOSEST,
      max_edit_distance=1,
      include_unknown=True,
      transfer_casing=True,
  )
  best_term = candidates[0].term
  return word.replace(core, best_term)

def spell_check(text):
  """Spell-check every space-separated token of `text`.

  The text is split on newlines and each line on single spaces; every token is
  passed through `check_word`, and the pieces are re-joined with the same
  separators, so the line and spacing structure is preserved.
  """
  return '\n'.join(
      ' '.join(map(check_word, row.split(' ')))
      for row in text.split('\n')
  )

# this function checks capitalization
def capitalization_check(text):
  """Normalize capitalization of the words in `text`, line by line.

  Each line is split on single spaces. From the second token of a line
  onwards:
    * if the previous token ends with '.', '!' or '?', the token is passed
      through `str.capitalize()`;
    * otherwise, if the token's punctuation-stripped core is a key of
      `sym_spell.words` (and is not 'i' or "i'll"), the core is lowercased.
  The first token of every line is left untouched.

  Returns the text with the same line and spacing structure.
  """
  sentence_enders = ('.', '!', '?')
  checked_lines = []
  for line in text.split('\n'):
    words = line.split(' ')
    for i in range(1, len(words)):
      # endswith() is simply False for an empty token (produced by consecutive
      # or trailing spaces), whereas indexing [-1] on it raised IndexError.
      if words[i-1].endswith(sentence_enders):
        words[i] = words[i].capitalize()
        continue

      no_punc_word = words[i].strip(string.punctuation)
      if not no_punc_word:
        # Empty or punctuation-only token: there is nothing to lowercase.
        continue
      # Check that the word is not a proper noun.
      # NOTE(review): the lookup uses the token's own casing; if the dictionary
      # keys are all lowercase, capitalized tokens never match here - confirm
      # whether a lowercased lookup was intended.
      if no_punc_word in sym_spell.words and no_punc_word not in ['i', "i'll"]:
        words[i] = words[i].replace(no_punc_word, no_punc_word.lower())

    checked_lines.append(' '.join(words))
  return '\n'.join(checked_lines)

# this function corrects line breaks
def line_merge(text):
  """Re-join words that were split across consecutive lines.

  Every line is tokenized on whitespace. For each pair of adjacent non-empty
  lines, the last token of the upper line and the first token of the lower
  line are merged (and the token removed from the lower line) when either:
    * the upper token ends with a hyphen - the hyphen is dropped; or
    * at least one of the two punctuation-stripped, lowercased tokens is not a
      key of `sym_spell.words`, but their concatenation is.

  Returns the lines joined with newlines, tokens within a line joined by
  single spaces.
  """
  rows = [raw.split() for raw in text.split('\n')]
  for idx in range(1, len(rows)):
    # Re-read both rows each time: the previous iteration may have replaced one.
    upper, lower = rows[idx - 1], rows[idx]
    if not upper or not lower:
      continue

    tail, head = upper[-1], lower[0]
    if tail.endswith('-'):
      # A line ending with a hyphen is always merged with the next token.
      upper[-1] = tail[:-1] + head
      rows[idx] = lower[1:]
      continue

    vocab = sym_spell.words
    tail_core = tail.strip(string.punctuation).lower()
    head_core = head.strip(string.punctuation).lower()
    either_unknown = tail_core not in vocab or head_core not in vocab
    if either_unknown and (tail_core + head_core) in vocab:
      upper[-1] = tail + head
      rows[idx] = lower[1:]

  return '\n'.join(' '.join(tokens) for tokens in rows)


In [None]:
# this function applies all three methods
def postprocess(text):
  """Run the full clean-up pipeline on one article.

  Applies, in order, `line_merge`, `spell_check` and `capitalization_check`
  and returns the resulting string.

  Non-string input (for example `None`, or the NaN that pandas produces for an
  empty CSV cell) carries no text to clean and is returned unchanged instead of
  failing on `.split`.
  """
  if not isinstance(text, str):
    return text
  merged = line_merge(text)
  checked = spell_check(merged)
  capitalization_normalized = capitalization_check(checked)
  return capitalization_normalized

In [None]:
### Loading csv file into memory

import os
import pandas as pd

# Resolve the input CSV inside the current working directory.
current_directory = os.getcwd()
file_name = "AS_Explor_Prox_Concat_Grouped.csv"
file_path = os.path.join(current_directory, file_name)

# Fail fast when the input is missing. Only printing a message would let the
# later cells fail with a NameError, or silently reuse a DataFrame left over
# from an earlier run of the kernel.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Error: File '{file_name}' not found in the current directory.")

# Load the CSV file into the DataFrame used by the following cells.
kw_hit_grouped_df = pd.read_csv(file_path)
print("File loaded successfully.")


In [None]:
# Applies postprocess function to kw_hit_grouped_df

# Define a function to track progress
def track_progress(iterable, prefix='', suffix='', decimals=1, length=100, fill='█'):
    """Build a callback that draws a one-line text progress bar.

    Args:
        iterable: Sized object; only its length is used, as the total count.
        prefix: Text printed before the bar.
        suffix: Text printed after the percentage.
        decimals: Number of decimal places shown in the percentage.
        length: Width of the bar in characters.
        fill: Character used for the completed part of the bar.

    Returns:
        A function taking the number of completed items. Each call reprints
        the bar on the same line and emits a newline once the count equals
        the total.
    """
    total = len(iterable)

    def print_progress(iteration):
        pct_text = f"{100 * (iteration / float(total)):.{decimals}f}"
        done_cells = int(length * iteration // total)
        todo_cells = length - done_cells
        bar = fill * done_cells + '-' * todo_cells
        # Carriage returns keep successive updates on one output line.
        print(f'\r{prefix} |{bar}| {pct_text}% {suffix}', end='\r')
        if iteration == total:
            print()

    return print_progress

# Define a function to apply postprocess to each element in the 'article' column
def apply_postprocess(article):
    """Post-process a single article and advance the progress bar.

    Increments the call counter stored on the function itself, reports the new
    count to the module-level `progress_tracker`, then returns
    `postprocess(article)`.
    """
    completed = apply_postprocess.counter + 1
    apply_postprocess.counter = completed
    progress_tracker(completed)
    return postprocess(article)

# Start the call counter at zero so the bar begins at 0% on every run of this cell.
apply_postprocess.counter = 0

# Size the progress bar to the number of rows in the DataFrame.
progress_tracker = track_progress(
    range(kw_hit_grouped_df.shape[0]),
    prefix='Progress:',
    suffix='Complete',
    length=50,
)

# Post-process every article; the 'article' column is overwritten with the result.
kw_hit_grouped_df['article'] = kw_hit_grouped_df['article'].apply(apply_postprocess)

print ("Finished!  Don't forget to save dataframe to csv!  It's in the next cell.")


In [None]:
# Write the post-processed DataFrame to 'kw_grouped_postprocess.csv'
# (relative path, so it lands in the working directory), without the index column.
kw_hit_grouped_df.to_csv(path_or_buf='kw_grouped_postprocess.csv', index=False)


In [None]:
# Preview the first rows. A bare last expression uses the notebook's rich
# table display instead of the plain-text dump that print() produces.
kw_hit_grouped_df.head()