In [1]:
from nltk.tokenize import sent_tokenize
import re
import statistics
import os
import glob
import csv
from google.colab import drive
from google.colab import auth

In [2]:
auth.authenticate_user()

!fusermount -u /content/drive  # Unmount first
drive.mount('/content/drive', force_remount=True)

# Check if the mount was successful
if os.path.exists('/content/drive/MyDrive'):
  print("Google Drive mounted successfully!")
else:
  print("Failed to mount Google Drive. Please check authentication and network.")

fusermount: failed to unmount /content/drive: No such file or directory
Mounted at /content/drive
Google Drive mounted successfully!


The cell below is a sanity check to ensure we are able to access the folder/directory where our corpus input and output folders are. If you put in the path to the folder correctly, you should see the files in your folder.

In [5]:
drive_path = '/content/drive/My Drive/Where In The World'

# Sanity check: list the contents of the project folder so the reader can
# confirm the path is right. Report a missing path explicitly instead of
# printing nothing, so a typo in drive_path is noticed immediately.
if os.path.exists(drive_path):
    print("Listing directories in google drive path:")
    print(os.listdir(drive_path))
else:
    print(f"Directory does not exist: {drive_path}")

Listing directories in google drive path:
['Segmentation', 'HuggingFace NER', 'Topic modeling Parti Pris', 'Summer 2024 Version - QuebecNationalLibraryScraper.ipynb', 'metadata', 'corpus', 'Em NER', 'Magazines.gsheet', "Em's research notes.gdoc"]


Make sure to change `directory_path` and `output_directory_path` to the correct paths on your computer. Use the cell above to check that you are accessing the correct input and output folders, especially the output folder.

In [6]:
# Input folder holding the .txt files and the folder the CSV output goes to.
directory_path = '/content/drive/My Drive/Where In The World/Segmentation/Additional files (temp)/'
output_directory_path = '/content/drive/My Drive/Where In The World/Segmentation OUTPUT/'

# CREATE OUTPUT DIRECTORY IF IT DOESN'T EXIST
# exist_ok=True makes the call a no-op when the directory is already there,
# so the cell is safe to re-run and has no check-then-create race.
os.makedirs(output_directory_path, exist_ok=True)

# CHECK INPUT PATH IS VALID
if os.path.exists(directory_path):
  print(f"Directory exists: {directory_path}")
  print("Contents of the directory:")
  print(os.listdir(directory_path))
else:
  print(f"Directory does not exist: {directory_path}")

Directory exists: /content/drive/My Drive/Where In The World/Segmentation/Additional files (temp)/
Contents of the directory:
['2224873.txt', '2224874.txt', '2224875.txt', '2224876.txt', '2224877.txt', '2224878.txt', '2224879.txt', '2224880.txt', '2225521.txt', '2225522.txt', '2225523.txt', '2225524.txt', '2225525.txt', '2225526.txt', '2225527.txt', '2225528.txt', '2225529.txt', '2225530.txt', '2225531.txt', '2225532.txt', '2225533.txt', '2225534.txt', '2225535.txt', '2225536.txt', '2225537.txt', '2225539.txt', '2225540.txt', '2225541.txt', '2225542.txt', '2225544.txt', '2225545.txt', '2225546.txt', '2225547.txt', '2225548.txt', '2225549.txt', '2225550.txt', '2225551.txt', '2225552.txt', '2225553.txt', '2225556.txt', '2225557.txt', '2225558.txt', '2225559.txt', '2225560.txt', '2225561.txt', '2225562.txt', '2225924.txt', '2225925.txt', '2225926.txt', '2225927.txt', '2225928.txt', '2225929.txt', '2225930.txt', '2225931.txt', '2225932.txt', '2225933.txt', '2225934.txt', '2225935.txt', '22

In [7]:
# CLEAN PARAGRAPHS
def clean_paragraph(paragraph):
  """Return the stripped paragraph, or None if it should be discarded.

  A paragraph is discarded when, after stripping surrounding whitespace, it:
    * contains any character other than word characters, whitespace, or the
      punctuation marks . , ! ? ' " - < >
    * contains a run of two or more consecutive whitespace characters.

  Note: the hyphen in the character class is escaped so it is matched
  literally. Unescaped, '"-<' is read as a character range that also lets
  through # $ % & ( ) * + / : ; which defeats the filter. If any of those
  characters (e.g. ':' or parentheses) should count as valid punctuation,
  add them to the class explicitly.

  Args:
    paragraph: The paragraph text to check.

  Returns:
    The stripped paragraph string if it passes both checks, otherwise None.
  """
  paragraph = paragraph.strip()

  # Remove paragraphs with special characters not in valid punctuation
  if re.search(r'[^\w\s.,!?\'"\-<>]', paragraph):
    return None

  # Remove paragraphs with two or more consecutive whitespace characters
  if re.search(r'\s{2,}', paragraph):
    return None

  return paragraph

In [8]:
# MERGE INCORRECTLY SPLIT PARAGRAPHS
def merge_paragraphs(paragraphs):
  """Re-join paragraphs that were split in the middle of a sentence.

  Each entry is stripped of surrounding whitespace. An entry is glued onto
  the pending paragraph (separated by a single space) when the pending text
  is non-empty, does not end with sentence-final punctuation or a closing
  quote, and the entry is non-empty and starts with a lowercase character.
  Otherwise the pending paragraph is emitted and the entry starts a new one.
  Empty entries are never emitted.

  Args:
    paragraphs: Iterable of paragraph strings.

  Returns:
    A list of merged, non-empty paragraph strings in their original order.
  """
  sentence_enders = ('.', '?', '!', '"', "'", '”', '’')
  result = []
  pending = ""

  for raw in paragraphs:
    piece = raw.strip()

    # A continuation needs an unfinished pending paragraph and a lowercase start.
    is_continuation = (
      bool(pending)
      and bool(piece)
      and not pending.endswith(sentence_enders)
      and piece[0].islower()
    )

    if is_continuation:
      pending = pending + " " + piece
      continue

    # Otherwise flush what we have and start over with this piece.
    if pending:
      result.append(pending)
    pending = piece

  # Flush the final pending paragraph, if any.
  if pending:
    result.append(pending)

  return result

In [9]:
# GET LIST OF FILES IN DIRECTORY
# Sorted so files are processed in the same order on every run.
file_list = sorted(glob.glob(os.path.join(directory_path, '*.txt')))

for file_path in file_list:
  # Read the whole file up front so it is closed before processing starts.
  # The encoding is explicit so the result does not depend on the runtime locale.
  with open(file_path, 'r', encoding='utf-8') as f:
    text = f.read()

  # SPLIT INTO PARAGRAPHS
  paragraphs = text.split('\n')

  # MERGE INCORRECTLY SPLIT PARAGRAPHS
  merged_paragraphs = merge_paragraphs(paragraphs)

  # CLEAN THE PARAGRAPHS
  cleaned_paragraphs = [clean_paragraph(paragraph) for paragraph in merged_paragraphs]
  cleaned_paragraphs = [paragraph for paragraph in cleaned_paragraphs if paragraph is not None]

  # statistics.mean() raises StatisticsError on an empty list, so a file with
  # no usable paragraphs is skipped instead of aborting the whole batch.
  if not cleaned_paragraphs:
    print(f"No usable paragraphs found, skipping: {file_path}")
    continue

  # FILTER FOR LENGTH >= MEAN
  paragraph_lengths = [len(paragraph) for paragraph in cleaned_paragraphs]
  mean_paragraph_length = statistics.mean(paragraph_lengths)
  filtered_paragraphs_mean = [paragraph for paragraph in cleaned_paragraphs if len(paragraph) >= mean_paragraph_length]

  # SAVE OUTPUT TO CSV FILE
  # splitext swaps only the final extension; str.replace would also rewrite
  # a '.txt' appearing elsewhere in the file name.
  file_name = os.path.splitext(os.path.basename(file_path))[0] + '.csv'
  output_file_path = os.path.join(output_directory_path, file_name)

  # One paragraph per row, single column.
  with open(output_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    for paragraph in filtered_paragraphs_mean:
      writer.writerow([paragraph])

  print(f"Filtered content saved to: {output_file_path}")

Filtered content saved to: /content/drive/My Drive/Where In The World/Segmentation OUTPUT/2224873.csv
Filtered content saved to: /content/drive/My Drive/Where In The World/Segmentation OUTPUT/2224874.csv
Filtered content saved to: /content/drive/My Drive/Where In The World/Segmentation OUTPUT/2224875.csv
Filtered content saved to: /content/drive/My Drive/Where In The World/Segmentation OUTPUT/2224876.csv
Filtered content saved to: /content/drive/My Drive/Where In The World/Segmentation OUTPUT/2224877.csv
Filtered content saved to: /content/drive/My Drive/Where In The World/Segmentation OUTPUT/2224878.csv
Filtered content saved to: /content/drive/My Drive/Where In The World/Segmentation OUTPUT/2224879.csv
Filtered content saved to: /content/drive/My Drive/Where In The World/Segmentation OUTPUT/2224880.csv
Filtered content saved to: /content/drive/My Drive/Where In The World/Segmentation OUTPUT/2225521.csv
Filtered content saved to: /content/drive/My Drive/Where In The World/Segmentation