The code below installs Flair, the NLP library needed to run this model; the model itself is hosted on the Hugging Face Hub. Here is the link to the model: https://huggingface.co/flair/ner-french

In [1]:
pip install flair

Collecting flair
  Downloading flair-0.14.0-py3-none-any.whl.metadata (12 kB)
Collecting boto3>=1.20.27 (from flair)
  Downloading boto3-1.35.10-py3-none-any.whl.metadata (6.6 kB)
Collecting conllu<5.0.0,>=4.0 (from flair)
  Downloading conllu-4.5.3-py2.py3-none-any.whl.metadata (19 kB)
Collecting deprecated>=1.2.13 (from flair)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl.metadata (5.4 kB)
Collecting ftfy>=6.1.0 (from flair)
  Downloading ftfy-6.2.3-py3-none-any.whl.metadata (7.8 kB)
Collecting langdetect>=1.0.9 (from flair)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mpld3>=0.3 (from flair)
  Downloading mpld3-0.5.10-py3-none-any.whl.metadata (5.1 kB)
Collecting pptree>=3.1 (from flair)
  Downloading pptree-3.1.tar.gz (3.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Coll

In [2]:
from flair.data import Sentence
from flair.models import SequenceTagger
from google.colab import drive
import pandas as pd
import os
import os
import csv
import re

In [5]:
# Mount Google Drive at /content/drive so the Drive-based input and output
# directories used by the later cells are reachable from this Colab runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Google Drive locations: the segmented documents to read, and where the NER
# results are written.
input_directory = '/content/drive/My Drive/Where In The World/Segmentation/Segmentation OUTPUT/'
output_directory = '/content/drive/My Drive/Where In The World/NER/'

# Report, for the input path and then the output path, whether it exists and
# (when it does) what it currently contains.
for dir_label, dir_path in (("Input", input_directory), ("Output", output_directory)):
  if not os.path.exists(dir_path):
    print(f"{dir_label} directory does not exist: {dir_path}")
    continue
  print(f"{dir_label} directory exists: {dir_path}")
  print("Contents of the directory:")
  print(os.listdir(dir_path))

# Names of the batch sub-folders inside the input directory: Batch_1 .. Batch_12.
doc_directory_names = [f'Batch_{batch_number}' for batch_number in range(1, 13)]

Input directory exists: /content/drive/My Drive/Where In The World/Segmentation/Segmentation OUTPUT/
Contents of the directory:
['Batch_1', 'Batch_2', 'Batch_3', 'Batch_4', 'Batch_5', 'Batch_6', 'Batch_7', 'Batch_8', 'Batch_9', 'Batch_10', 'Batch_11', 'Batch_12']
Output directory exists: /content/drive/My Drive/Where In The World/NER/
Contents of the directory:
['trial 1 - Flair, confidence threshold 0.7', 'trial 2 - Strava', 'Flair model || WITW NER.ipynb', 'location_CSV', 'all_ner', 'location_TXT']


In [7]:
# chat wrote the parse_string to fit the format of the NER output into a csv

def parse_string(text):
  """Split the string form of an NER span into its four fields.

  The text must begin with a pattern shaped like
  ``Span[3:5]: "New York" → LOC (0.9987)``.

  Returns:
    ``[span_marker, entity_text, tag, score]`` as strings, e.g.
    ``['Span[3:5]:', 'New York', 'LOC', '0.9987']``, or an empty list when
    the text does not start with that pattern.
  """
  span_pattern = r'(Span\[\d+:\d+\]:) "([^"]+)" → ([A-Z]+) \(([\d.]+)\)'
  found = re.match(span_pattern, text)
  if not found:
    return []
  # The pattern has exactly four capture groups, in output order.
  return list(found.groups())

In [8]:
# Load the pretrained "flair/ner-french" sequence tagger. Per the log printed
# below, it predicts LOC, PER, MISC and ORG tags, and the first load downloads
# the model weights (about 1.3 GB according to the progress output).
tagger = SequenceTagger.load("flair/ner-french")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


pytorch_model.bin:   0%|          | 0.00/1.30G [00:00<?, ?B/s]

2024-08-30 22:46:09,704 SequenceTagger predicts: Dictionary with 19 tags: O, S-LOC, B-LOC, E-LOC, I-LOC, S-PER, B-PER, E-PER, I-PER, S-MISC, B-MISC, E-MISC, I-MISC, S-ORG, B-ORG, E-ORG, I-ORG, <START>, <STOP>


In [10]:
# Run the NER tagger over every segmented document in one batch and write,
# per document: every detected entity (txt), the high-confidence locations
# (txt), and those same locations split into columns (csv).

# A LOC span is kept as a location only when its confidence exceeds this value.
LOCATION_SCORE_THRESHOLD = 0.8

extraction_directory = input_directory + doc_directory_names[0]
print("We are working on:")
print(extraction_directory)

# Build the output folders with os.path.join (avoids the doubled '/' that plain
# concatenation produced) and make sure they exist before opening files in them.
all_ner_directory = os.path.join(output_directory, 'all_ner')
location_txt_directory = os.path.join(output_directory, 'location_TXT')
location_csv_directory = os.path.join(output_directory, 'location_CSV')
for out_dir in (all_ner_directory, location_txt_directory, location_csv_directory):
  os.makedirs(out_dir, exist_ok=True)

for csvfile in os.scandir(extraction_directory):
  # Skip sub-directories and non-CSV files entirely; otherwise they would
  # still produce three empty output files each.
  if not (csvfile.is_file() and csvfile.name.endswith('.csv')):
    continue

  text_segments = []
  all_ner = []
  locations = []
  curr_document = csvfile.name

  # Get the segments: the first column of each non-empty row is one segment.
  with open(csvfile.path, 'r', newline='', encoding='utf-8') as f:
    for row in csv.reader(f):
      if row and str(row[0]).strip():  # ignore empty rows and blank segments
        text_segments.append(str(row[0]))

  # Run the NER model on each text segment.
  for segment in text_segments:
    sentence = Sentence(segment)
    tagger.predict(sentence)
    for entity in sentence.get_spans('ner'):
      # all_ner is the complete output, so every entity goes in, including
      # the high-confidence locations that are also collected separately.
      all_ner.append(entity)
      if entity.score > LOCATION_SCORE_THRESHOLD and entity.tag == 'LOC':
        locations.append(entity)

  # Save the whole NER output into a text file. UTF-8 is set explicitly because
  # every span string contains '→' and the text may contain accented characters.
  all_ner_path = os.path.join(all_ner_directory, curr_document + '_allner' + '.txt')
  with open(all_ner_path, 'w', encoding='utf-8') as file:
    for ner in all_ner:
      file.write(str(ner) + '\n')

  # Save the high-confidence locations into a text file.
  location_txt_path = os.path.join(location_txt_directory, curr_document + '_location' + '.txt')
  with open(location_txt_path, 'w', encoding='utf-8') as file:
    for location in locations:
      file.write(str(location) + '\n')

  # Write the locations into a CSV (no header row), one parsed span per row.
  # The handle is named out_csv so it does not rebind the loop variable csvfile.
  location_csv_path = os.path.join(location_csv_directory, curr_document + '_location' + '.csv')
  with open(location_csv_path, 'w', newline='', encoding='utf-8') as out_csv:
    csv_writer = csv.writer(out_csv)
    for location in locations:
      row = parse_string(str(location))
      if row:
        csv_writer.writerow(row)
      else:
        # parse_string returns [] when the span text does not fit its pattern;
        # report it instead of writing a blank CSV row.
        print(f"Could not parse span, left out of CSV: {location}")

print("Done!")

We are working on:
/content/drive/My Drive/Where In The World/Segmentation/Segmentation OUTPUT/Batch_1
Done!


In [None]:
# Build one CSV that maps each unique placename to the number of times it was
# extracted, across all per-document location CSVs.
# Further processing happens in Sheets.

# Step 1: Locate the folder holding the per-document location CSVs
folder_path = os.path.join(output_directory, 'location_CSV')

# A dictionary to store unique strings and their counts
unique_strings = {}

# Step 2: Read each CSV and process its second column
for csvfile in os.scandir(folder_path):
    # Only regular .csv files are location tables; skip anything else.
    if not (csvfile.is_file() and csvfile.name.endswith('.csv')):
        continue
    file_path = csvfile.path

    try:
        # The location CSVs are written without a header row, so header=None
        # keeps the first location of each file from being consumed as column
        # names. dtype=str keeps every cell as text so the .str accessor works
        # even for numeric-looking values.
        df = pd.read_csv(file_path, header=None, dtype=str)
    except pd.errors.EmptyDataError:
        # If the file is empty, skip it
        print(f"Skipped empty file: {file_path}")
        continue

    if df.shape[1] < 2:
        # No second column to read placenames from.
        print(f"Skipped file without a placename column: {file_path}")
        continue

    # Get the second column's data (the entity text), dropping missing cells
    # so len() below never sees NaN.
    second_column = df.iloc[:, 1].dropna()

    # Step 3: Normalize the strings and filter
    normalized_strings = second_column.str.title().str.strip()  # Convert to title case and strip spaces
    filtered_strings = [string for string in normalized_strings if len(string) > 1]

    # Step 4: Count the occurrences of each string
    for string in filtered_strings:
        unique_strings[string] = unique_strings.get(string, 0) + 1

# Step 5: Write the results to a new CSV
result_df = pd.DataFrame(list(unique_strings.items()), columns=['Placename', 'Count'])
output_path = os.path.join(output_directory, 'unique_place_counts.csv')
result_df.to_csv(output_path, index=False)

print("Process completed!")

print(f"CSV file saved in: {output_path}")


Skipped empty file: /content/drive/My Drive/Where In The World/NER//location_CSV/2225501.csv_location.csv
Skipped empty file: /content/drive/My Drive/Where In The World/NER//location_CSV/2225505.csv_location.csv
Skipped empty file: /content/drive/My Drive/Where In The World/NER//location_CSV/2225474.csv_location.csv
Skipped empty file: /content/drive/My Drive/Where In The World/NER//location_CSV/2225476.csv_location.csv
Skipped empty file: /content/drive/My Drive/Where In The World/NER//location_CSV/2225492.csv_location.csv
Skipped empty file: /content/drive/My Drive/Where In The World/NER//location_CSV/2225456.csv_location.csv
Skipped empty file: /content/drive/My Drive/Where In The World/NER//location_CSV/2225498.csv_location.csv
Skipped empty file: /content/drive/My Drive/Where In The World/NER//location_CSV/163122_2-1966-06.csv_location.csv
Process completed!
CSV file saved in: /content/drive/My Drive/Where In The World/NER/unique_place_counts.csv
