In [40]:
import pandas as pd
import subprocess
import tempfile
import os
import json

In [41]:
# Read the BFRO reports CSV and attach empty placeholder columns that will
# later hold the geographic information extracted for each row.
csv_file_path = '/Users/main/Desktop/bfro_reports.csv' # update path
data = pd.read_csv(csv_file_path).assign(
    Location_Name=None,
    Latitude=None,
    Longitude=None,
)

# Free-text columns whose contents are joined into the text that gets geoparsed
relevant_fields = [
    'State',
    'County',
    'Location Details',
    'Nearest Town',
    'Nearest Road',
    'Observed',
    'Also Noticed',
]

In [42]:
def extract_geo_info_from_file(file_path, classpath=None):
    """Run the Apache Tika CLI on a file and pull geographic metadata from its output.

    Launches ``org.apache.tika.cli.TikaCLI -m <file_path>`` in a Java
    subprocess and scans the ``key: value`` lines printed on stdout for the
    ``Geographic_NAME``, ``Geographic_LATITUDE`` and ``Geographic_LONGITUDE``
    keys. If a key appears more than once, the last occurrence wins.

    Args:
        file_path: Path of the file handed to Tika.
        classpath: Optional Java classpath containing the Tika app jar, the
            NLP parser package and the location NER model. When omitted, the
            Homebrew install locations below are used.

    Returns:
        A ``(name, latitude, longitude)`` tuple of strings. Any value that is
        not present in Tika's output is ``'Unknown'``; all three are
        ``'Unknown'`` when the Java process exits with a non-zero status.

    Raises:
        OSError: If the ``java`` executable cannot be launched at all (this is
            not swallowed, so a broken setup is noticed immediately).
    """
    if classpath is None:
        classpath = '/opt/homebrew/opt/tika/libexec/tika-app.jar:/opt/homebrew/opt/tika/libexec/tika-parser-nlp-package.jar:/opt/homebrew/opt/tika/libexec/location-ner-model' # update paths

    command = [
        'java', '-Xmx4g',
        '-cp', classpath,
        'org.apache.tika.cli.TikaCLI',
        '-m', file_path
    ]

    # Metadata keys we care about, each starting at the 'Unknown' fallback.
    extracted = {
        'Geographic_NAME': 'Unknown',
        'Geographic_LATITUDE': 'Unknown',
        'Geographic_LONGITUDE': 'Unknown',
    }

    try:
        result = subprocess.run(command, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError:
        print(f"Error processing file: {file_path}")
    else:
        for line in result.stdout.splitlines():
            # Split on the first colon only, so values containing ':' stay intact.
            key, separator, value = line.partition(':')
            key = key.strip()
            # Lines without a colon carry no value; skipping them avoids the
            # IndexError the old split(':', 1)[1] raised. Matching on the key
            # (not the whole line) ignores other fields that merely mention
            # one of these names in their value.
            if separator and key in extracted:
                extracted[key] = value.strip()

    return (
        extracted['Geographic_NAME'],
        extracted['Geographic_LATITUDE'],
        extracted['Geographic_LONGITUDE'],
    )

In [39]:
# Process each row: build one text blob per report, run it through Tika via a
# temporary file, and store the extracted location back on the DataFrame.
for index, row in data.iterrows():
    # Concatenate the non-null relevant fields into a single string
    text_content = ' '.join(str(row[field]) for field in relevant_fields if pd.notnull(row[field]))

    # delete=False because the file must be closed (and therefore flushed)
    # before the Java subprocess reads it; it is removed explicitly below.
    # NOTE(review): the '.geot' suffix is presumably what routes the file to
    # Tika's geo-topic parser -- confirm against the Tika setup.
    tmpfile = tempfile.NamedTemporaryFile(delete=False, suffix='.geot', mode='w+')
    tmpfile_path = tmpfile.name
    try:
        with tmpfile:
            tmpfile.write(text_content)

        # Extract geographic information from the .geot file
        location_name, latitude, longitude = extract_geo_info_from_file(tmpfile_path)
    finally:
        # Always clean up the temporary file, even if writing or extraction
        # raised; previously a failure here leaked one temp file per row.
        os.remove(tmpfile_path)

    # Update the DataFrame with extracted information
    data.at[index, 'Location_Name'] = location_name
    data.at[index, 'Latitude'] = latitude
    data.at[index, 'Longitude'] = longitude

    print(f"row {index}, Full Output: {location_name, latitude, longitude}")

# Save the updated DataFrame to a new CSV file
data.to_csv('/Users/main/Desktop/assignment2/updated_output.csv', index=False) # update path
print("Completed. Geographic information extracted and added to the dataset.")

row 0, Full Output: ('Unknown', 'Unknown', 'Unknown')
row 1, Full Output: ('Lake Otis Park', '61.20111', '-149.83278')
row 2, Full Output: ('North Carolina', '35.50069', '-80.00032')
row 3, Full Output: ('Alaska', '64.00028', '-150.00028')
row 4, Full Output: ('Unknown', 'Unknown', 'Unknown')
row 5, Full Output: ('Town of Auburn', '42.19724', '-71.84527')
Completed. Geographic information extracted and added to the dataset.
