In [5]:
import pandas as pd
people = pd.read_csv('data/People.csv')

In [10]:
import csv
import random

def get_sample(file_path, n_samples=1000, seed=42):
    """Get a random sample of the data."""
    random.seed(seed)
    sampled_rows = []

    try:
        with open(file_path, 'r', newline='', encoding='utf-8') as file:
            reader = csv.DictReader(file)

            # Fill the reservoir with initial rows
            for _ in range(n_samples):
                try:
                    row = next(reader)
                    sampled_rows.append(row)
                except StopIteration:
                    print(f"File has fewer rows than requested samples ({len(sampled_rows)} rows).")
                    return sampled_rows

            # Reservoir sampling for remaining rows
            for row_num, row in enumerate(reader, start=n_samples):
                random_pick = random.randint(1, row_num)
                if random_pick < n_samples:
                    sampled_rows[random_pick] = row
                    
    except FileNotFoundError:
        print(f"Error: File {file_path} not found.")
    except Exception as e:
        print(f"Error while sampling data: {e}")
    return sampled_rows

In [11]:
people_sampled = get_sample('data/People.csv', 100)

In [15]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Using cached fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Using cached fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [25]:
import pandas as pd
from fuzzywuzzy import process

# Example list of valid states
valid_states = ["IL", "IN", "MI", "TX", "CO", "GA", "FL", "NJ", "IA", "MO", "NY", 
                "MA", "CA", "AR", "MN", "WI", "WA", "LA", "MD", "ME", "OH", "UT", 
                "NC", "NV", "TN", "DE", "VA", "AZ", "MS", "KS", "PA", "SC", "ID", 
                "WY", "KY", "DC", "OK", "NM", "AL", "MT", "HI", "OR", "RI", "ND", 
                "WV", "NH", "CT", "SD", "AK", "VT"]



def get_best_match(state, valid_states):
    """Find the best matching city using fuzzy matching."""
    best_match = process.extractOne(state, valid_states, score_cutoff=80)
    return best_match[0] if best_match else None

def add_best_match_column(data, city_key, valid_cities):
    """
    Add a new key 'BEST_MATCH_CITY' to each dictionary in the list.
    
    Args:
        data (list of dict): The sampled rows as a list of dictionaries.
        city_key (str): The key to look up city names in the dictionaries.
        valid_cities (list): List of valid city names for matching.
    
    Returns:
        list of dict: The updated data with the new 'BEST_MATCH_CITY' key.
    """
    for row in data:
        state = row.get("STATE", "")
        row["BEST_MATCH_STATE"] = get_best_match(state, valid_states)
    return data

# Example usage
file_path = "sample_file.csv"  # Replace with your file path
sampled_data = get_sample('data/People.csv', 100, seed=42)

# Add the best match for the 'CITY' column
sampled_data_with_matches = add_best_match_column(sampled_data, "STATE", valid_states)

# Print the updated rows (for demonstration purposes)
for row in sampled_data_with_matches[:10]:  # Display first 10 rows
    print(row)



{'PERSON_ID': 'O561555', 'PERSON_TYPE': 'DRIVER', 'RD_NO': 'JC113649', 'VEHICLE_ID': '535742.0', 'CRASH_DATE': '01/12/2019 12:01:00 AM', 'CITY': '', 'STATE': '', 'SEX': 'X', 'AGE': '', 'SAFETY_EQUIPMENT': 'USAGE UNKNOWN', 'AIRBAG_DEPLOYED': 'DEPLOYMENT UNKNOWN', 'EJECTION': 'UNKNOWN', 'INJURY_CLASSIFICATION': 'NO INDICATION OF INJURY', 'DRIVER_ACTION': 'UNKNOWN', 'DRIVER_VISION': 'UNKNOWN', 'PHYSICAL_CONDITION': 'UNKNOWN', 'BAC_RESULT': 'TEST NOT OFFERED', 'DAMAGE_CATEGORY': 'OVER $1,500', 'DAMAGE': '6069.0895692240765', 'BEST_MATCH_STATE': None}
{'PERSON_ID': 'O155095', 'PERSON_TYPE': 'DRIVER', 'RD_NO': 'JA255517', 'VEHICLE_ID': '154240.0', 'CRASH_DATE': '05/06/2017 11:13:00 AM', 'CITY': 'CICERO', 'STATE': 'IL', 'SEX': 'M', 'AGE': '', 'SAFETY_EQUIPMENT': 'SAFETY BELT USED', 'AIRBAG_DEPLOYED': 'DID NOT DEPLOY', 'EJECTION': 'NONE', 'INJURY_CLASSIFICATION': 'NO INDICATION OF INJURY', 'DRIVER_ACTION': 'FAILED TO YIELD', 'DRIVER_VISION': 'UNKNOWN', 'PHYSICAL_CONDITION': 'UNKNOWN', 'BAC_RES

In [26]:
for index, row in people.iterrows():
    original_city = row["CITY"]
    corrected_city = correggi_nome_citta(original_city)
    print(f"Original: {original_city}, Corrected: {corrected_city}")

NameError: name 'correggi_nome_citta' is not defined