In [1]:
import json
import math

In [2]:
def read_data_from_file(filename):
    """
    Returns the complete text content of ``filename`` as one string.

    The file is opened in text mode and closed again before returning.
    """
    with open(filename, 'r') as source:
        contents = source.read()
    return contents

In [3]:
def parse_data(data):
    """
    Parses tab-separated "zip code<TAB>borough" records, one per line.

    Blank or whitespace-only lines are skipped, and surrounding whitespace is
    stripped from borough names so that e.g. "Bronx " and "Bronx" are treated
    as the same borough.

    Args:
        data: Text containing one "<zip code><TAB><borough>" record per line.

    Returns:
        A tuple ``(zip_borough_dict, zip_codes, boroughs)``:
          * zip_borough_dict: dict of int zip code -> borough name, with keys
            inserted in ascending zip code order.
          * zip_codes: ascending list of every zip code read (a zip code that
            appears on several lines appears several times).
          * boroughs: alphabetically sorted list of the distinct borough names.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            tab-separated fields, or the zip code is not an integer.
    """
    zip_borough_dict = {}
    zip_codes = []
    boroughs = set()

    for line in data.splitlines():
        # Tolerate blank separator/trailing lines instead of failing to unpack.
        if not line.strip():
            continue

        zip_code, borough = line.split("\t")
        zip_code = int(zip_code)  # int() already ignores surrounding whitespace
        borough = borough.strip()

        zip_borough_dict[zip_code] = borough
        zip_codes.append(zip_code)
        boroughs.add(borough)

    zip_codes.sort()

    # Rebuild the mapping so that iteration order follows ascending zip code.
    sorted_zip_borough_dict = dict(sorted(zip_borough_dict.items()))

    # Sort the boroughs: converting the set with list() gave an arbitrary order.
    return sorted_zip_borough_dict, zip_codes, sorted(boroughs)

In [4]:
def parse_neighborhood_data(data):
    """
    Parses lines of the form "UHF, neighborhood name, zip, zip, ...".

    Fields are separated by a comma followed by a space. Leading/trailing
    whitespace around each line is ignored and blank lines are skipped.

    Args:
        data: Text containing one neighborhood record per line.

    Returns:
        A tuple ``(neighborhood_dict, all_zip_codes)``:
          * neighborhood_dict: dict of zip code (str) -> [UHF, neighborhood name].
            If a zip code occurs on several lines, the last line wins.
          * all_zip_codes: list of every zip code (str) in the order read,
            including repeats.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
    """
    neighborhood_dict = {}
    all_zip_codes = []

    for line in data.splitlines():
        # Strip first so trailing whitespace does not end up in the last zip code,
        # and so blank lines can be skipped instead of raising IndexError.
        line = line.strip()
        if not line:
            continue

        parts = line.split(', ')
        uhf, neighborhood = parts[0], parts[1]

        for zip_code in parts[2:]:
            # A fresh list per zip code, so entries never share mutable state.
            neighborhood_dict[zip_code] = [uhf, neighborhood]
            all_zip_codes.append(zip_code)

    return neighborhood_dict, all_zip_codes

In [5]:
def parse_zip_code_data(text):
    """
    Builds a nested latitude -> longitude -> zip code lookup.

    Each non-blank line of ``text`` must hold exactly three comma-separated
    fields in the order zip code, latitude, longitude. Every field is stripped
    of surrounding whitespace and kept as a string.
    """
    lookup = {}
    for raw_line in text.split('\n'):
        # Ignore empty / whitespace-only lines.
        if not raw_line.strip():
            continue
        fields = [field.strip() for field in raw_line.split(',')]
        zip_code, latitude, longitude = fields
        lookup.setdefault(latitude, {})[longitude] = zip_code
    return lookup

In [6]:
def save_to_json(data, filename):
    """
    Serializes ``data`` as JSON with 4-space indentation into ``filename``.

    The target file is opened in write mode, so existing content is replaced.
    """
    with open(filename, "w") as out_file:
        json.dump(data, out_file, indent=4)


In [7]:
# Build the zip code -> borough lookup from the tab-separated source file
# and persist it as JSON.
file_path = "all_zip.txt"
data = read_data_from_file(file_path)
# zip_codes and boroughs are returned as well, but only the mapping is saved here.
zip_borough_dict, zip_codes, boroughs = parse_data(data)
# parse_data produces integer keys; json.dump writes them out as JSON strings.
save_to_json(zip_borough_dict, "zip_borough.json")

In [8]:
# Build the zip code -> [UHF, neighborhood name] lookup and persist it as JSON.
# NOTE: this cell rebinds file_path, data and zip_codes from the previous cell;
# zip_codes now holds the string zip codes returned by parse_neighborhood_data
# rather than the integers returned by parse_data.
file_path = "all_uhf.txt"
data = read_data_from_file(file_path)
neighborhood_dict, zip_codes = parse_neighborhood_data(data)
save_to_json(neighborhood_dict, "neighborhood_data.json")

In [9]:
# Build the nested latitude -> longitude -> zip code lookup from the
# comma-separated coordinates file and persist it as JSON.
file_path = "all_coordinates.txt"
file_content = read_data_from_file(file_path)
parsed_data = parse_zip_code_data(file_content)
save_to_json(parsed_data, "coordinates.json")

In [38]:
from geopy.distance import geodesic

def read_zipcode_file(filename):
    """Reads the zip code data from a file and returns a mapping of zip codes to coordinates.

    Each non-blank line must hold three comma-separated fields in the order
    zip code, latitude, longitude. Whitespace around each field is ignored and
    blank lines are skipped, matching how parse_zip_code_data reads the same
    kind of file.

    Args:
        filename: Path of the text file to read.

    Returns:
        dict of zip code (str) -> (latitude, longitude) tuple of floats. If a
        zip code occurs on several lines, the last line wins.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields, or
            latitude/longitude cannot be converted to float.
    """
    zipcode_mapping = {}

    with open(filename, 'r') as file:
        for line in file:
            # A blank line (e.g. at the end of the file) would otherwise fail to unpack.
            if not line.strip():
                continue

            # Split on the bare comma and strip, so "a, b, c" and "a,b,c" both parse.
            zip_code, lat, lon = (field.strip() for field in line.split(','))
            zipcode_mapping[zip_code] = (float(lat), float(lon))

    return zipcode_mapping

def find_nearest_zipcode(target_lat, target_lon, zipcode_mapping):
    """Returns the zip code whose coordinates are closest to the target point.

    Distances are geodesic distances in miles. When several zip codes are
    equally close, the one that comes first in ``zipcode_mapping`` is returned;
    an empty mapping yields ``None``.
    """
    origin = (target_lat, target_lon)

    def miles_from_origin(candidate):
        # Unpack explicitly so every mapping value must be a (lat, lon) pair.
        candidate_lat, candidate_lon = zipcode_mapping[candidate]
        return geodesic(origin, (candidate_lat, candidate_lon)).miles

    return min(zipcode_mapping, key=miles_from_origin, default=None)

# Load the zip code -> (latitude, longitude) mapping from the coordinates file.
file_path = "all_coordinates.txt"
zipcode_mapping = read_zipcode_file(file_path)

# Spot check: look up the zip code nearest to one sample coordinate and print it.
lat, lon = 40.710087, -74.00533
nearest_zip = find_nearest_zipcode(lat, lon, zipcode_mapping)
print(f"The nearest zip code is: {nearest_zip}")


The nearest zip code is: 10038


In [35]:
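# Manual spot checks for find_nearest_zipcode: each (lat, lon) pair is followed
# by the zip code the lookup is expected to return.

# (40.76394, -74.97903)
# expected: 10019 - outlier (longitude is about one degree west of the other
# points; possibly a typo for -73.97903 - verify)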

# (40.710087, -74.00533)
# expected: 10038

# (40.680252, -73.94954)
# expected: 11216

# (40.6923, -73.98285)
# expected: 11201

# (40.713013, -74.00398)
# expected: 10007 or 10278

# (40.74987, -73.983765)
# expected: 10018 or 10165

In [3]:
# Smoke test: create a tiny Spark DataFrame, convert it to pandas and write it to CSV.
# findspark.init() runs before the pyspark import - presumably so the local Spark
# installation can be located first; keep this order (TODO confirm it is required here).
import findspark
findspark.init()

from pyspark.sql import SparkSession

# getOrCreate() returns the already-running session if there is one.
spark = SparkSession.builder.appName("test").getOrCreate()
# Two-row sample frame with columns "id" and "label".
df = spark.createDataFrame([(1, "foo"), (2, "bar")], ["id", "label"])

# Collect the Spark DataFrame into a pandas DataFrame.
pandas_df = df.toPandas()

# Specify the path where you want to save the CSV file
output_path = "output.csv"

# Save the Pandas DataFrame as a CSV file (without the index column)
pandas_df.to_csv(output_path, index=False)
