This code calculates the total funding amount, total population, median household income, and percentage of the population identifying as White for each city. It groups the data by city, computes funding per capita, and sorts the cities in descending order based on funding per capita, displaying the top results.


In [19]:
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv(r"https://raw.githubusercontent.com/DazzedUpDas/Markey-Dataset/main/new_data.csv")
df.columns

Index(['Unique ID', 'Agency Name', 'Bureau Name', 'Program Name', 'Category',
       'Subcategory', 'Project Name', 'City', 'County', 'State',
       'Funding Amount Excluding Loans', 'Funding Source', 'Program Type',
       'Project on Map', 'Historic ID', 'GEO_ID', 'Total_Population',
       'One_Race', 'White_Alone', 'Black_or_African_American_alone',
       'American_Indian_and_Alaska_Native_alone', 'Asian_alone',
       'Native_Hawaiian_and_Other_Pacific_Islander_alone',
       'Some_Other_Race_alone', 'Population_of_two_or_more_races',
       'Median_household_income'],
      dtype='object')

## Map of Massachusetts by county ##

In [47]:
from geopy.geocoders import Nominatim
import time

# List of unique counties
counties = list(df['County'].unique())

# Create dictionary to store county coordinates
geolocator = Nominatim(user_agent="massachusetts_counties")
county_coordinates = {}

print("Fetching coordinates for counties...")
for county in counties:
    try:
        location = geolocator.geocode(f"{county}, Massachusetts")
        if location:
            county_coordinates[county] = (location.latitude, location.longitude)
        else:
            county_coordinates[county] = None
        time.sleep(1)  # Avoid hitting the API rate limit
    except Exception as e:
        print(f"Error fetching coordinates for {county}: {e}")
        county_coordinates[county] = None

# Print or save the county coordinates
print(county_coordinates)

# Optionally save to a CSV file
coordinates_df = pd.DataFrame.from_dict(county_coordinates, orient='index', columns=['Latitude', 'Longitude'])
coordinates_df.reset_index(inplace=True)
coordinates_df.rename(columns={'index': 'County'}, inplace=True)
coordinates_df.to_csv('massachusetts_county_coordinates.csv', index=False)

print("Coordinates fetched and saved to 'massachusetts_county_coordinates.csv'.")

Fetching coordinates for counties...
{'Plymouth': (41.9426657, -70.7618592), 'Middlesex': (42.485452, -71.3968261), 'Hampshire': (42.3432499, -72.6213339), 'Essex': (42.6320389, -70.7828255), 'Dukes': (41.3926378, -70.642011), 'Bristol': (41.7425538, -71.0856545), 'Barnstable': (41.7016936, -70.3036163), 'Worcester': (42.2625621, -71.8018877), 'Norfolk': (42.1538607, -71.1828015), 'Suffolk': (42.3544455, -70.9788771), 'Franklin': (42.5896205, -72.6110645), 'Hampden': (42.1285315, -72.6063441), 'Berkshire': (42.3999954, -73.2322639), 'Nantucket': (41.2727997, -70.0951867), 'Berkshire ': (42.3999954, -73.2322639)}
Coordinates fetched and saved to 'massachusetts_county_coordinates.csv'.


Display map code

In [50]:
import pandas as pd
import folium
import json
import requests
from folium.features import GeoJsonTooltip
from IPython.display import display

df = pd.read_csv(r"https://raw.githubusercontent.com/DazzedUpDas/Markey-Dataset/main/new_data.csv")

# Population data from https://malegislature.gov/Redistricting/MassachusettsCensusData/County
county_coordinates = {
    "Plymouth": (41.9426657, -70.7618592),
    "Middlesex": (42.485452, -71.3968261),
    "Hampshire": (42.3432499, -72.6213339),
    "Essex": (42.6320389, -70.7828255),
    "Dukes": (41.3926378, -70.642011),
    "Bristol": (41.7425538, -71.0856545),
    "Barnstable": (41.7016936, -70.3036163),
    "Worcester": (42.2625621, -71.8018877),
    "Norfolk": (42.1538607, -71.1828015),
    "Suffolk": (42.3544455, -70.9788771),
    "Franklin": (42.5896205, -72.6110645),
    "Hampden": (42.1285315, -72.6063441),
    "Berkshire": (42.3999954, -73.2322639),
    "Nantucket": (41.2727997, -70.0951867)
}

county_population = {
    "Plymouth": 530819,
    "Middlesex": 1632002,
    "Hampshire": 162308,
    "Essex": 809829,
    "Dukes": 20600,
    "Bristol": 579200,
    "Barnstable": 228996,
    "Worcester": 862111,
    "Norfolk": 725981,
    "Suffolk": 797936,
    "Franklin": 71029,
    "Hampden": 465825,
    "Berkshire": 129026,
    "Nantucket": 14255
}

# Group by 'County' to calculate funding and income
county_stats = df[df['County'].isin(county_coordinates.keys())].groupby('County').agg({
    'Funding Amount Excluding Loans': 'sum',
}).reset_index()

# Add population data from the dictionary
county_stats['Total_Population'] = county_stats['County'].map(county_population)

# Calculate funding per capita
county_stats['Funding_Per_Capita'] = county_stats['Funding Amount Excluding Loans'] / county_stats['Total_Population']

# Normalize funding for better visualization
county_stats['Funding (in Millions)'] = county_stats['Funding Amount Excluding Loans'] / 1e6

# GetGeoJSON data from the URL
geojson_url = "https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json"
response = requests.get(geojson_url)
geojson_data = response.json()

# Filter GeoJSON for Massachusetts counties only(Mass is state number 25)
massachusetts_geojson_features = [
    feature for feature in geojson_data['features']
    if feature['properties']['NAME'] in county_coordinates.keys()
    and feature['properties'].get('STATE') == "25"  
]
geojson_data['features'] = massachusetts_geojson_features



# Make dictionary for county funding data
funding_data = {
    row['County']: {
        "funding": row['Funding (in Millions)'],
        "population": row['Total_Population'],
        "funding_per_capita": row['Funding_Per_Capita']
    }
    for _, row in county_stats.iterrows()
}

# Add funding, population, and funding per capita to the GeoJSON
for feature in geojson_data['features']:
    county_name = feature['properties']['NAME']
    if county_name in funding_data:
        feature['properties']['Funding'] = funding_data[county_name]['funding']
        feature['properties']['Population'] = funding_data[county_name]['population']
        feature['properties']['Funding_Per_Capita'] = funding_data[county_name]['funding_per_capita']  # Add funding per capita
    else:
        feature['properties']['Funding'] = None
        feature['properties']['Population'] = None
        feature['properties']['Funding_Per_Capita'] = None  # Set funding per capita to None if not found


# Create the base map
ma_map = folium.Map(location=[42.4072, -71.3824], zoom_start=8)

# Create a Choropleth layer for funding distribution
choropleth = folium.Choropleth(
    geo_data=geojson_data,
    data=county_stats,
    columns=["County", "Funding_Per_Capita"],
    key_on="feature.properties.NAME",
    fill_color="YlOrRd",
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name="Funding_Per_Capita (USD)",
    threshold_scale=[10, 250, 490, 730, 1200]
).add_to(ma_map)

# Add tooltips with hover information
tooltip = GeoJsonTooltip(
    fields=["NAME", "Funding", "Population", "Funding_Per_Capita"],
    aliases=["County:", "Funding (Millions):", "Population:", "Funding_Per_Capita (USD):"],
    localize=True,
    sticky=True,
    labels=True,
    style=(
        "background-color: white; "
        "border: 1px solid black; "
        "border-radius: 3px; "
        "padding: 5px;"
    )
)

# Add GeoJSON layer for interactivity
folium.GeoJson(
    geojson_data,
    style_function=lambda x: {
        'color': 'black',
        'weight': 0.5,
        'fillOpacity': 0.7,
    },
    tooltip=tooltip
).add_to(ma_map)

display(ma_map)
# Sort cities by Funding_Per_Capita in descending order
sorted_county_stats = county_stats.sort_values(by='County', ascending=False).reset_index(drop=True)

print(sorted_county_stats[['County', 'Funding (in Millions)', 'Total_Population', 'Funding_Per_Capita',]])



        County  Funding (in Millions)  Total_Population  Funding_Per_Capita
0    Worcester              68.174689            862111           79.078784
1      Suffolk             848.155832            797936         1062.937168
2     Plymouth              69.588039            530819          131.095607
3      Norfolk              18.056561            725981           24.871947
4    Nantucket               5.206773             14255          365.259418
5    Middlesex             113.633914           1632002           69.628538
6    Hampshire               7.212826            162308           44.439129
7      Hampden             201.158323            465825          431.832390
8     Franklin              10.748500             71029          151.325515
9        Essex             149.861438            809829          185.053188
10       Dukes               2.046981             20600           99.368010
11     Bristol             131.927995            579200          227.776234
12   Berkshi

This code calculates the total funding amount, total population, median household income, and percentage of the population identifying as White for each city. It groups the data by city, computes funding per capita, and sorts the cities in descending order based on funding per capita, displaying the top results.

In [18]:
import pandas as pd
import matplotlib.pyplot as plt

# Ensure all columns print side by side
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)

# Group by 'City' to get total funding amount, total population, and White population for each city
city_stats = df.groupby('City').agg({
    'Funding Amount Excluding Loans': 'sum',
    'Total_Population': 'max', 
    'White_Alone': 'max',
    'Median_household_income': 'max'
}).reset_index()

# Calculate funding per capita
city_stats['Funding_Per_Capita'] = city_stats['Funding Amount Excluding Loans'] / city_stats['Total_Population']

# Calculate percentage of White population
city_stats['Percentage_White'] = (city_stats['White_Alone'] / city_stats['Total_Population']) * 100

# Sort cities by Funding_Per_Capita in descending order
sorted_city_stats = city_stats.sort_values(by='Median_household_income', ascending=False).reset_index(drop=True)


print(sorted_city_stats[['City', 'Funding_Per_Capita', 'Percentage_White', 'Median_household_income']])



            City  Funding_Per_Capita  Percentage_White  \
0         weston           10.399629         75.149776   
1      hopkinton            2.700874         74.096386   
2        harvard            4.105240         80.937089   
3        norwell           33.124835         91.877368   
4        concord          182.188632         82.943053   
..           ...                 ...               ...   
144   greenfield          115.675934         84.725349   
145   fall river          252.029043         73.361702   
146     lawrence          135.688613         20.009423   
147      holyoke         3073.111381         52.262148   
148  springfield          343.173624         36.721841   

     Median_household_income  
0                     206250  
1                     172683  
2                     170250  
3                     162091  
4                     160392  
..                       ...  
144                    46250  
145                    46007  
146                    4

THis calculates sorted stats for counties

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Ensure all columns print side by side
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)

# Group by 'County' to get total funding amount, total population, and White population for each city
city_stats = df.groupby('County').agg({
    'Funding Amount Excluding Loans': 'sum',
    'Total_Population': 'max', 
    'White_Alone': 'max',
    'Median_household_income': 'max'
}).reset_index()

# Calculate funding per capita
city_stats['Funding_Per_Capita'] = city_stats['Funding Amount Excluding Loans'] / city_stats['Total_Population']

# Calculate percentage of White population
city_stats['Percentage_White'] = (city_stats['White_Alone'] / city_stats['Total_Population']) * 100

# Sort cities by Funding_Per_Capita in descending order
sorted_city_stats = city_stats.sort_values(by='Median_household_income', ascending=False).reset_index(drop=True)


print(sorted_city_stats[['City', 'Funding_Per_Capita', 'Percentage_White', 'Median_household_income']])


In [None]:
import pandas as pd
import folium
import requests
from folium.features import GeoJsonTooltip

# Load the data
df = pd.read_csv(r"https://raw.githubusercontent.com/DazzedUpDas/Markey-Dataset/main/new_data.csv")

# Define Massachusetts counties and coordinates
county_coordinates = {
    "Plymouth": (41.9426657, -70.7618592),
    "Middlesex": (42.485452, -71.3968261),
    "Hampshire": (42.3432499, -72.6213339),
    "Essex": (42.6320389, -70.7828255),
    "Dukes": (41.3926378, -70.642011),
    "Bristol": (41.7425538, -71.0856545),
    "Barnstable": (41.7016936, -70.3036163),
    "Worcester": (42.2625621, -71.8018877),
    "Norfolk": (42.1538607, -71.1828015),
    "Suffolk": (42.3544455, -70.9788771),
    "Franklin": (42.5896205, -72.6110645),
    "Hampden": (42.1285315, -72.6063441),
    "Berkshire": (42.3999954, -73.2322639),
    "Nantucket": (41.2727997, -70.0951867)
}

# Group by 'County' to calculate total funding, population, etc.
county_stats = df[df['County'].isin(county_coordinates.keys())].groupby('County').agg({
    'Funding Amount Excluding Loans': 'sum',
    'Total_Population': 'max',
    'Median_household_income': 'max'
}).reset_index()

# Normalize funding for better visualization
county_stats['Funding (in Millions)'] = county_stats['Funding Amount Excluding Loans'] / 1e6

# Fetch GeoJSON data
geojson_url = "https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json"
response = requests.get(geojson_url)
geojson_data = response.json()

# Filter GeoJSON for Massachusetts counties only
massachusetts_geojson_features = [
    feature for feature in geojson_data['features']
    if feature['properties']['NAME'] in county_coordinates.keys()
]
geojson_data['features'] = massachusetts_geojson_features

# Create a dictionary for county funding data
funding_data = {
    row['County']: {
        "funding": row['Funding (in Millions)'],
        "population": row['Total_Population'],
        "income": row['Median_household_income']
    }
    for _, row in county_stats.iterrows()
}

# Add funding, population, and income properties to the GeoJSON
for feature in geojson_data['features']:
    county_name = feature['properties']['NAME']
    if county_name in funding_data:
        feature['properties']['Funding'] = funding_data[county_name]['funding']
        feature['properties']['Population'] = funding_data[county_name]['population']
        feature['properties']['Income'] = funding_data[county_name]['income']
    else:
        feature['properties']['Funding'] = None
        feature['properties']['Population'] = None
        feature['properties']['Income'] = None

# Create the base map
ma_map = folium.Map(location=[42.4072, -71.3824], zoom_start=8)

# Create a Choropleth layer for funding distribution
choropleth = folium.Choropleth(
    geo_data=geojson_data,
    data=county_stats,
    columns=["County", "Funding (in Millions)"],
    key_on="feature.properties.NAME",
    fill_color="YlOrRd",
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name="Total Funding (in Millions)"
).add_to(ma_map)

# Add tooltips with hover information
tooltip = GeoJsonTooltip(
    fields=["NAME", "Funding", "Population", "Income"],
    aliases=["County:", "Funding (Millions):", "Population:", "Median Household Income:"],
    localize=True,
    sticky=True,
    labels=True,
    style=(
        "background-color: white; "
        "border: 1px solid black; "
        "border-radius: 3px; "
        "padding: 5px;"
    )
)

# Add GeoJSON layer for interactivity
folium.GeoJson(
    geojson_data,
    style_function=lambda x: {
        'fillColor': 'red' if x['properties']['Funding'] else 'gray',
        'color': 'black',
        'weight': 0.5,
        'fillOpacity': 0.7,
    },
    tooltip=tooltip
).add_to(ma_map)

# Save the map
ma_map.save("massachusetts_funding_map_interactive2.html")

print("Interactive map saved as 'massachusetts_funding_map_interactive2.html'. Open this file in your browser to view it.")



Interactive map saved as 'massachusetts_funding_map_interactive2.html'. Open this file in your browser to view it.


This code creates a scatter plot comparing the funding per capita to percent of the city's population that identifies as White. Note - only cities with at least 5000 people were included in this comparison. There is no obvious correlation between the percent White of a city and its funding per capita with a correlation coefficient of -0.14.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Only look at cities with at least 5000 people and Funding Per Capita below 3000 to remove outliers
population_threshold = 5000
funding_cap = 3000

# Filter out cities by thresholds
filtered_city_stats = city_stats[(city_stats['Total_Population'] >= population_threshold) & (city_stats['Funding_Per_Capita'] < funding_cap)]

# Find correlation from filtered data
filtered_correlation = filtered_city_stats['Funding_Per_Capita'].corr(filtered_city_stats['Percentage_White'])

# Calculate the line of best fit using numpy
X = filtered_city_stats['Percentage_White']
y = filtered_city_stats['Funding_Per_Capita']

# Perform linear regression
slope, intercept = np.polyfit(X, y, 1)

# Generate the regression line
line_of_best_fit = slope * X + intercept

# Make scatter plot
plt.figure(figsize=(10, 6))
plt.scatter(X, y, alpha=0.6)
plt.plot(X, line_of_best_fit, color='blue', linewidth=3, label='Regression Line')
plt.xlabel('Percentage of population identifying as White (%)', fontsize=14)  # Increase font size for x-axis
plt.ylabel('Funding Per Capita (USD)', fontsize=14)  # Increase font size for y-axis
plt.title('Funding Per Capita vs Percentage of population identifying as White for cities in Massachusetts')

# Add correlation on the plot
plt.text(0.75, 0.95, f'Correlation: {filtered_correlation:.2f}', transform=plt.gca().transAxes, fontsize=12, verticalalignment='top', bbox=dict(boxstyle="round,pad=0.3", edgecolor="black", facecolor="white"))

plt.show()

Look at funding per captia vs median housing income. Holyoke was a major outlier so it was removed by adding a funding_cap of $3000 per capita. This however only changed the correlation coefficiten from -0.30 to -0.29 suggesting a moderate negative relationship.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Only look at cities with at least 5000 people and Funding Per Capita below 3000 to remove outliers
population_threshold = 5000
funding_cap = 3000

# Filter out cities by threshold
filtered_city_stats = city_stats[(city_stats['Total_Population'] >= population_threshold) & (city_stats['Funding_Per_Capita'] < funding_cap)]

# Calculate the correlation on the filtered data
income_correlation = filtered_city_stats['Funding_Per_Capita'].corr(filtered_city_stats['Median_household_income'])

# Calculate the line of best fit using numpy
X = filtered_city_stats['Median_household_income']
y = filtered_city_stats['Funding_Per_Capita']

# Perform linear regression
slope, intercept = np.polyfit(X, y, 1)

# Generate the regression line
line_of_best_fit = slope * X + intercept

# MAke scatter plot
plt.figure(figsize=(10, 6))
plt.scatter(filtered_city_stats['Median_household_income'], filtered_city_stats['Funding_Per_Capita'], alpha=0.6)
plt.plot(X, line_of_best_fit, color='blue', linewidth=3, label='Regression Line')
plt.xlabel('Median Household Income (USD)', fontsize=14)  # Increase font size for x-axis
plt.ylabel('Funding Per Capita (USD)', fontsize=14)  # Increase font size for y-axis
plt.title('Funding Per Capita vs Median Household Income for cities in Massachusetts ')

# Add correlation to the plot
plt.text(0.75, 0.95, f'Correlation: {income_correlation:.2f}', transform=plt.gca().transAxes, fontsize=12, verticalalignment='top', bbox=dict(boxstyle="round,pad=0.3", edgecolor="black", facecolor="white"))

plt.show()

CLustering based on % White and Median household income. Of note - there are poor white towns, rich white towns, poor non-white towns but virtually no rich non-white towns.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

data = city_stats[['Percentage_White', 'Median_household_income']]

# Standardize the data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Use the elbow method to find the optimal number of clusters
# We can always adjust number of clusters depending on graph
wcss = []
max_clusters = 10 \

for i in range(1, max_clusters + 1):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(data_scaled)
    wcss.append(kmeans.inertia_)

# Plot elbow curve
plt.figure(figsize=(10, 6))
plt.plot(range(1, max_clusters + 1), wcss, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.show()

# Update based on the elbow point from the plot
optimal_clusters = 3

# K-means clustering
kmeans_optimal = KMeans(n_clusters=optimal_clusters, random_state=42)
city_stats['Cluster'] = kmeans_optimal.fit_predict(data_scaled)

# Plot clusters
plt.figure(figsize=(10, 6))
plt.scatter(city_stats['Percentage_White'], city_stats['Median_household_income'], c=city_stats['Cluster'], cmap='viridis', alpha=0.6)
plt.xlabel('Percentage of Population that identifies as White (%)')
plt.ylabel('Median Household Income (USD)')
plt.title(f'K-means Clustering of Cities in Mass (k={optimal_clusters})')
plt.colorbar(label='Cluster')
plt.show()

Lets say cambridge is a very diverse neighborhood, If you send more grant money to cambridge is it possible that you are just making more white college educated people move there, drive rent prices up and clear out lower income people? Is this ironic way of helping poor people?

In [None]:

# import geopandas as gpd
# import folium

# # Assuming city coordinates are available in a separate file or can be added to the dataset.
# # Load a basic map centered on your region of interest
# map_center = [42.3601, -71.0589]  # Coordinates for Boston, MA
# map_plot = folium.Map(location=map_center, zoom_start=8)

# # Add funding data points to the map (mock coordinates)
# for _, row in df.iterrows():
#     if pd.notna(row['Latitude']) and pd.notna(row['Longitude']):
#         folium.CircleMarker(
#             location=(row['Latitude'], row['Longitude']),
#             radius=5,
#             color='blue',
#             fill=True,
#             fill_color='blue',
#             fill_opacity=0.6,
#             popup=f"{row['City']}: ${row['Funding Amount Excluding Loans']}"
#         ).add_to(map_plot)

# # Display the map
# map_plot




In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the CSV file


# Strip column names to remove any leading or trailing spaces
df.columns = df.columns.str.strip()

# Check if the columns 'County' and 'Funding Amount Excluding Loans' exist
if "County" in df.columns and "Funding Amount Excluding Loans" in df.columns:
    # Drop rows with NaN values in the 'County' and 'Funding Amount Excluding Loans' columns
    df = df.dropna(subset=["County", "Funding Amount Excluding Loans"])

    # Convert the 'Funding Amount Excluding Loans' column to numeric, forcing errors to NaN (ignoring non-numeric values)
    df['Funding Amount Excluding Loans'] = pd.to_numeric(df['Funding Amount Excluding Loans'].replace(r'[\$,]', '', regex=True), errors='coerce')

    # Group by 'County' and sum up the 'Funding Amount Excluding Loans'
    county_funding = df.groupby('County')['Funding Amount Excluding Loans'].sum().reset_index()

    # Calculate the percentage of loans for each county
    county_funding['Percentage'] = (county_funding['Funding Amount Excluding Loans'] / county_funding['Funding Amount Excluding Loans'].sum()) * 100

    # Print the result
    print("Percentage of grant funding by county:")
    print(county_funding)

    # Plotting a horizontal bar chart for the percentage of loans by county
    plt.figure(figsize=(12, 8))
    plt.barh(county_funding['County'], county_funding['Percentage'], color='skyblue')
    plt.xlabel('Percentage of Total Funding (%)')
    plt.ylabel('County')
    plt.title('Percentage of Grant Funding by County')
    plt.grid(axis='x', linestyle='--', alpha=0.7)

    # Show the bar chart
    plt.tight_layout()
    plt.show()

else:
    print("One or both of the specified columns do not exist in the DataFrame.")

Zip Codes

In [None]:
import geopandas as gpd

# Download data here
# https://www.mass.gov/info-details/massgis-data-zip-codes-5-digit-from-here-navteq

# Specify the path to the shapefile (without the .shp extension)
shapefile_path = r"C:\BU_MSDS\DS701\Markey\zipcodes_nt\ZIPCODES_NT_POLY"

# Read the shapefile using geopandas
try:
    gdf = gpd.read_file(f"{shapefile_path}.shp")

    # Display the first few rows of the GeoDataFrame
    print("Contents of the GeoDataFrame:")
    print(gdf.to_string(index=False))

    # Check for a column that might contain ZIP codes
    if 'POSTCODE' in gdf.columns:  # Replace 'ZIP_CODE' with the actual column name if different
        zip_codes = gdf['POSTCODE'].unique()  # Get unique ZIP codes
        print("Found ZIP Codes:")
        for code in zip_codes:
            print(code)
    else:
        print("No ZIP_CODE column found in the shapefile.")

    # Save the attribute data (without geometry) to CSV
    csv_path = "C:\BU_MSDS\DS701\Markey\zipcodes_nt\zipcodes.csv"  # New path with a .csv extension
    gdf.drop(columns='geometry').to_csv(csv_path, index=False)

except FileNotFoundError:
    print("Shapefile not found. Please check the path.")
except Exception as e:
    print("An unexpected error occurred:", str(e))

finally:
    # You can add code here that will always execute, 
    # regardless of exceptions (e.g., closing files)
    pass  # For now, we have nothing to add here

print("Processing complete.")   

zipcode convert csv

In [None]:
file_path = r"c:\BU_MSDS\DS701\Markey\zipcodes_nt\zipcodes.csv"
df = pd.read_csv(file_path)

print(df.head())

Massachussetts Map

In [None]:
from geopy.geocoders import Nominatim
import time
import folium
import pandas as pd

# Group data to find funding and population per city
population_per_city = df.groupby('City')['Total_Population'].sum().reset_index()
funding_per_city = df.groupby('City')['Funding Amount Excluding Loans'].sum().reset_index()
population_funding = population_per_city.merge(funding_per_city, on='City')
population_funding['Per_Capita_Funding'] = (
    population_funding['Funding Amount Excluding Loans'] / population_funding['Total_Population']
)

# Create dictionary to store city coordinates
geolocator = Nominatim(user_agent="massachusetts_cities")
cities = list(population_funding['City'].unique())
city_coordinates = {}

for city in cities:
    location = geolocator.geocode(f"{city}, Massachusetts")
    if location:
        city_coordinates[city] = (location.latitude, location.longitude)
    else:
        city_coordinates[city] = None
    time.sleep(1)

Make Map

In [None]:
# import pandas as pd
# import folium
# from folium import GeoJsonTooltip

# # Load the dataset
# df = pd.read_csv("https://raw.githubusercontent.com/DazzedUpDas/Markey-Dataset/main/new_data.csv")

# # Prepare the data: Aggregate total loans and list grants by county
# county_data = df.groupby('County').agg({
#     'Funding Amount Excluding Loans': 'sum',
#     'Project Name': lambda x: '<br>'.join(x.dropna().astype(str))  # Ensure all values are strings
# }).reset_index()

# # Load Massachusetts counties GeoJSON file (replace with your actual file path)
# geojson_path = "path_to_massachusetts_counties_geojson.json"

# # Create the map centered on Massachusetts
# ma_map = folium.Map(location=[42.4072, -71.3824], zoom_start=8)

# # Add county data to the GeoJSON file for binding
# import json
# with open(geojson_path) as f:
#     counties = json.load()

# # Map county names in GeoJSON to the county data
# for feature in counties['features']:
#     county_name = feature['properties']['NAME']
#     matching_row = county_data[county_data['County'].str.lower() == county_name.lower()]
#     if not matching_row.empty:
#         feature['properties']['total_loans'] = matching_row['Funding Amount Excluding Loans'].values[0]
#         feature['properties']['grants'] = matching_row['Project Name'].values[0]

# # Add counties to the map with popups
# folium.GeoJson(
#     counties,
#     style_function=lambda x: {
#         'fillColor': 'blue',
#         'color': 'black',
#         'weight': 1,
#         'fillOpacity': 0.5
#     },
#     tooltip=GeoJsonTooltip(
#         fields=['NAME', 'total_loans'],
#         aliases=['County:', 'Total Loan Amount:'],
#         localize=True
#     ),
#     popup=lambda x: folium.Popup(f"Grants:<br>{x['properties']['grants']}", max_width=300)
# ).add_to(ma_map)

# # Save or display the map
# ma_map.save("massachusetts_grants_map.html")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Filter cities with a population between 50,000 and 300,000
city_stats_filtered = city_stats[(city_stats['Total_Population'] > 50000) & (city_stats['Total_Population'] <= 100000)]


# Select the relevant columns for clustering
data = city_stats_filtered[['Funding Amount Excluding Loans', 'Total_Population']]

# Standardize the data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Use the elbow method to find the optimal number of clusters
wcss = []
max_clusters = 10

for i in range(1, max_clusters + 1):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(data_scaled)
    wcss.append(kmeans.inertia_)

# Plot the elbow curve to determine the optimal number of clusters
plt.figure(figsize=(10, 6))
plt.plot(range(1, max_clusters + 1), wcss, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.title('Elbow Method for Optimal Number of Clusters (Population <= 100,000)')
plt.show()

# Set the optimal number of clusters based on the elbow plot
optimal_clusters = 3  # Adjust based on the elbow plot

# Perform K-means clustering with the optimal number of clusters
kmeans_optimal = KMeans(n_clusters=optimal_clusters, random_state=42)
city_stats_filtered['Cluster'] = kmeans_optimal.fit_predict(data_scaled)

# Plot the clusters based on Total Funding Amount and Total Population
plt.figure(figsize=(10, 6))
plt.scatter(city_stats_filtered['Funding Amount Excluding Loans'], city_stats_filtered['Total_Population'], c=city_stats_filtered['Cluster'], cmap='viridis', alpha=0.6)
plt.xlabel('Total Funding Amount (USD)')
plt.ylabel('Total Population')
plt.title(f'K-means Clustering of Cities by Funding and Population (Population <= 100,000, k={optimal_clusters})')
plt.colorbar(label='Cluster')
plt.show()