<a href="https://colab.research.google.com/github/ferflorespr/best_post_graduation_cities/blob/main/best_city.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# File: visualization_api.ipynb
# NOTE(review): the Colab badge at the top of this notebook links to
# best_city.ipynb — confirm which filename is current.

# Core Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Optional: Interactive Visuals
import plotly.express as px

# Suppress Warnings (optional)
# Called with only the 'ignore' action and no category/module filter, so every
# warning raised afterwards in this kernel session is hidden.
import warnings
warnings.filterwarnings('ignore')

# Confirmation that the setup cell ran to completion.
print("Visualization API Notebook Initialized")


Visualization API Notebook Initialized


In [9]:
# Function to load data (supports CSV for now)
def load_data(filepath):
    """
    Load a dataset from a CSV file.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; handed directly to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        The loaded data. When reading fails for any reason, the error is
        printed (not raised) and ``None`` is returned, so callers should
        check the result before using it.
    """
    try:
        # Read the file the caller asked for rather than a fixed filename.
        data = pd.read_csv(filepath)
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error loading data: {e}")
        return None
    print(f"Loaded dataset with shape {data.shape}")
    return data


In [10]:
# Function for Quick Summary
def summarize_data(df):
    """
    Show a quick overview of a DataFrame.

    Renders the first rows via ``display``, then prints the ``describe()``
    statistics and the per-column count of null values, each under its own
    heading. Returns nothing.
    """
    # Rich preview of the leading rows.
    display(df.head())

    # Each section is a heading plus a callable producing the table to print;
    # the table is computed only after its heading has been printed.
    sections = (
        ("\nData Summary:", df.describe),
        ("\nMissing Values:", lambda: df.isnull().sum()),
    )
    for heading, build_table in sections:
        print(heading)
        print(build_table())


In [11]:
# Scatter Plot with Plotly
def scatter_plot(df, x_col, y_col, color_col=None, title="Scatter Plot"):
    """
    Build a Plotly Express scatter plot and show it immediately.

    ``x_col`` and ``y_col`` name the columns for the two axes, ``color_col``
    (optional) names the column used for colouring, and ``title`` is the
    figure title. The figure is displayed, not returned.
    """
    plot_options = {
        "x": x_col,
        "y": y_col,
        "color": color_col,
        "title": title,
    }
    px.scatter(df, **plot_options).show()


In [18]:
# Example: load the dataset, summarize it, and plot adjusted salary against cost of living
data = load_data('SofwareDeveloperIncomeExpensesperUSACity.csv')
summarize_data(data)

scatter_plot(
    data,
    title='Adjusted Salary vs Cost of Living (by City)',
    x_col='Cost of Living avg',
    y_col='Mean Software Developer Salary (adjusted)',
    color_col='City',
)

# Salary-to-cost-of-living ratio: adjusted salary divided by the average cost of living.
# The ratio is stored as a new column on `data` itself.
data['Salary_to_COL'] = data['Mean Software Developer Salary (adjusted)'].div(data['Cost of Living avg'])

# Rank the cities by that ratio, best first, keeping only the columns of interest
top_cities = (
    data[['City', 'Salary_to_COL', 'Mean Software Developer Salary (adjusted)', 'Cost of Living avg']]
    .sort_values(by='Salary_to_COL', ascending=False)
)

# Show the ten cities with the highest ratio
top_cities.head(10)



Loaded dataset with shape (77, 12)


Unnamed: 0.1,Unnamed: 0,Metro,Mean Software Developer Salary (adjusted),Mean Software Developer Salary (unadjusted),Mean Unadjusted Salary (all occupations),Number of Software Developer Jobs,Median Home Price,City,Cost of Living avg,Rent avg,Cost of Living Plus Rent avg,Local Purchasing Power avg
0,0,"Columbus, OH",117552.0,108500.0,51260.0,13430.0,192000.0,"Columbus, OH",984.8,1421.5,2856.5,9335.4
1,1,"Seattle-Tacoma-Bellevue, WA",117323.0,131167.0,65400.0,65760.0,491600.0,"Seattle, WA",1250.7,2528.2,4091.5,8971.3
2,2,"Charlotte-Concord-Gastonia, NC-SC",114122.0,107046.0,51000.0,12800.0,208500.0,"Charlotte, NC",989.9,1974.5,3221.1,8939.8
3,3,"Colorado Springs, CO",112118.0,111670.0,51430.0,5780.0,296500.0,"Colorado Springs, CO",1049.2,1594.0,3094.5,8493.1
4,4,"Dayton, OH",111616.0,99338.0,50100.0,4240.0,124100.0,"Dayton, OH",961.2,1072.1,2586.0,4887.7



Data Summary:
       Unnamed: 0  Mean Software Developer Salary (adjusted)  \
count   77.000000                                  77.000000   
mean    38.000000                              100832.974026   
std     22.371857                                8064.298677   
min      0.000000                               72811.000000   
25%     19.000000                               95308.000000   
50%     38.000000                              101256.000000   
75%     57.000000                              107170.000000   
max     76.000000                              117552.000000   

       Mean Software Developer Salary (unadjusted)  \
count                                    77.000000   
mean                                 101866.207792   
std                                   13926.636588   
min                                   80314.000000   
25%                                   91976.000000   
50%                                  100614.000000   
75%                           

Unnamed: 0,City,Salary_to_COL,Mean Software Developer Salary (adjusted),Cost of Living avg
8,"San Antonio, TX",125.407667,110898.0,884.3
28,"Wichita, KS",124.552553,103690.0,832.5
13,"Cincinnati, OH",119.82084,109013.0,909.8
0,"Columbus, OH",119.366369,117552.0,984.8
16,"Houston, TX",119.1853,107672.0,903.4
22,"Lexington, KY",118.389201,105248.0,889.0
19,"Tucson, AZ",117.885821,107170.0,909.1
4,"Dayton, OH",116.121515,111616.0,961.2
14,"Austin, TX",115.996169,108990.0,939.6
20,"Salt Lake City, UT",115.823254,106291.0,917.7




In [22]:
# prompt: using original csv file, lets generate a ranking of the Number of Software Developer Jobs in each city, showing in the right column Number of Software Developer Jobs in each city

# Total the job counts per city, order them from most to fewest jobs, and
# expose the totals under a descriptive column name.
job_ranking = (
    data.groupby('City')['Number of Software Developer Jobs']
    .sum()
    .sort_values(ascending=False)
    .reset_index(name='Number of Software Developer Jobs in each city')
)

print("\nRanking of Cities by Number of Software Developer Jobs:")
job_ranking


Ranking of Cities by Number of Software Developer Jobs:


Unnamed: 0,City,Number of Software Developer Jobs in each city
0,"Jersey City, NJ",98650.0
1,"New York, NY",98650.0
2,"Santa Clara, CA",78730.0
3,"San Jose, CA",78730.0
4,"Seattle, WA",65760.0
...,...,...
72,"Eugene, OR",1220.0
73,"Wichita, KS",1220.0
74,"Fort Collins, CO",1190.0
75,"Lexington, KY",1140.0


In [27]:
# prompt: Using dataframe job_ranking: now lets take those top 15 and put them on a map, try different library

import plotly.express as px

# Keep the 15 cities with the most software developer jobs
top_cities = job_ranking.nlargest(n=15, columns='Number of Software Developer Jobs in each city')

# 'City' holds "<name>, <state>"; break it into separate name and state columns
top_cities[['City_Name', 'State']] = top_cities['City'].str.split(pat=', ', expand=True)

# Bubble map of the USA, one marker per row, sized by job count.
# NOTE(review): markers are located by the 'State' column, so cities sharing a
# state are presumably drawn at the same spot — confirm this is acceptable.
fig = px.scatter_geo(
    data_frame=top_cities,
    locations="State",
    locationmode="USA-states",
    scope="usa",
    size="Number of Software Developer Jobs in each city",
    hover_name="City",
    title="Top 15 Cities by Number of Software Developer Jobs",
)

# Render the map
fig.show()

In [36]:
import plotly.express as px
import pandas as pd

# Inputs expected to exist already in the kernel:
#   - `top_cities`: the top-15 job ranking built in an earlier cell
#   - `rent_df`: per-state rent data with 'State' and 'Avg Rent ($)' columns
# NOTE(review): no cell visible in this notebook creates `rent_df`; it must be
# loaded before this cell is run — confirm where it comes from.

# --- 1. Attach rent data to each top city via its state ---
top_cities[['City_Name', 'State']] = top_cities['City'].str.split(pat=', ', expand=True)
merged_df = top_cities.merge(rent_df, on='State', how='left')

# --- 2. Base layer: states shaded by average rent ---
fig = px.choropleth(
    locations=rent_df['State'],
    color=rent_df['Avg Rent ($)'],
    locationmode="USA-states",
    scope="usa",
    color_continuous_scale="Reds",
    labels={'color': 'Avg Rent ($)'},
    title="Top 15 Cities by Dev Jobs + Rent Heatmap",
)

# --- 3. Overlay: one blue marker per city, placed by state ---
# Marker size is the job count divided by 2000.
fig.add_scattergeo(
    locations=merged_df['State'],
    locationmode="USA-states",
    text=merged_df['City'],
    hovertext=merged_df['City'],
    marker={
        'size': merged_df['Number of Software Developer Jobs in each city'] / 2000,
        'color': 'blue',
        'line_width': 0.5,
    },
    name="Cities",
)

fig.show()
