In [1]:
# import necessary libraries
import census
import us
from typing import Dict, List
import pandas as pd
from requests.exceptions import ConnectionError, Timeout



In [6]:
# user input for api key
api_key = input("Enter your API key: ")

# Define the API key
c = census.Census(api_key)

In [7]:
def get_census_data(c, series_code: str, dataset: str = 'acs1', geo_level: str = 'state') -> pd.DataFrame:
    """
    Gets census data for all available years between 2010-2024.
    
    Parameters:
    -----------
    c : Census client object
    series_code : str
        The census series code to fetch
    dataset : str
        Census dataset to query (e.g., 'acs1', 'acs5', 'sf1', etc.)
    geo_level : str
        Geographic level for data ('state' or 'county')
        
    Returns:
    --------
    pd.DataFrame with columns:
        - id: numeric state FIPS or combined state+county FIPS
        - value: the requested census value
        - year: year of observation
    """
    data_rows = []
    
    for year in range(2010, 2024):
        try:
            census_dataset = getattr(c, dataset)
            
            if geo_level == 'state':
                data = census_dataset.get(series_code, {'for': 'state:*'}, year=year)
                for row in data:
                    data_rows.append({
                        'id': int(row['state']),
                        'value': row[series_code],
                        'year': year
                    })
            else:  # county level
                data = census_dataset.get(
                    series_code,
                    {'for': 'county:*', 'in': 'state:*'},
                    year=year
                )
                for row in data:
                    # Create combined FIPS code: state (2 digits) + county (3 digits)
                    fips = int(str(row['state']).zfill(2) + str(row['county']).zfill(3))
                    data_rows.append({
                        'id': fips,
                        'value': row[series_code],
                        'year': year
                    })
                
        except Exception as e:
            print(f"Failed to fetch {year} data from {dataset}: {str(e)}")
            
    df = pd.DataFrame(data_rows)
    df['value'] = pd.to_numeric(df['value'])
    return df

In [8]:
population_state = get_census_data(c, 'B01001_001E', dataset='acs5', geo_level='state')
population_state

Unnamed: 0,id,value,year
0,1,4712651.0,2010
1,2,691189.0,2010
2,4,6246816.0,2010
3,5,2872684.0,2010
4,6,36637290.0,2010
...,...,...,...
723,53,7740984.0,2023
724,54,1784462.0,2023
725,55,5892023.0,2023
726,56,579761.0,2023


In [13]:
unemployment_state = c.acs5.state(
    fields=(
        'NAME',
        'B23025_005E',
        'B23025_002E'
        ),
    state_fips='*',
    year=2022
)

# Create a DataFrame from the API response
df = pd.DataFrame(unemployment_state)
df.columns = ['state', 'unemployed', 'labor_force', 'id']
df

Unnamed: 0,state,unemployed,labor_force,id
0,Alabama,120030.0,2345086.0,1
1,Alaska,23035.0,383078.0,2
2,Arizona,186058.0,3490030.0,4
3,Arkansas,71601.0,1397075.0,5
4,California,1282055.0,20168662.0,6
5,Colorado,143483.0,3205413.0,8
6,Connecticut,115050.0,1955341.0,9
7,Delaware,27315.0,505728.0,10
8,District of Columbia,28264.0,400930.0,11
9,Florida,531896.0,10629693.0,12


In [14]:
df['unemp_rate'] = df['unemployed'] / df['labor_force']

df

Unnamed: 0,state,unemployed,labor_force,id,unemp_rate
0,Alabama,120030.0,2345086.0,1,0.051184
1,Alaska,23035.0,383078.0,2,0.060131
2,Arizona,186058.0,3490030.0,4,0.053311
3,Arkansas,71601.0,1397075.0,5,0.051251
4,California,1282055.0,20168662.0,6,0.063567
5,Colorado,143483.0,3205413.0,8,0.044763
6,Connecticut,115050.0,1955341.0,9,0.058839
7,Delaware,27315.0,505728.0,10,0.054011
8,District of Columbia,28264.0,400930.0,11,0.070496
9,Florida,531896.0,10629693.0,12,0.050039


In [17]:
# make object integer
df['id'] = df['id'].astype(int)

In [18]:
df.to_csv('data/us_unemployment_rate.csv', index=False)

## NYC Languages

In [2]:
import requests

# Define the API endpoint
url = "https://data.cityofnewyork.us/resource/ajin-gkbp.json"

# Fetch the data
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    data = response.json()
    # Convert the data to a DataFrame
    nyc_languages_df = pd.DataFrame(data)
    print(nyc_languages_df.head())
else:
    print(f"Failed to fetch data: {response.status_code}")

  acs_5_year_data_time_period    borough borough_cd_code  \
0                   2015-2019  Manhattan             101   
1                   2015-2019  Manhattan             101   
2                   2015-2019  Manhattan             101   
3                   2015-2019  Manhattan             101   
4                   2015-2019  Manhattan             101   

      community_district_name          language lep_population_estimate  \
0  Battery Park City, Tribeca         Afrikaans                       0   
1  Battery Park City, Tribeca  Akan (incl. Twi)                       0   
2  Battery Park City, Tribeca          Albanian                       7   
3  Battery Park City, Tribeca   Aleut languages                       0   
4  Battery Park City, Tribeca           Amharic                       0   

  of_lep_population cvalep_population_estimate of_cvalep_population  
0                 0                          0                    0  
1                 0                          0  

In [4]:
# turn response into a dataframe
nyc_languages_df = pd.DataFrame(data)

nyc_languages_df

Unnamed: 0,acs_5_year_data_time_period,borough,borough_cd_code,community_district_name,language,lep_population_estimate,of_lep_population,cvalep_population_estimate,of_cvalep_population
0,2015-2019,Manhattan,101,"Battery Park City, Tribeca",Afrikaans,0,0,0,0
1,2015-2019,Manhattan,101,"Battery Park City, Tribeca",Akan (incl. Twi),0,0,0,0
2,2015-2019,Manhattan,101,"Battery Park City, Tribeca",Albanian,7,0.2,0,0
3,2015-2019,Manhattan,101,"Battery Park City, Tribeca",Aleut languages,0,0,0,0
4,2015-2019,Manhattan,101,"Battery Park City, Tribeca",Amharic,0,0,0,0
...,...,...,...,...,...,...,...,...,...
995,2015-2019,Manhattan,108,Upper East Side,Hebrew,156,1.7,57,1.4
996,2015-2019,Manhattan,108,Upper East Side,Hindi,34,0.4,0,0
997,2015-2019,Manhattan,108,Upper East Side,Hmong,0,0,0,0
998,2015-2019,Manhattan,108,Upper East Side,Hungarian,17,0.2,17,0.4


In [9]:
nyc_languages_df = pd.read_csv('data/nyc_languages.csv')

nyc_languages_df

Unnamed: 0,American Community Survey (ACS) Data Time Period,Borough,Borough Community District Code,Community District Name,Language,LEP Population (Estimate),% of LEP Population,CVALEP Population (Estimate),% of CVALEP Population
0,2015-2019,Manhattan,101,"Battery Park City, Tribeca",Afrikaans,0,0.0,0,0.0
1,2015-2019,Manhattan,101,"Battery Park City, Tribeca",Akan (incl. Twi),0,0.0,0,0.0
2,2015-2019,Manhattan,101,"Battery Park City, Tribeca",Albanian,7,0.2,0,0.0
3,2015-2019,Manhattan,101,"Battery Park City, Tribeca",Aleut languages,0,0.0,0,0.0
4,2015-2019,Manhattan,101,"Battery Park City, Tribeca",Amharic,0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...
8019,2015-2019,Staten Island,503,"Tottenville, Woodrow, Great Kills",Vietnamese,0,0.0,0,0.0
8020,2015-2019,Staten Island,503,"Tottenville, Woodrow, Great Kills",Wolof,0,0.0,0,0.0
8021,2015-2019,Staten Island,503,"Tottenville, Woodrow, Great Kills",Yiddish,0,0.0,0,0.0
8022,2015-2019,Staten Island,503,"Tottenville, Woodrow, Great Kills",Yoruba,0,0.0,0,0.0


In [10]:
nyc_languages_df.columns

Index(['American Community Survey (ACS) Data Time Period', 'Borough',
       'Borough Community District Code', 'Community District Name',
       'Language', 'LEP Population (Estimate)', '% of LEP Population',
       'CVALEP Population (Estimate)', '% of CVALEP Population'],
      dtype='object')

In [11]:
nyc_languages_df = nyc_languages_df[['Borough', 'Borough Community District Code', 'Community District Name', 'Language', '% of LEP Population']]

nyc_languages_df.columns = ['borough', 'id', 'name', 'language', 'pct']

nyc_languages_df

Unnamed: 0,borough,id,name,language,pct
0,Manhattan,101,"Battery Park City, Tribeca",Afrikaans,0.0
1,Manhattan,101,"Battery Park City, Tribeca",Akan (incl. Twi),0.0
2,Manhattan,101,"Battery Park City, Tribeca",Albanian,0.2
3,Manhattan,101,"Battery Park City, Tribeca",Aleut languages,0.0
4,Manhattan,101,"Battery Park City, Tribeca",Amharic,0.0
...,...,...,...,...,...
8019,Staten Island,503,"Tottenville, Woodrow, Great Kills",Vietnamese,0.0
8020,Staten Island,503,"Tottenville, Woodrow, Great Kills",Wolof,0.0
8021,Staten Island,503,"Tottenville, Woodrow, Great Kills",Yiddish,0.0
8022,Staten Island,503,"Tottenville, Woodrow, Great Kills",Yoruba,0.0


In [15]:
# divide the percentage by 100
nyc_languages_df['pct'] = nyc_languages_df['pct'] / 100

nyc_languages_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nyc_languages_df['pct'] = nyc_languages_df['pct'] / 100


Unnamed: 0,borough,id,name,language,pct
0,Manhattan,101,"Battery Park City, Tribeca",Afrikaans,0.000
1,Manhattan,101,"Battery Park City, Tribeca",Akan (incl. Twi),0.000
2,Manhattan,101,"Battery Park City, Tribeca",Albanian,0.002
3,Manhattan,101,"Battery Park City, Tribeca",Aleut languages,0.000
4,Manhattan,101,"Battery Park City, Tribeca",Amharic,0.000
...,...,...,...,...,...
8019,Staten Island,503,"Tottenville, Woodrow, Great Kills",Vietnamese,0.000
8020,Staten Island,503,"Tottenville, Woodrow, Great Kills",Wolof,0.000
8021,Staten Island,503,"Tottenville, Woodrow, Great Kills",Yiddish,0.000
8022,Staten Island,503,"Tottenville, Woodrow, Great Kills",Yoruba,0.000


In [20]:
# Group by 'id' and find the third highest 'pct' value for each group
third_highest_pct = nyc_languages_df.groupby('id')['pct'].nlargest(3).groupby(level=0).nth(2).reset_index()

# add the 'borough', 'name', and 'language' columns
third_highest_pct = third_highest_pct.merge(nyc_languages_df, on=['id', 'pct'], how='inner')

# drop duplicates to ensure only one language is included in case of a tie
third_highest_pct = third_highest_pct.drop_duplicates(subset=['id'])

third_highest_pct

Unnamed: 0,id,pct,borough,name,language
0,101,0.049,Manhattan,"Battery Park City, Tribeca",Russian
1,102,0.049,Manhattan,"Greenwich Village, Soho",Russian
2,103,0.009,Manhattan,"Lower East Side, Chinatown",Korean
4,104,0.08,Manhattan,"Chelsea, Clinton",Japanese
5,105,0.08,Manhattan,Midtown Business District,Japanese
6,106,0.156,Manhattan,"Stuyvesant Town, Turtle Bay",Korean
7,107,0.044,Manhattan,"West Side, Upper West Side",Russian
8,108,0.103,Manhattan,Upper East Side,Japanese
9,109,0.014,Manhattan,"Manhattanville, Hamilton Heights",French
10,110,0.078,Manhattan,Central Harlem,"Chinese (incl. Mandarin, Cantonese)"


In [19]:
third_highest_pct['language'].value_counts()

Chinese (incl. Mandarin, Cantonese)    10
Bengali                                 8
Russian                                 6
French                                  6
Arabic                                  5
Spanish                                 5
Japanese                                4
Akan (incl. Twi)                        3
Korean                                  3
Haitian                                 3
Polish                                  2
Hebrew                                  1
Punjabi                                 1
Albanian                                1
Yiddish                                 1
Min Nan Chinese                         1
Italian                                 1
Name: language, dtype: int64

In [29]:
third_highest_pct.id.nunique()

59

In [27]:
third_highest_pct['language'] = third_highest_pct['language'].str.split('(').str[0].str.strip()

third_highest_pct

Unnamed: 0,id,pct,borough,name,language
0,101,0.049,Manhattan,"Battery Park City, Tribeca",Russian
1,102,0.049,Manhattan,"Greenwich Village, Soho",Russian
2,103,0.009,Manhattan,"Lower East Side, Chinatown",Korean
4,104,0.08,Manhattan,"Chelsea, Clinton",Japanese
5,105,0.08,Manhattan,Midtown Business District,Japanese
6,106,0.156,Manhattan,"Stuyvesant Town, Turtle Bay",Korean
7,107,0.044,Manhattan,"West Side, Upper West Side",Russian
8,108,0.103,Manhattan,Upper East Side,Japanese
9,109,0.014,Manhattan,"Manhattanville, Hamilton Heights",French
10,110,0.078,Manhattan,Central Harlem,Chinese


In [22]:
third_highest_pct.dtypes

id            int64
pct         float64
borough      object
name         object
language     object
dtype: object

In [28]:
# save to csv
third_highest_pct.to_csv('data/nyc_languages.csv', index=False)