<a href="https://colab.research.google.com/github/florivz/DDM-Project-WS24-25/blob/main/analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import os
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from google.colab import userdata
from sklearn.cluster import KMeans
from matplotlib.patches import Patch
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import TruncatedSVD

In [2]:
token = userdata.get("DDM_KEY")

!git clone https://{token}@github.com/florivz/DDM-Project-WS24-25.git

Cloning into 'DDM-Project-WS24-25'...
remote: Enumerating objects: 17, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 17 (delta 2), reused 13 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (17/17), 28.44 MiB | 12.33 MiB/s, done.
Resolving deltas: 100% (2/2), done.


# Loading the Data

In [3]:
pkl_path = 'DDM-Project-WS24-25/pkl/'

In [4]:
gdp_data = pd.read_pickle(pkl_path + "gdp.pickle")
df_pattern = pd.read_pickle(pkl_path + "naics_pattern.pickle")
df_occupation_1 = pd.read_pickle(pkl_path + "naics_occupation_part1.pickle")
df_occupation_2 = pd.read_pickle(pkl_path + "naics_occupation_part2.pickle")
df_occupation_3 = pd.read_pickle(pkl_path + "naics_occupation_part3.pickle")
df_county = pd.read_pickle(pkl_path + "county.pickle")

# Transforming the data

In [5]:
# def add_zeros(code):
#     if len(code) == 3:
#         return '00' + code
#     elif len(code) == 4:
#         return '0' + code
#     elif len(code) == 1:
#         return '0000' + code
#     return code

In [6]:
# Filling 0 to get same length for all FIPS
# df_pattern['FIPS'] = df_pattern['FIPS'].astype(str)
# unique_lengths = df_pattern['FIPS'].apply(len).unique()
# unique_lengths

# df_pattern['FIPS'] = df_pattern['FIPS'].apply(add_zeros)

In [7]:
# Merging Occupation Datasets
df_occupation = pd.concat([df_occupation_1, df_occupation_2, df_occupation_3], ignore_index=True)
df_occupation.head(5)

Unnamed: 0,FIPS,State_GEOID,naics,NAICS_TITLE,emp_total_county_naics,OCC_CODE,OCC_TITLE,emp_occupation,state_name
0,12999,12,5613,Employment Services,1436559,49-9071,"Maintenance and Repair Workers, General",20639.514235,
1,6999,6,5613,Employment Services,729335,49-9071,"Maintenance and Repair Workers, General",9414.167765,
2,36999,36,5613,Employment Services,308333,49-9071,"Maintenance and Repair Workers, General",8332.850279,
3,6037,6,8111,Automotive Repair and Maintenance,25081,49-3023,Automotive Service Technicians and Mechanics,5913.423292,California
4,48999,48,5613,Employment Services,340926,49-9071,"Maintenance and Repair Workers, General",5770.378034,


In [8]:
# Keeping only the necessary columns

# GDP
df_gdp_filtered = gdp_data.drop(gdp_data[['TableName', 'LineCode', 'Unit', 'Region', '2017', '2018', '2019', '2020', '2021', 'GeoName']], axis=1)
df_gdp_filtered = df_gdp_filtered.rename(columns={'IndustryClassification': 'naics', '2022': 'current_gdp'})

df_gdp_filtered['FIPS'] = df_gdp_filtered['FIPS'].astype(str)
df_gdp_filtered = df_gdp_filtered[df_gdp_filtered['FIPS'].str.len() == 5] # Filtering for FIPS with 5 numbers --> need it on county level

# Naics Pattern
df_pattern_filtered = df_pattern.drop(columns=['emp_nf', 'qp1_nf', 'qp1', 'ap_nf',
       'n<5', 'n5_9', 'n10_19', 'n20_49', 'n50_99', 'n100_249', 'n250_499',
       'n500_999', 'n1000', 'n1000_1', 'n1000_2', 'n1000_3', 'n1000_4', 'est', 'naics_2', 'State_GEOID',	'County_GEOID'])
df_pattern_filtered = df_pattern_filtered.rename(columns={'DESCRIPTION': 'naics_describtion'})

df_pattern_filtered['FIPS'] = df_pattern_filtered['FIPS'].astype(str)
df_pattern_filtered = df_pattern_filtered[df_pattern_filtered['FIPS'].str.len() == 5] # Filtering for FIPS with 5 numbers

# Naics Occupation
df_occupation_filtered = df_occupation.drop(columns=['State_GEOID', 'NAICS_TITLE', 'emp_total_county_naics', 'state_name'])

df_occupation_filtered['FIPS'] = df_occupation_filtered['FIPS'].astype(str)
df_occupation_filtered = df_occupation_filtered[df_occupation_filtered['FIPS'].str.len() == 5] # Filtering for FIPS with 5 numbers

In [9]:
# Define the relevant NAICS code prefixes as strings
relevant_naics_prefixes = ['21', '23', '31', '32', '33']  # Sectors for Mining, Construction, Manufacturing

# Convert relevant prefixes to a tuple for startswith
relevant_naics_prefixes = tuple(relevant_naics_prefixes)

# Filter df_gdp_filtered
df_gdp_filtered = df_gdp_filtered[
    df_gdp_filtered['naics'].astype(str).str.startswith(relevant_naics_prefixes)
]

# Filter df_pattern_filtered
df_pattern_filtered = df_pattern_filtered[
    df_pattern_filtered['naics'].astype(str).str.startswith(relevant_naics_prefixes)
]

# Filter df_occupation_filtered
df_occupation_filtered = df_occupation_filtered[
    df_occupation_filtered['naics'].astype(str).str.startswith(relevant_naics_prefixes)
]


# Data Preprocessing

In [10]:
# Merge df_gdp_filtered and df_pattern_filtered on FIPS and naics
df_merged = df_gdp_filtered.merge(
    df_pattern_filtered,
    on=['FIPS', 'naics'],
    how='left',
    suffixes=('_gdp', '_pattern')
)

# Merge the result with df_occupation_filtered on FIPS and naics
df_merged = df_merged.merge(
    df_occupation_filtered,
    on=['FIPS', 'naics'],
    how='left',
    suffixes=('', '_occupation')
)


In [11]:
df_merged.head()

Unnamed: 0,FIPS,naics,Description,current_gdp,naics_describtion,emp,ap,OCC_CODE,OCC_TITLE,emp_occupation
0,10000,21,"Mining, quarrying, and oil and gas extraction",6809.0,,,,,,
1,10000,23,Construction,2523911.0,,,,,,
2,10000,31-33,Manufacturing,5239870.0,,,,,,
3,10000,"321,327-339",Durable goods manufacturing,1436297.0,,,,,,
4,10000,"311-316,322-326",Nondurable goods manufacturing,3809665.0,,,,,,
