In [2]:
from pathlib import Path

import pandas as pd
# Location of the raw growth-rate-by-industry CSV, relative to this notebook.
filepath = '../raw_data/dynamism_data/bds2022_gr_vcn3.csv'

# Read the whole file in one pass (low_memory=False) so pandas infers each
# column's dtype from all rows rather than chunk by chunk; the file is decoded
# as latin1.
df = pd.read_csv(
    filepath,
    low_memory=False,
    encoding='latin1',
)

In [4]:
# Quick sanity checks on the growth-rate bin column before it is used to flag
# high-growth rows: missing-value count, dtype, and a peek at the first values.
growth_bins = df['fempgr_gr']

print("Number of NaN values in fempgr_gr:", growth_bins.isna().sum())
print("Total rows:", len(df))

print("Data type of fempgr_gr:", growth_bins.dtype)
print("\nFirst 10 values of fempgr_gr:")
print(growth_bins.head(10))

print("\nAny non-NaN values?", growth_bins.notna().any())

Number of NaN values in fempgr_gr: 0
Total rows: 34830
Data type of fempgr_gr: object

First 10 values of fempgr_gr:
0    a) -2
1    a) -2
2    a) -2
3    a) -2
4    a) -2
5    a) -2
6    a) -2
7    a) -2
8    a) -2
9    a) -2
Name: fempgr_gr, dtype: object

Any non-NaN values? True


In [5]:
# List the distinct growth-rate bin labels, in order of first appearance, so the
# exact strings can be matched when flagging high-growth rows.
pd.unique(df['fempgr_gr'])

array(['a) -2', 'b) (-2 to -0.8]', 'c) (-0.8 to -0.2]',
       'd) (-0.2 to -0.01]', 'e) (-0.01 to 0.01)', 'f) [0.01 to 0.2)',
       'g) [0.2 to 0.8)', 'h) [0.8 to 2)', 'i) 2'], dtype=object)

In [7]:
# Share of employment in the two highest growth-rate bins, per year and
# 3-digit industry code (vcnaics3).

# Labels of the bins treated as "high growth": growth rates from 0.8 up to 2,
# and exactly 2.
# NOTE(review): 'i) 2' presumably corresponds to new entrants under a
# DHS-style growth rate — confirm they should count as high growth.
high_growth_labels = ['h) [0.8 to 2)', 'i) 2']

# Non-numeric employment entries become NaN and are skipped by the sums below.
df['emp'] = pd.to_numeric(df['emp'], errors='coerce')

# 1 for rows in a high-growth bin, 0 otherwise; multiplying by it keeps only
# the employment that belongs to high-growth rows.
df['is_high_growth'] = df['fempgr_gr'].isin(high_growth_labels).astype(int)
df['emp_high_growth'] = df['emp'] * df['is_high_growth']

# Total and high-growth employment for every (year, industry) pair.
emp_totals = (
    df.groupby(['year', 'vcnaics3'])[['emp_high_growth', 'emp']]
    .sum()
    .reset_index()
)

# Express high-growth employment as a percentage of total employment.
emp_totals['pct_high_growth_emp'] = (
    emp_totals['emp_high_growth'].div(emp_totals['emp']).mul(100)
)

# Keep only the keys and the percentage, ordered by year then industry.
hgfs_by_industry = (
    emp_totals.loc[:, ['year', 'vcnaics3', 'pct_high_growth_emp']]
    .sort_values(['year', 'vcnaics3'])
    .reset_index(drop=True)
)

print(hgfs_by_industry.head(10))
print(f"\nDataframe shape: {hgfs_by_industry.shape}")

   year  vcnaics3  pct_high_growth_emp
0  1978       113            17.914801
1  1978       114            34.516995
2  1978       115            20.457569
3  1978       211             7.053673
4  1978       212             5.547968
5  1978       213            12.482739
6  1978       221             0.485486
7  1978       236            25.748477
8  1978       237            14.338048
9  1978       238            20.385673

Dataframe shape: (3870, 3)


In [8]:
# Write the per-year, per-industry high-growth employment share to disk.
output_path = '../processed_data/hgfs_by_industry.csv'

# to_csv does not create missing parent directories, so make sure the target
# folder exists; otherwise a fresh checkout fails here on Restart & Run All.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index is just a positional counter and is not saved.
hgfs_by_industry.to_csv(output_path, index=False)

print(f"CSV file saved to: {output_path}")

CSV file saved to: ../processed_data/hgfs_by_industry.csv
