In [2]:
import os
import pandas as pd


base_dir = "./pgm"


def load_year(year_dir):
    """Read every ``.parquet`` file directly inside ``year_dir`` into one frame.

    Parameters
    ----------
    year_dir : str
        Path of a single ``year=YYYY/`` partition directory.

    Returns
    -------
    pandas.DataFrame or None
        The row-wise concatenation (fresh 0..n-1 index) of all parquet files
        in the directory, or ``None`` when the path is not a directory or
        contains no ``.parquet`` files.
    """
    # isdir (rather than exists) so a stray regular file with the partition
    # name is skipped instead of crashing os.listdir.
    if not os.path.isdir(year_dir):
        return None

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the combined frame (and any seeded split built on top of it)
    # reproducible across machines and runs.
    parquet_names = sorted(
        name for name in os.listdir(year_dir) if name.endswith(".parquet")
    )
    if not parquet_names:
        return None

    frames = [pd.read_parquet(os.path.join(year_dir, name)) for name in parquet_names]
    return pd.concat(frames, ignore_index=True)


# Map of year -> combined DataFrame; years without any parquet data are absent.
dataframes = {}

for year in range(1990, 2022):
    year_frame = load_year(os.path.join(base_dir, f"year={year}/"))
    if year_frame is not None:
        dataframes[year] = year_frame



In [6]:
# Spot-check the frame stored for 1990, the first year in the loading range.
# Raises KeyError if no parquet files were found for that year, because only
# years with data get an entry in `dataframes`.
dataframes[1990]

Unnamed: 0,month_id,priogrid_gid,ged_sb,ged_os,ged_ns,ln_pop_gpw_sum,decay_ged_sb_1,decay_ged_sb_25,decay_ged_os_1,decay_ged_sb_5,...,treelag_1_os,treelag_2_ns,treelag_2_os,sptime_dist_k1_ged_os,sptime_dist_k1_ged_ns,sptime_dist_k10_ged_os,sptime_dist_k10_ged_ns,sptime_dist_k001_ged_os,sptime_dist_k001_ged_ns,month
0,121,62356,0.0,0.0,0.0,0.000000,0.031250,0.031250,0.031250,0.000977,...,1.458139,0.020930,0.018720,17.471405,17.670597,18.384776,17.670597,17.442766,17.240986,1
1,121,79599,0.0,0.0,0.0,8.266445,0.031250,0.031250,0.031250,0.000977,...,2.808539,0.069700,0.291442,1.414214,11.180340,1.414214,11.704700,0.708237,10.440383,1
2,121,79600,0.0,0.0,0.0,7.805237,0.031250,0.031250,0.031250,0.000977,...,2.794594,0.074908,0.276007,1.802776,10.735455,1.802776,11.236103,1.118749,9.962510,1
3,121,79601,0.0,0.0,0.0,9.335159,0.031250,0.031250,0.031250,0.000977,...,2.657515,0.080730,0.180976,2.236068,10.295630,2.236068,10.770330,1.581645,9.486917,1
4,121,80317,0.0,0.0,0.0,12.654427,0.031250,0.031250,0.031250,0.000977,...,4.099017,0.062183,2.078886,0.500000,11.968709,0.500000,12.500000,0.500000,11.280585,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157315,132,190496,0.0,0.0,0.0,10.540696,0.022745,0.022745,0.022745,0.000517,...,4.333877,0.023384,0.064493,12.206556,11.726039,12.206556,32.070235,10.062708,11.313925,12
157316,132,190507,0.0,0.0,0.0,6.973800,0.022745,0.022745,0.022745,0.000517,...,4.338886,0.024076,0.069406,9.069179,9.233093,9.433981,31.245000,5.701587,8.381820,12
157317,132,190508,0.0,0.0,0.0,4.886065,0.022745,0.022745,0.022745,0.000517,...,4.335440,0.024107,0.069823,8.660254,9.137833,9.013878,31.216983,5.409076,8.246508,12
157318,132,190510,0.0,0.0,0.0,8.030275,0.022745,0.022745,0.022745,0.000517,...,4.326135,0.024154,0.070621,7.874008,9.027735,8.200610,31.184932,4.925251,8.062562,12


In [3]:
# Stack the per-year frames (in the dict's insertion order) into one table
# with a fresh 0..n-1 index.
df = pd.concat([dataframes[year] for year in dataframes], ignore_index=True)

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.impute import SimpleImputer

# Features are every column except the target column `ged_sb`.
# NOTE: `X` stays un-imputed; the imputed data lives in X_train / X_test.
X = df.drop(columns=['ged_sb'])
y = df['ged_sb']

# Split BEFORE imputing so the column means are learned from the training rows
# only; fitting the imputer on all rows leaks held-out statistics into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# SimpleImputer(strategy='mean') discards columns with no observed values by
# default, which would make the output narrower than the column list and break
# the DataFrame reconstruction below. Drop such columns explicitly instead.
usable_columns = X_train_raw.columns[X_train_raw.notna().any()]

imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(
    imputer.fit_transform(X_train_raw[usable_columns]),
    columns=usable_columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    imputer.transform(X_test_raw[usable_columns]),
    columns=usable_columns,
    index=X_test_raw.index,
)

# n_jobs=-1 builds the trees on all available cores; with a fixed random_state
# the fitted forest is the same as a single-threaded run, just faster.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Impurity-based importances, labelled by the columns the model was trained on.
feature_importances = pd.Series(rf_model.feature_importances_, index=X_train.columns)
top_15_features = feature_importances.nlargest(15)
print(top_15_features)



KeyboardInterrupt: 

In [None]:
# `plt` was used below without ever being imported, so this cell raised
# NameError on a fresh kernel. matplotlib is already required by seaborn.
import matplotlib.pyplot as plt
import seaborn as sns


# Horizontal bar chart: one bar per feature in `top_15_features`, with the
# bar length given by its importance score.
plt.figure(figsize=(10, 6))
sns.barplot(x=top_15_features.values, y=top_15_features.index)
plt.title('Top 15 Important Features')
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.show()
