In [11]:
import pandas as pd
import numpy as np
import pandas as pd



In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

In [13]:
from tqdm import tqdm

In [14]:
%run preprocessing.ipynb


In [15]:
data = master_df

data.shape

(734160, 107)

In [16]:
df = pd.DataFrame(data)

# Convert month_id to datetime
def convert_month_id_to_datetime(month_id):
    base_year = 1990
    base_month = 1  # January is month 1
    year = (month_id - 121) // 12 + base_year
    month = (month_id - 121) % 12 + base_month
    return pd.to_datetime({'year': year, 'month': month, 'day': np.ones_like(year)})

df['date'] = convert_month_id_to_datetime(df['month_id'])


In [17]:
from sklearn.preprocessing import StandardScaler

# Assuming continuous features need scaling
features_to_scale = ['ged_sb', 'ged_os', 'ged_ns']
scaler = StandardScaler()
df[features_to_scale] = scaler.fit_transform(df[features_to_scale])



In [18]:
def batch_feature_importance(df, target_col='ged_sb', batch_size=20):
    # Temporarily remove the date column if it's not needed for the analysis
    if 'date' in df.columns:
        date_series = df['date'].copy()
        df.drop('date', axis=1, inplace=True)

    # Define the target and prepare the DataFrame by selecting only numeric columns
    y = df[target_col].astype(int)  # Ensure the target variable is suitable for classification
    X = df.select_dtypes(include=[np.number])  # Exclude non-numeric columns to avoid errors

    imputer = SimpleImputer(strategy='median')
    scaler = StandardScaler()

    feature_importances = pd.DataFrame()
    feature_importances['Feature'] = X.columns
    feature_importances['Importance'] = 0

    # Calculate feature importance in batches
    n_features = X.shape[1]
    for start in tqdm(range(0, n_features, batch_size)):
        end = start + batch_size
        # Select a batch of features
        batch_features = X.columns[start:end]
        X_batch = X[batch_features]

        # Handle missing values and scale the data
        X_batch_imputed = imputer.fit_transform(X_batch)
        X_batch_scaled = scaler.fit_transform(X_batch_imputed)

        # Fit the RandomForest classifier
        clf = RandomForestClassifier(random_state=42)
        clf.fit(X_batch_scaled, y)

        # Store the importances for this batch
        feature_importances.loc[start:end-1, 'Importance'] += clf.feature_importances_

    # Normalize the importance values to sum to 1
    feature_importances['Importance'] /= feature_importances['Importance'].sum()

    # Sort features by their importance
    feature_importances = feature_importances.sort_values(by='Importance', ascending=False).reset_index(drop=True)


    if 'date' not in df.columns:
        df['date'] = date_series

    return feature_importances

In [19]:

important_features = batch_feature_importance(df, 'ged_sb', 20)
print(important_features.head(50))  



100%|██████████| 6/6 [08:40<00:00, 86.83s/it] 

                       Feature  Importance
0                       ged_sb    0.112779
1               decay_ged_ns_1    0.097708
2      sptime_dist_k001_ged_ns    0.052643
3      sptime_dist_k001_ged_os    0.030391
4       sptime_dist_k10_ged_ns    0.028549
5               splag_1_1_sb_1    0.024663
6       splag_1_decay_ged_sb_1    0.024447
7        sptime_dist_k1_ged_ns    0.024203
8           wdi_nv_agr_totl_kd    0.023487
9                    ged_gte_1    0.020078
10  ged_sb_decay_12_time_since    0.019090
11             spei_48_detrend    0.018597
12      sptime_dist_k10_ged_os    0.017343
13           spei1_gsm_detrend    0.016198
14       mov_sum_6_ged_best_sb    0.015805
15       mov_avg_6_ged_best_sb    0.015770
16     sptime_dist_k001_ged_sb    0.015466
17                treelag_1_ns    0.015324
18                treelag_2_os    0.015320
19                treelag_1_os    0.015261
20                treelag_2_ns    0.015132
21       sptime_dist_k1_ged_os    0.013537
22       sp




In [20]:
top_50_features = important_features.head(50)

In [24]:
# Assuming the DataFrame includes a 'date' and 'priogrid_gid' for sorting and grouping
# Filter the DataFrame to keep only the top 50 features plus the target
df_filtered = df[top_50_features]


grouped = df_filtered.sort_values(by='date').groupby('priogrid_gid')
transformer_input = {gid: grp.drop(columns=['priogrid_gid']).values for gid, grp in grouped}



In [25]:
# Print the shapes of each array in the transformer_input dictionary
for gid, array in transformer_input.items():
    print(f'Shape of array for priogrid_gid {gid}: {array.shape}')
