In [11]:
import pandas as pd
import numpy as np
import pandas as pd



In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

In [13]:
from tqdm import tqdm

In [14]:
%run preprocessing.ipynb


In [15]:
data = master_df

data.shape

(734160, 107)

In [16]:
df = pd.DataFrame(data)

# Convert month_id to datetime
def convert_month_id_to_datetime(month_id):
    base_year = 1990
    base_month = 1  # January is month 1
    year = (month_id - 121) // 12 + base_year
    month = (month_id - 121) % 12 + base_month
    return pd.to_datetime({'year': year, 'month': month, 'day': np.ones_like(year)})

df['date'] = convert_month_id_to_datetime(df['month_id'])


In [17]:
from sklearn.preprocessing import StandardScaler

# Assuming continuous features need scaling
features_to_scale = ['ged_sb', 'ged_os', 'ged_ns']
scaler = StandardScaler()
df[features_to_scale] = scaler.fit_transform(df[features_to_scale])



In [18]:
def batch_feature_importance(df, target_col='ged_sb', batch_size=20):
    # Temporarily remove the date column if it's not needed for the analysis
    if 'date' in df.columns:
        date_series = df['date'].copy()
        df.drop('date', axis=1, inplace=True)

    # Define the target and prepare the DataFrame by selecting only numeric columns
    y = df[target_col].astype(int)  # Ensure the target variable is suitable for classification
    X = df.select_dtypes(include=[np.number])  # Exclude non-numeric columns to avoid errors

    imputer = SimpleImputer(strategy='median')
    scaler = StandardScaler()

    feature_importances = pd.DataFrame()
    feature_importances['Feature'] = X.columns
    feature_importances['Importance'] = 0

    # Calculate feature importance in batches
    n_features = X.shape[1]
    for start in tqdm(range(0, n_features, batch_size)):
        end = start + batch_size
        # Select a batch of features
        batch_features = X.columns[start:end]
        X_batch = X[batch_features]

        # Handle missing values and scale the data
        X_batch_imputed = imputer.fit_transform(X_batch)
        X_batch_scaled = scaler.fit_transform(X_batch_imputed)

        # Fit the RandomForest classifier
        clf = RandomForestClassifier(random_state=42)
        clf.fit(X_batch_scaled, y)

        # Store the importances for this batch
        feature_importances.loc[start:end-1, 'Importance'] += clf.feature_importances_

    # Normalize the importance values to sum to 1
    feature_importances['Importance'] /= feature_importances['Importance'].sum()

    # Sort features by their importance
    feature_importances = feature_importances.sort_values(by='Importance', ascending=False).reset_index(drop=True)


    if 'date' not in df.columns:
        df['date'] = date_series

    return feature_importances

In [19]:

important_features = batch_feature_importance(df, 'ged_sb', 20)
print(important_features.head(50))  



100%|██████████| 6/6 [08:40<00:00, 86.83s/it] 

                       Feature  Importance
0                       ged_sb    0.112779
1               decay_ged_ns_1    0.097708
2      sptime_dist_k001_ged_ns    0.052643
3      sptime_dist_k001_ged_os    0.030391
4       sptime_dist_k10_ged_ns    0.028549
5               splag_1_1_sb_1    0.024663
6       splag_1_decay_ged_sb_1    0.024447
7        sptime_dist_k1_ged_ns    0.024203
8           wdi_nv_agr_totl_kd    0.023487
9                    ged_gte_1    0.020078
10  ged_sb_decay_12_time_since    0.019090
11             spei_48_detrend    0.018597
12      sptime_dist_k10_ged_os    0.017343
13           spei1_gsm_detrend    0.016198
14       mov_sum_6_ged_best_sb    0.015805
15       mov_avg_6_ged_best_sb    0.015770
16     sptime_dist_k001_ged_sb    0.015466
17                treelag_1_ns    0.015324
18                treelag_2_os    0.015320
19                treelag_1_os    0.015261
20                treelag_2_ns    0.015132
21       sptime_dist_k1_ged_os    0.013537
22       sp




In [31]:
top_50_features = important_features.head(50)['Feature'].tolist()

In [32]:
columns_needed = top_50_features + ['date', 'priogrid_gid']
df_filtered = df[columns_needed]

# Group by 'priogrid_gid' and sort by 'date' within each group
grouped = df_filtered.sort_values(by='date').groupby('priogrid_gid')

# Create a dictionary with each group's data, dropping 'priogrid_gid' since it's used as the key
transformer_input = {gid: grp.drop(columns=['priogrid_gid']).values for gid, grp in grouped}


In [33]:
# Print the shapes of each array in the transformer_input dictionary
for gid, array in transformer_input.items():
    print(f'Shape of array for priogrid_gid {gid}: {array.shape}')


Shape of array for priogrid_gid 62356: (56, 51)
Shape of array for priogrid_gid 79599: (56, 51)
Shape of array for priogrid_gid 79600: (56, 51)
Shape of array for priogrid_gid 79601: (56, 51)
Shape of array for priogrid_gid 80317: (56, 51)
Shape of array for priogrid_gid 80318: (56, 51)
Shape of array for priogrid_gid 80319: (56, 51)
Shape of array for priogrid_gid 80320: (56, 51)
Shape of array for priogrid_gid 80321: (56, 51)
Shape of array for priogrid_gid 80322: (56, 51)
Shape of array for priogrid_gid 80323: (56, 51)
Shape of array for priogrid_gid 80324: (56, 51)
Shape of array for priogrid_gid 80325: (56, 51)
Shape of array for priogrid_gid 80326: (56, 51)
Shape of array for priogrid_gid 80327: (56, 51)
Shape of array for priogrid_gid 80328: (56, 51)
Shape of array for priogrid_gid 80329: (56, 51)
Shape of array for priogrid_gid 80330: (56, 51)
Shape of array for priogrid_gid 80331: (56, 51)
Shape of array for priogrid_gid 80332: (56, 51)
Shape of array for priogrid_gid 81037: (

In [34]:
# printing length of transformer_input
len(transformer_input)

13110

Encoder block

In [41]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [43]:
# Assuming 'df_filtered' and 'top_50_features' are correctly set up as before
df_filtered = df[columns_needed]

# Correctly creating the transformer_input
grouped = df_filtered.sort_values(by='date').groupby('priogrid_gid')
transformer_input = {gid: grp.values for gid, grp in grouped}


In [45]:
# Assuming grp[:, -2] gives us the date column already as DatetimeIndex or similar
date_encodings = {gid: pd.to_datetime(grp[:, -2]).astype(int) / 10**9 for gid, grp in transformer_input.items()}  

In [46]:
# To inspect first group's datetime handling:
first_gid, first_grp = next(iter(transformer_input.items()))
print(pd.to_datetime(first_grp[:, -2]))  # Check what this outputs to verify correct handling


DatetimeIndex(['2018-01-01', '2018-02-01', '2018-03-01', '2018-04-01',
               '2018-05-01', '2018-06-01', '2018-07-01', '2018-08-01',
               '2018-09-01', '2018-10-01', '2018-11-01', '2018-12-01',
               '2019-01-01', '2019-02-01', '2019-03-01', '2019-04-01',
               '2019-05-01', '2019-06-01', '2019-07-01', '2019-08-01',
               '2019-09-01', '2019-10-01', '2019-11-01', '2019-12-01',
               '2020-01-01', '2020-01-01', '2020-02-01', '2020-02-01',
               '2020-03-01', '2020-03-01', '2020-04-01', '2020-04-01',
               '2020-05-01', '2020-05-01', '2020-06-01', '2020-06-01',
               '2020-07-01', '2020-07-01', '2020-08-01', '2020-08-01',
               '2020-09-01', '2020-09-01', '2020-10-01', '2020-10-01',
               '2020-11-01', '2020-12-01', '2021-01-01', '2021-02-01',
               '2021-03-01', '2021-04-01', '2021-05-01', '2021-06-01',
               '2021-07-01', '2021-08-01', '2021-09-01', '2021-10-01'],
     

In [47]:
class TransformerDataset(Dataset):
    def __init__(self, data, date_encodings):
        self.data = data
        self.date_encodings = date_encodings
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        gid = list(self.data.keys())[idx]
        features = np.array(self.data[gid][:, :-2], dtype=np.float32)  # Exclude the last two columns if last is target and second last is date
        target = np.array(self.data[gid][:, -1], dtype=np.float32)
        dates = self.date_encodings[gid]
        return torch.tensor(features), torch.tensor(dates), torch.tensor(target)



In [48]:
# Proper DataLoader initialization
dataset = TransformerDataset(transformer_input, date_encodings)

train_set, val_set = train_test_split(range(len(dataset)), test_size=0.2, random_state=42)
train_loader = DataLoader([dataset[i] for i in train_set], batch_size=32, shuffle=True)
val_loader = DataLoader([dataset[i] for i in val_set], batch_size=32, shuffle=False)

In [49]:
# Define the transformer model
class TemporalTransformerEncoder(nn.Module):
    def __init__(self, input_size, d_model, nhead, num_layers, dim_feedforward=2048, dropout=0.1):
        super(TemporalTransformerEncoder, self).__init__()
        self.input_embedding = nn.Linear(input_size, d_model)
        self.positional_encoder = nn.Embedding(int(np.max([np.max(dates) for dates in date_encodings.values()])) + 1, d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        self.output_layer = nn.Linear(d_model, 1)
    
    def forward(self, x, dates):
        x = self.input_embedding(x)
        dates = dates.long()
        pos_encoding = self.positional_encoder(dates)
        x += pos_encoding
        x = self.transformer_encoder(x)
        output = self.output_layer(x)
        return output.squeeze(-1)

In [50]:
# Initialize the model
model = TemporalTransformerEncoder(input_size=len(top_50_features), d_model=128, nhead=8, num_layers=2)
criterion = nn.MSELoss()
optimizer = optim.RMSprop(model.parameters(), lr=0.001)

: 

In [None]:
# Training loop
for epoch in tqdm(range(100)):  # Number of epochs
    model.train()
    for features, dates, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(features.squeeze(0), dates.squeeze(0))
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    
    # Validation loop
    model.eval()
    val_losses = []
    with torch.no_grad():
        for features, dates, targets in val_loader:
            outputs = model(features.squeeze(0), dates.squeeze(0))
            val_loss = criterion(outputs, targets)
            val_losses.append(val_loss.item())
    val_loss_avg = np.mean(val_losses)
    print(f'Epoch {epoch+1}: Train Loss: {loss.item():.4f}, Val Loss: {val_loss_avg:.4f}')
