In [3]:
%pip install faiss-cpu





In [4]:
import pandas as pd
import numpy as np
import time
from sklearn.metrics.pairwise import cosine_similarity

# For sentence transformers
from sentence_transformers import SentenceTransformer

# For BERT
from transformers import BertTokenizer, BertModel
import torch

# For FAISS
import faiss

# for TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer

## Load Data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv("/content/drive/MyDrive/Email_AI/emails.csv")

# Extract folder and user from file path
df["folder"] = (
    df["message"]
    .str.extract(r"(?m)^X-Folder:\s*(.*)$")[0]
)
df["user"] = df["file"].apply(lambda x: x.split("/")[0] + "@enron.com" if "/" in x else "")
df["text"] = df["message"].fillna("")

# Prepare folder documents
folder_docs = (
    df.groupby("folder")["text"]
    .apply(lambda x: " ".join(x))
    .reset_index()
)

print(f"Total emails: {len(df)}")
print(f"Total folders: {len(folder_docs)}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Total emails: 517401
Total folders: 5335


In [None]:
# print(df["folder"])
print(folder_docs.head())

                                              folder  \
0             \ALEWIS (Non-Privileged)\Deleted Items   
1                     \ALEWIS (Non-Privileged)\Inbox   
2  \ALEWIS (Non-Privileged)\Lewis, Andrew H.\Dele...   
3    \ALEWIS (Non-Privileged)\Lewis, Andrew H.\Inbox   
4  \ALEWIS (Non-Privileged)\Lewis, Andrew H.\Sent...   

                                                text  
0  Message-ID: <30174750.1075851524275.JavaMail.e...  
1  Message-ID: <28768771.1075851537835.JavaMail.e...  
2  Message-ID: <21494743.1075861159379.JavaMail.e...  
3  Message-ID: <30678212.1075861168464.JavaMail.e...  
4  Message-ID: <13015965.1075861168269.JavaMail.e...  


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Filter rare classes
folder_counts = df['folder'].value_counts()
valid_folders = folder_counts[folder_counts >= 20].index
df = df[df['folder'].isin(valid_folders)]

from sklearn.model_selection import train_test_split

# Step 1: Take 60% of data
X_temp, X_unused, y_temp, y_unused = train_test_split(
    df['text'],
    df['folder'],
    test_size=0.4,   # 40% unused
    random_state=42,
    stratify=df['folder']  # recommended for multiclass
)

# Step 2: Split remaining 60% → 40% train, 20% test

X_train, X_test, y_train, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=1/3,
    random_state=42
)

vectorizer = TfidfVectorizer(
    max_features=5000,
    min_df=5,
    max_df=0.95
)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

model = LogisticRegression(max_iter=500)

model.fit(X_train_tfidf, y_train)

preds = model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, preds))


# Baseline : Classification Approch

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

# ---------------------------------------------------
# 1️⃣ Model Functions
# ---------------------------------------------------

def logistic_regression_model():
    return Pipeline([
        ('tfidf', TfidfVectorizer(
            stop_words='english',
            max_features=15000,
            ngram_range=(1,2),
            min_df=2
        )),
        ('clf', LogisticRegression(
            multi_class='multinomial',
            solver='lbfgs',
            max_iter=500
        ))
    ])


def linear_svc_model():
    return Pipeline([
        ('tfidf', TfidfVectorizer(
            stop_words='english',
            max_features=15000,
            ngram_range=(1,2),
            min_df=2
        )),
        ('clf', LinearSVC())
    ])


def naive_bayes_model():
    return Pipeline([
        ('tfidf', TfidfVectorizer(
            stop_words='english',
            max_features=10000,
            ngram_range=(1,2),
            min_df=2
        )),
        ('clf', MultinomialNB())
    ])


# ---------------------------------------------------
# 2️⃣ Model Dictionary
# ---------------------------------------------------

models = {
    "Logistic_Regression": logistic_regression_model(),
    "Linear_SVC": linear_svc_model(),
    "Naive_Bayes": naive_bayes_model()
}


# ---------------------------------------------------
# 3️⃣ Training + Evaluation Function
# ---------------------------------------------------

def train_and_evaluate_models(df, text_col='text', target_col='folder'):

    df = df.dropna(subset=[text_col, target_col])

    X = df[text_col]
    y = df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42
    )

    results = {}

    for name, model in models.items():

        print(f"\nTraining {name}...")

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        results[name] = {
            "model": model,
            "accuracy": accuracy_score(y_test, y_pred),
            "f1_macro": f1_score(y_test, y_pred, average='macro'),
            "f1_weighted": f1_score(y_test, y_pred, average='weighted')
        }

    return results


# ---------------------------------------------------
# 4️⃣ Run
# ---------------------------------------------------

results = train_and_evaluate_models(df)

# View Results
for model_name, metrics in results.items():
    print(f"\n{model_name}")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"F1 Macro: {metrics['f1_macro']:.4f}")
    print(f"F1 Weighted: {metrics['f1_weighted']:.4f}")


In [None]:
# validation data from sample emails
validation_emails = df.sample(10, random_state=42)

In [7]:
# read cleaned_csv
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv("/content/drive/MyDrive/Email_AI/emails_clean.csv")

Mounted at /content/drive


In [8]:
# Drop Outliers
# Drop outliers in body length and subject length
df = df[df['body_length'] < 10000]
df = df[df['subject_length'] < 1000]

# Drop outliers in no:of emails filed in each folder
folder_counts = df['folder'].value_counts()
valid_folders = folder_counts[folder_counts >= 3].index
df = df[df['folder'].isin(valid_folders)]
df.shape

(504121, 22)

Use BERT baseline model and FAISS to predict 3 locations. For testing and evaluvation if 1 among the location is true location mark it as accurate

In [9]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import faiss
from sklearn.model_selection import train_test_split

# --- 1️⃣ Prepare data ---
# df should have columns: 'subject', 'body', and 'folder'

# Combine text columns into a single string for embedding
df['text'] = df['subject'].fillna('') + ' ' + df['body'].fillna('')

# Filter valid classes (with ≥5 samples)
counts = df['folder'].value_counts()
valid_classes = counts[counts >= 5].index
df = df[df['folder'].isin(valid_classes)]

# --- 2️⃣ Split into train/test ---
train_df, test_df = train_test_split(
    df, test_size=0.2, stratify=df['folder'], random_state=42
)

# --- 3️⃣ Generate BERT embeddings ---
# Using SentenceTransformers (MiniLM = lightweight BERT baseline)
model = SentenceTransformer('all-MiniLM-L6-v2')

train_embeddings = model.encode(
    train_df['text'].tolist(),
    convert_to_numpy=True,
    show_progress_bar=True
)
test_embeddings = model.encode(
    test_df['text'].tolist(),
    convert_to_numpy=True,
    show_progress_bar=True
)

# --- 4️⃣ Build FAISS index ---
d = train_embeddings.shape[1]
index = faiss.IndexFlatL2(d)  # L2 distance; use IndexFlatIP for cosine similarity
index.add(train_embeddings)

train_labels = np.array(train_df['folder'].tolist())

# --- 5️⃣ Perform top-3 search ---
k = 3
distances, indices = index.search(test_embeddings, k)

# --- 6️⃣ Evaluate Top-3 Accuracy ---
predicted_top3 = train_labels[indices]
true_labels = np.array(test_df['folder'].tolist())

correct = sum(true in preds for true, preds in zip(true_labels, predicted_top3))
top3_accuracy = correct / len(true_labels)

print(f"✅ Top-3 Accuracy: {top3_accuracy:.3f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]



special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/12555 [00:00<?, ?it/s]

Batches:   0%|          | 0/3139 [00:00<?, ?it/s]

✅ Top-3 Accuracy: 0.262


Add more numerical features.

Encode the training set (text + numerical features).

Build a FAISS index on those training vectors.

Encode the test set (the unseen emails).

Query the index with the test vectors to find the nearest training samples (their neighbors).

Evaluate whether the true label of each test email is among the top-k retrieved labels.

So even though the index is built on training data,
the evaluation uses the test data — that’s the whole point of the query step.

Each test email is never part of the index; it’s only used to search and check retrieval accuracy.

In [10]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import faiss
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# --- 1️⃣ Prepare data ---
# df should have columns: 'subject', 'body', 'folder', and numerical cols_selected

cols_selected = ['year', 'month', 'body_length', 'subject_length', 'is_replied', 'hour', 'day']

# Combine subject and body for text embedding
df['text'] = df['subject'].fillna('') + ' ' + df['body'].fillna('')

# Filter valid classes (>=5 samples)
counts = df['folder'].value_counts()
valid_classes = counts[counts >= 5].index
df = df[df['folder'].isin(valid_classes)]

# --- 2️⃣ Split into train/test ---
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['folder'], random_state=42)

# --- 3️⃣ Generate BERT embeddings ---
model = SentenceTransformer('all-MiniLM-L6-v2')
train_text_emb = model.encode(train_df['text'].tolist(), convert_to_numpy=True, show_progress_bar=True)
test_text_emb = model.encode(test_df['text'].tolist(), convert_to_numpy=True, show_progress_bar=True)

# --- 4️⃣ Scale numerical features ---
scaler = StandardScaler()
train_num = scaler.fit_transform(train_df[cols_selected])
test_num = scaler.transform(test_df[cols_selected])

# --- 5️⃣ Concatenate BERT + numerical features ---
train_combined = np.hstack([train_text_emb, train_num])
test_combined = np.hstack([test_text_emb, test_num])

# --- 6️⃣ Build FAISS index ---
d = train_combined.shape[1]
index = faiss.IndexFlatL2(d)
index.add(train_combined)

train_labels = np.array(train_df['folder'].tolist())

# --- 7️⃣ Top-3 search ---
k = 3
distances, indices = index.search(test_combined, k)

# --- 8️⃣ Evaluate Top-3 Accuracy ---
predicted_top3 = train_labels[indices]
true_labels = np.array(test_df['folder'].tolist())

correct = sum(true in preds for true, preds in zip(true_labels, predicted_top3))
top3_accuracy = correct / len(true_labels)

print(f"✅ Hybrid BERT + Numerical Features Top-3 Accuracy: {top3_accuracy:.3f}")


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Batches:   0%|          | 0/12555 [00:00<?, ?it/s]

Batches:   0%|          | 0/3139 [00:00<?, ?it/s]

✅ Hybrid BERT + Numerical Features Top-3 Accuracy: 0.220


LightGBM is one of the best next steps after FAISS, since it can capture non-linear relationships between your BERT embeddings and numerical metadata efficiently

In [11]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import lightgbm as lgb

# --- 1️⃣ Prepare data ---
cols_selected = ['year', 'month', 'body_length', 'subject_length',
                 'is_replied', 'hour', 'day']

# Combine subject and body text
df['text'] = df['subject'].fillna('') + ' ' + df['body'].fillna('')

# Filter valid classes (>=5 samples)
counts = df['folder'].value_counts()
valid_classes = counts[counts >= 5].index
df = df[df['folder'].isin(valid_classes)]

# Encode target labels
label_encoder = LabelEncoder()
df['folder_encoded'] = label_encoder.fit_transform(df['folder'])

# --- 2️⃣ Train/Test split ---
train_df, test_df = train_test_split(
    df, test_size=0.2, stratify=df['folder_encoded'], random_state=42
)

# --- 3️⃣ Generate BERT embeddings ---
model = SentenceTransformer('all-MiniLM-L6-v2')

train_text_emb = model.encode(train_df['text'].tolist(), convert_to_numpy=True, show_progress_bar=True)
test_text_emb  = model.encode(test_df['text'].tolist(),  convert_to_numpy=True, show_progress_bar=True)

# --- 4️⃣ Add scaled numerical features ---
scaler = StandardScaler()
train_num = scaler.fit_transform(train_df[cols_selected])
test_num  = scaler.transform(test_df[cols_selected])

# Combine embeddings + numerical features
train_X = np.hstack([train_text_emb, train_num])
test_X  = np.hstack([test_text_emb,  test_num])
train_y = train_df['folder_encoded']
test_y  = test_df['folder_encoded']

# --- 5️⃣ Train LightGBM multiclass model ---
num_classes = len(label_encoder.classes_)
params = {
    'objective': 'multiclass',
    'num_class': num_classes,
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 64,
    'max_depth': -1,
    'verbose': -1
}

train_data = lgb.Dataset(train_X, label=train_y)
test_data  = lgb.Dataset(test_X, label=test_y, reference=train_data)

model_lgb = lgb.train(params, train_data, valid_sets=[test_data],
                      num_boost_round=200, early_stopping_rounds=20)

# --- 6️⃣ Evaluate Top-3 Accuracy ---
y_pred_prob = model_lgb.predict(test_X)        # shape: (n_samples, n_classes)
top3_preds = np.argsort(y_pred_prob, axis=1)[:, -3:]  # get top-3 class indices

# Compute Top-3 accuracy
correct = sum(true in preds for true, preds in zip(test_y, top3_preds))
top3_acc = correct / len(test_y)
print(f"✅ LightGBM + BERT + Numerical Top-3 Accuracy: {top3_acc:.3f}")


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Batches:   0%|          | 0/12555 [00:00<?, ?it/s]

Batches:   0%|          | 0/3139 [00:00<?, ?it/s]

TypeError: train() got an unexpected keyword argument 'early_stopping_rounds'

Attempt Reinforcement Learning

In [12]:
# RL_policy_email_top3.py
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# ---------- Hyperparams ----------
EMBED_MODEL = 'all-MiniLM-L6-v2'    # frozen BERT-style encoder
cols_selected = ['year', 'month', 'body_length', 'subject_length', 'is_replied', 'hour', 'day']
K = 3                               # top-k actions sampled during training & evaluated
BATCH_SIZE = 64
EPOCHS = 10
LR = 1e-3
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# ---------------------------------

# --- 0. Prepare df as before (assume df exists with subject, body, folder, numeric cols) ---
df = df.copy()
df['text'] = df['subject'].fillna('') + ' ' + df['body'].fillna('')

# Filter classes with >=5 examples
counts = df['folder'].value_counts()
valid_classes = counts[counts >= 5].index
df = df[df['folder'].isin(valid_classes)].reset_index(drop=True)

# encode labels
le = LabelEncoder()
df['label'] = le.fit_transform(df['folder'])
num_classes = len(le.classes_)

# standard train/test split (we'll use test for evaluation only)
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

# --- 1. Obtain (or load) embeddings (frozen) ---
embed_model = SentenceTransformer(EMBED_MODEL)
train_texts = train_df['text'].tolist()
test_texts  = test_df['text'].tolist()

train_emb = embed_model.encode(train_texts, convert_to_numpy=True, show_progress_bar=True)
test_emb  = embed_model.encode(test_texts,  convert_to_numpy=True, show_progress_bar=True)

# --- 2. Scale numeric features and concatenate ---
scaler = StandardScaler()
train_num = scaler.fit_transform(train_df[cols_selected])
test_num  = scaler.transform(test_df[cols_selected])

X_train = np.hstack([train_emb, train_num]).astype(np.float32)
X_test  = np.hstack([test_emb, test_num]).astype(np.float32)
y_train = train_df['label'].values
y_test  = test_df['label'].values

# Convert to torch tensors for training loop
train_X_t = torch.from_numpy(X_train).to(DEVICE)
train_y_t = torch.from_numpy(y_train).to(DEVICE)
test_X_t  = torch.from_numpy(X_test).to(DEVICE)
test_y_t  = torch.from_numpy(y_test).to(DEVICE)

# --- 3. Policy network (simple MLP) ---
class PolicyNet(nn.Module):
    def __init__(self, input_dim, num_actions, hidden=512):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden//2),
            nn.ReLU(),
            nn.Linear(hidden//2, num_actions)
        )
    def forward(self, x):
        logits = self.net(x)                 # shape: (batch, num_actions)
        probs = torch.softmax(logits, dim=-1)
        return probs

policy = PolicyNet(input_dim=X_train.shape[1], num_actions=num_classes).to(DEVICE)
optimizer = optim.Adam(policy.parameters(), lr=LR)

# --- 4. Training loop with REINFORCE for top-K sampling w/o replacement ---
def sample_k_without_replacement(probs, k):
    """
    probs: tensor of shape (num_actions,) - must sum to 1
    returns: list of selected action indices, and sum of log_probs for those selections
    We implement sequential sampling: at each step, sample an action according to current probs,
    then mask it out and renormalize for the next pick.
    """
    probs = probs.clone()
    selected = []
    logp_sum = 0.0
    for _ in range(k):
        # Re-normalize to be safe
        probs = probs / probs.sum()
        dist = torch.distributions.Categorical(probs)
        a = dist.sample()
        selected.append(a.item())
        logp_sum = logp_sum + dist.log_prob(a)
        # mask chosen action
        probs[a] = 0.0
    return selected, logp_sum

# moving-average baseline for variance reduction
baseline = 0.0
baseline_alpha = 0.01

num_train = train_X_t.size(0)
indices = np.arange(num_train)

for epoch in range(1, EPOCHS+1):
    np.random.shuffle(indices)
    total_loss = 0.0
    total_reward = 0.0
    policy.train()
    for i in range(0, num_train, BATCH_SIZE):
        batch_idx = indices[i:i+BATCH_SIZE]
        batch_X = train_X_t[batch_idx]         # (B, D)
        batch_y = train_y_t[batch_idx]         # (B,)
        batch_size = batch_X.size(0)

        optimizer.zero_grad()
        batch_probs = policy(batch_X)          # (B, num_actions)
        batch_loss = 0.0
        batch_reward = 0.0

        # For each sample in batch, sample K actions and compute REINFORCE loss
        for b in range(batch_size):
            probs = batch_probs[b]             # (num_actions,)
            true_label = int(batch_y[b].item())

            selected, logp_sum = sample_k_without_replacement(probs, K)
            reward = 1.0 if true_label in selected else 0.0

            # advantage = reward - baseline
            adv = reward - baseline

            # REINFORCE loss (negative expected reward)
            # note: use -logp_sum * adv
            sample_loss = - logp_sum * adv
            batch_loss = batch_loss + sample_loss
            batch_reward = batch_reward + reward

        # average over batch
        batch_loss = batch_loss / batch_size
        batch_loss.backward()
        optimizer.step()

        # update baseline (moving average of reward)
        avg_reward_batch = (batch_reward / batch_size).item()
        baseline = (1 - baseline_alpha) * baseline + baseline_alpha * avg_reward_batch

        total_loss += batch_loss.item() * batch_size
        total_reward += batch_reward

    avg_loss = total_loss / num_train
    avg_reward = total_reward.item() / num_train
    print(f"Epoch {epoch} | loss: {avg_loss:.4f} | train reward (top-{K}): {avg_reward:.4f} | baseline: {baseline:.4f}")

    # optional: evaluate on test after each epoch
    policy.eval()
    with torch.no_grad():
        probs_test = policy(test_X_t)                  # (N_test, num_actions)
        # deterministic top-K prediction (argmax top-K)
        topk = torch.topk(probs_test, K, dim=1).indices.cpu().numpy()   # shape: (N_test, K)
        y_true = test_y_t.cpu().numpy()
        correct = sum(y_true[i] in topk[i] for i in range(len(y_true)))
        topk_acc = correct / len(y_true)
        print(f"  → Eval deterministic Top-{K} accuracy: {topk_acc:.4f}")

# --- final deterministic evaluation ---
policy.eval()
with torch.no_grad():
    probs_test = policy(test_X_t)
    topk = torch.topk(probs_test, K, dim=1).indices.cpu().numpy()
    y_true = test_y_t.cpu().numpy()
    topk_acc = sum(y_true[i] in topk[i] for i in range(len(y_true))) / len(y_true)
print(f"Final deterministic Top-{K} accuracy (policy): {topk_acc:.4f}")


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Batches:   0%|          | 0/12555 [00:00<?, ?it/s]

Batches:   0%|          | 0/3139 [00:00<?, ?it/s]

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [3416]], which is output 0 of torch::autograd::CopySlices, is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).