In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix, vstack, hstack

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import normalize, MaxAbsScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD

from collections import Counter

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

from imblearn.over_sampling import SMOTE, RandomOverSampler,SMOTENC
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN, SMOTETomek

from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMClassifier

from rank_bm25 import BM25Okapi

In [2]:
# Every file is read as JSON Lines (one record per line) into its own DataFrame.
train_d1, train_d2, test = (
    pd.read_json(path, lines=True)
    for path in (
        "domain1_train_data.json",
        "domain2_train_data.json",
        "test_data.json",
    )
)

In [3]:
# Add the same three descriptive columns to each frame:
#   text_length     - number of elements in `text`
#   unique_tokens   - number of distinct elements in `text`
#   token_diversity - unique_tokens / text_length
for frame in (train_d1, train_d2, test):
    frame['text_length'] = frame['text'].apply(len)
    frame['unique_tokens'] = frame['text'].apply(lambda x: len(set(x)))
    frame['token_diversity'] = frame['unique_tokens'] / frame['text_length']

# Domain indicator (0 for the domain-1 file, 1 for the domain-2 file); it is
# the target of the domain classifier further down.
train_d1["domain"] = 0
train_d2["domain"] = 1

In [4]:
# Width of the unigram part of every bag-of-words matrix.
# presumably 17119 is the largest token id in the data, hence the +1 - TODO confirm
vocab_size = 17119 + 1

# Stack both training domains and shuffle the rows once with a fixed seed.
merge = shuffle(
    pd.concat([train_d1, train_d2], ignore_index=True),
    random_state=42,
)

### Prepare the data

In [5]:
# create a BoW from token indices
def tokens_to_bow(texts, vocab_size):
    """Build a sparse count matrix from documents given as token-id sequences.

    Args:
        texts: sized collection of documents; each document is an iterable of
            integer token ids, which are used directly as column indices.
        vocab_size: number of columns of the returned matrix.

    Returns:
        csr_matrix of shape (len(texts), vocab_size) where entry (i, j) is the
        number of times token j occurs in document i.
    """
    row_ids, col_ids, values = [], [], []
    for doc_idx, doc in enumerate(texts):
        # Per-document occurrence count of every token id.
        per_token = Counter()
        for tok in doc:
            per_token[tok] += 1
        row_ids.extend([doc_idx] * len(per_token))
        col_ids.extend(per_token.keys())
        values.extend(per_token.values())
    return csr_matrix((values, (row_ids, col_ids)), shape=(len(texts), vocab_size))

In [6]:
def tokens_to_ngram_bow(texts, vocab_size, n_list=(1,2,3), min_freq=3):
    """Build a sparse bag-of-words matrix of unigrams plus frequent n-grams.

    Unigram counts always occupy columns [0, vocab_size), indexed by the raw
    token id. Every n-gram of an order n > 1 listed in ``n_list`` that occurs
    at least ``min_freq`` times over all of ``texts`` gets its own column,
    numbered from ``vocab_size`` upwards in order of first appearance.

    Args:
        texts: re-iterable, sized collection of documents (it is traversed
            twice); each document is a sliceable sequence of integer token ids.
        vocab_size: number of unigram columns.
        n_list: n-gram orders to use. Orders <= 1 are ignored because unigrams
            are always included.
        min_freq: minimum corpus-wide occurrence count for an n-gram to be kept.

    Returns:
        csr_matrix of shape (len(texts), vocab_size + number_of_kept_ngrams).
    """
    # Select the higher orders by value, not by position: the previous
    # `n_list[1:]` silently dropped the first requested order whenever
    # n_list did not start with 1 (e.g. n_list=(2, 3) lost all bigrams).
    higher_orders = [n for n in n_list if n > 1]

    # 1st pass: corpus-wide n-gram frequencies.
    freq_cnt = Counter()
    for tokens in texts:
        for n in higher_orders:
            for i in range(len(tokens) - n + 1):
                freq_cnt[tuple(tokens[i:i+n])] += 1

    # 2nd pass: build the sparse rows, assigning column ids to kept n-grams
    # lazily so ids follow first-appearance order.
    rows, cols, counts = [], [], []
    next_id = vocab_size
    ngram_id = {}      # {n-gram tuple: column id}
    for r, tokens in enumerate(texts):
        token_counts = Counter(tokens)       # unigram counts keyed by token id
        for n in higher_orders:
            for i in range(len(tokens) - n + 1):
                t = tuple(tokens[i:i+n])
                if freq_cnt[t] >= min_freq:
                    if t not in ngram_id:
                        ngram_id[t] = next_id
                        next_id += 1
                    token_counts[ngram_id[t]] += 1
        rows.extend([r] * len(token_counts))
        cols.extend(token_counts.keys())
        counts.extend(token_counts.values())

    return csr_matrix((counts, (rows, cols)), shape=(len(texts), next_id))

In [7]:
def build_numeric_features(text_series, token_freq, rare_th=5):
    """
    Compute per-document length and token-diversity features.

    Args:
        text_series: pandas Series whose values are token sequences.
        token_freq: not used by the returned features; kept so existing
            calls keep working.
        rare_th: not used by the returned features; kept so existing calls
            keep working.

    Return: float32 ndarray of shape (n_samples, 2)
            columns = [seq_len, token_diversity]
    """
    # The former mean-token-id and rare-token-ratio computations were never
    # part of the returned array, so they are no longer evaluated.

    # Sequence length
    seq_len = text_series.apply(len).values

    # Token diversity = unique tokens / length (0 for an empty document)
    token_div = text_series.apply(
        lambda ts: len(set(ts)) / len(ts) if len(ts) > 0 else 0
    ).values

    feats = np.vstack([seq_len, token_div]).T
    return feats.astype(np.float32)

In [8]:
# Token frequency percentiles and variance metrics
def build_frequency_features(text_series, token_freq):
    """
    Summarise, per document, the corpus frequencies of the tokens it contains.

    Args:
        text_series: iterable of documents; each document is a sequence
            (list or numpy array) of token ids, or None.
        token_freq: mapping token id -> frequency, indexed as token_freq[t].

    Returns:
        float32 ndarray of shape (n_samples, 6) with columns:
        [min_freq, 25th_pct, median_freq, 75th_pct, max_freq, freq_variance]
        Empty (or None) documents yield a row of zeros; an empty input yields
        an array of shape (0, 6).
    """
    frequency_features = []

    for tokens in text_series:
        # Use len() rather than truthiness so numpy-array documents work too
        # (`not array` raises for arrays with more than one element).
        if tokens is None or len(tokens) == 0:
            frequency_features.append([0, 0, 0, 0, 0, 0])
            continue

        # Frequency of every token occurrence in this document.
        freqs = [token_freq[t] for t in tokens]

        q1, median, q3 = np.percentile(freqs, [25, 50, 75])
        # Variance of a single value is reported as 0.
        variance = np.var(freqs) if len(freqs) > 1 else 0

        frequency_features.append([min(freqs), q1, median, q3, max(freqs), variance])

    # reshape keeps the documented (n_samples, 6) shape even with no rows.
    return np.array(frequency_features, dtype=np.float32).reshape(-1, 6)

In [9]:
# Token frequencies are counted on the training data only, to avoid data leakage.
token_freq = Counter()
for tokens in pd.concat([train_d1, train_d2], ignore_index=True)["text"]:
    token_freq.update(tokens)

# 0. Concatenate all texts (shuffled train rows first, then test rows)
all_texts = pd.concat([merge["text"], test["text"]], ignore_index=True)

# 1. Build the n-gram BoW for train and test in one call so both share columns.
# NOTE(review): tokens_to_ngram_bow applies its min_freq filter to counts over
# everything it is given, so test texts contribute to which n-grams are kept.
X_all = tokens_to_ngram_bow(all_texts, vocab_size, n_list=(1,2,3), min_freq=3)

# 2. Split back into train / test
X_bow   = X_all[:len(merge)]
test_bow = X_all[len(merge):]

# 3. Apply TF-IDF transformation (fitted on the train rows only).
# The TF-IDF matrices are not part of the final matrices built in step 7,
# where they are commented out.
tfidf_transformer = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X_bow)
X_test_tfidf = tfidf_transformer.transform(test_bow)

# 4. Seq length and token diversity features
num_feats_train = build_numeric_features(merge["text"], token_freq)   # ndarray
num_feats_test  = build_numeric_features(test["text"],  token_freq)

# 5. Token frequency percentiles and variance
# (also left out of the final matrices in step 7)
train_freq_feats = build_frequency_features(merge["text"], token_freq)
test_freq_feats = build_frequency_features(test["text"], token_freq)

# 6. Scale numerical features (scalers are fitted on train, applied to test)
num_scaler = MaxAbsScaler()
train_num_feats_scaled = num_scaler.fit_transform(num_feats_train)
test_num_feats_scaled = num_scaler.transform(num_feats_test)

freq_scaler = MaxAbsScaler()
train_freq_feats_scaled = freq_scaler.fit_transform(train_freq_feats)
test_freq_feats_scaled = freq_scaler.transform(test_freq_feats)

# 7. Concatenate all features for train and test separately
# Active features: raw n-gram counts + the scaled numeric features.
X_train_final = hstack([
    X_bow,
    # X_train_tfidf,
    train_num_feats_scaled,
    # train_freq_feats_scaled
]).tocsr()

X_test_final = hstack([
    test_bow,
    # X_test_tfidf,
    test_num_feats_scaled,
    # test_freq_feats_scaled
]).tocsr()

# Target of the domain classifier: the 0/1 domain indicator of each train row.
y = merge["domain"]

In [10]:
def predict_by_similarity(X_test, X_train, y_train, k=3):
    """Score each test row by the mean label of its k most similar train rows.

    Args:
        X_test: feature matrix of the rows to score.
        X_train: feature matrix of the reference rows.
        y_train: pandas Series of labels, positionally aligned with X_train.
        k: number of nearest reference rows to average over.

    Returns:
        1-D ndarray with one mean label per row of X_test.
    """
    # Pairwise cosine similarities, shape (n_test, n_train).
    sims = cosine_similarity(X_test, X_train)

    # argsort is ascending, so the k most similar rows are the trailing k columns.
    neighbour_idx = np.argsort(sims, axis=1)[:, -k:]

    scores = []
    for idx_row in neighbour_idx:
        scores.append(np.mean(y_train.iloc[idx_row]))
    return np.array(scores)

In [11]:
def predict_by_bm25_simple(X_test, X_train, y_train, k=3):
    """Score each test row by the mean label of its k best BM25 matches.

    Each row of a sparse matrix is turned into a "document" made of the
    column indices of its non-zero entries; stored values are ignored.

    Args:
        X_test: sparse matrix of the rows to score.
        X_train: sparse matrix of the reference rows (the BM25 corpus).
        y_train: pandas Series of labels, positionally aligned with X_train.
        k: number of top-scoring reference rows to average over.

    Returns:
        1-D ndarray with one mean label per row of X_test.
    """
    def _row_columns(matrix):
        # Column indices of the non-zero entries of every row.
        return [matrix[i].nonzero()[1] for i in range(matrix.shape[0])]

    train_docs = _row_columns(X_train)
    test_docs = _row_columns(X_test)

    bm25 = BM25Okapi(train_docs)

    scores_out = []
    for query in test_docs:
        # Ascending argsort: the k best-scoring corpus rows are the last k.
        ranking = np.argsort(bm25.get_scores(query))
        scores_out.append(np.mean(y_train.iloc[ranking[-k:]]))

    return np.array(scores_out)

In [12]:
def augment_tta(text_indices, swap_prob=0.1, del_prob=0.05, insert_prob=0.1):
    """
    Return a randomly perturbed copy of a 1-D array of token indices.

    Three perturbations are applied in order, all drawing from NumPy's global
    random state: adjacent swaps, element deletion, and re-insertion of
    already-present elements. The input array is not modified.

    Args:
        text_indices: numpy array of indices (must support boolean-mask indexing).
        swap_prob: per-position probability of swapping element i with i+1.
        del_prob: per-element probability of being dropped.
        insert_prob: fraction of the ORIGINAL length that determines how many
            insertions are attempted (int(insert_prob * n)).
    """
    tokens = text_indices.copy()
    original_len = len(tokens)

    # Nothing to perturb, and no random numbers are consumed.
    if original_len == 0:
        return tokens

    # Adjacent swaps, scanned left to right (no-op for a single element).
    for left in range(original_len - 1):
        if np.random.random() < swap_prob:
            held = tokens[left]
            tokens[left] = tokens[left + 1]
            tokens[left + 1] = held

    # Deletion: keep an element only when its draw exceeds del_prob.
    keep = np.random.random(original_len) > del_prob
    tokens = tokens[keep]

    # Insertion: duplicate a random surviving element at a random position.
    # Skipped entirely when deletion removed everything.
    remaining = int(insert_prob * original_len)
    while remaining > 0 and len(tokens) > 0:
        pos = np.random.randint(0, len(tokens) + 1)
        token = np.random.choice(tokens)
        tokens = np.insert(tokens, pos, token)
        remaining -= 1

    return tokens

def apply_tta(X_test, n_augmentations=5):
    """
    Produce several randomly perturbed versions of a sparse feature matrix.

    Each row is reduced to the column indices of its non-zero entries, passed
    through augment_tta (5% swap / deletion / insertion settings), and rebuilt
    as a sparse row.

    NOTE(review): the rebuilt matrices store 1 per index occurrence, so the
    original stored values (counts, or any scaled numeric columns the matrix
    contains) are not carried over; a repeated index from an insertion adds up
    to a larger value. Confirm this is the intended input for the model.

    Args:
        X_test: 2-D sparse matrix supporting row indexing and .nonzero().
        n_augmentations: number of perturbed matrices to generate.

    Returns:
        list of n_augmentations csr_matrix objects with the shape of X_test.
    """
    n_rows, n_cols = X_test.shape
    row_columns = [X_test[r].nonzero()[1] for r in range(n_rows)]

    augmented_test_matrices = []
    for _ in range(n_augmentations):
        # Perturb every row with conservative probabilities.
        perturbed = [
            augment_tta(columns, swap_prob=0.05, del_prob=0.05, insert_prob=0.05)
            for columns in row_columns
        ]

        # Rebuild the sparse matrix with one unit entry per surviving index.
        row_ids, col_ids = [], []
        for r, columns in enumerate(perturbed):
            row_ids.extend([r] * len(columns))
            col_ids.extend(columns)
        ones = [1] * len(col_ids)

        augmented_test_matrices.append(
            csr_matrix((ones, (row_ids, col_ids)), shape=(len(perturbed), n_cols))
        )

    return augmented_test_matrices

### Domain Classifier

In [13]:
## out of fold predictions
# The TTA helpers draw from NumPy's global RNG; seed it here so that this
# cell produces the same predictions every time it is run.
np.random.seed(42)

oof_preds = np.zeros(len(y), dtype=int)   # hard out-of-fold domain predictions
oof_probs = np.zeros(len(y))              # out-of-fold P(domain == 1)

test_preds = np.zeros(len(test))          # running sum of per-fold test probabilities

FOLDS = 20
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
for train_index, val_index in skf.split(X_train_final, y):
    X_train = X_train_final[train_index]
    X_val = X_train_final[val_index]

    y_train = y.iloc[train_index]
    y_val = y.iloc[val_index]

    # Oversample the minority class on the training fold only, so the
    # validation fold keeps its original class balance.
    smote = SMOTE(random_state=42, sampling_strategy=0.6)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Soft-voting ensemble of two linear models. Both are seeded: without
    # random_state SGDClassifier shuffles differently on every run.
    clf1 = LogisticRegression(max_iter=1000, C=0.1, solver='liblinear', random_state=42)
    clf2 = SGDClassifier(max_iter=8000, tol=1e-4, loss="modified_huber", alpha=0.1,
                         penalty='l2', random_state=42)
    eclf = VotingClassifier(estimators=[('lr', clf1), ('sgd', clf2)], voting='soft')

    eclf.fit(X_train_resampled, y_train_resampled)

    # VALIDATION: average the class-1 probability over 2 augmented copies
    X_val_aug_list = apply_tta(X_val, n_augmentations=2)

    val_probs = np.zeros(X_val.shape[0])
    for X_val_aug in X_val_aug_list:
        val_probs += eclf.predict_proba(X_val_aug)[:, 1]
    val_probs /= len(X_val_aug_list)

    oof_probs[val_index] = val_probs
    oof_preds[val_index] = (val_probs > 0.5).astype(int)

    # TEST: same procedure, again with 2 augmented copies
    X_test_aug_list = apply_tta(X_test_final, n_augmentations=2)

    test_probs = np.zeros(len(test))
    for X_test_aug in X_test_aug_list:
        test_probs += eclf.predict_proba(X_test_aug)[:, 1]
    test_probs /= len(X_test_aug_list)

    # accumulate this fold's test probabilities
    test_preds += test_probs

test_preds /= FOLDS # Average across folds

test_labels = (test_preds > 0.5).astype(int)
test_labels



array([1, 0, 1, ..., 1, 0, 1])

In [14]:
# Accuracy of the out-of-fold domain predictions against the true domain labels.
acc = accuracy_score(y, oof_preds)
print(f"Overall OOF accuracy with model pred: {acc:.4f}")

Overall OOF accuracy with model pred: 0.9485


In [15]:
# Per-class precision / recall / F1 of the out-of-fold domain predictions.
print(classification_report(y, oof_preds))

              precision    recall  f1-score   support

           0       0.80      0.92      0.86      1000
           1       0.98      0.95      0.97      5000

    accuracy                           0.95      6000
   macro avg       0.89      0.94      0.91      6000
weighted avg       0.95      0.95      0.95      6000



### Label classifier

In [16]:
# Attach the predicted domain to every test row (adds a column to `test`).
test["domain_pred"] = test_labels

# Route each test row to the label model of its predicted domain; each subset
# is an independent copy with a fresh 0..n-1 index.
test_d1 = test.loc[test['domain_pred'] == 0].copy().reset_index(drop=True)
test_d2 = test.loc[test['domain_pred'] == 1].copy().reset_index(drop=True)

### D1 Training

In [17]:
# Token frequencies are counted on the domain-1 training data only, to avoid data leakage.
token_freq_d1 = Counter()
for tokens in train_d1["text"]:
    token_freq_d1.update(tokens)

# 0. Concatenate all texts (domain-1 train rows first, then predicted-domain-1 test rows)
d1_texts = pd.concat([train_d1["text"], test_d1["text"]], ignore_index=True)

# 1. Build the n-gram BoW for train and test in one call so both share columns.
# NOTE(review): tokens_to_ngram_bow applies its min_freq filter to counts over
# everything it is given, so test texts contribute to which n-grams are kept.
X_d1_all = tokens_to_ngram_bow(d1_texts, vocab_size, n_list=(1,2,3), min_freq=3)

# 2. Split back into train / test
X_d1_bow   = X_d1_all[:len(train_d1)]
test_d1_bow = X_d1_all[len(train_d1):]

# 3. Apply TF-IDF transformation (fitted on the train rows only).
# The TF-IDF matrices are not part of the final matrices built in step 7,
# where they are commented out.
d1_tfidf_transformer = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)
X_train_d1_tfidf = d1_tfidf_transformer.fit_transform(X_d1_bow)
X_test_d1_tfidf = d1_tfidf_transformer.transform(test_d1_bow)

# 4. Seq length and token diversity features
num_feats_train_d1 = build_numeric_features(train_d1["text"], token_freq_d1)   # ndarray
num_feats_test_d1  = build_numeric_features(test_d1["text"],  token_freq_d1)

# 5. Token frequency percentiles and variance
# (also left out of the final matrices in step 7)
train_freq_feats_d1 = build_frequency_features(train_d1["text"], token_freq_d1)
test_freq_feats_d1 = build_frequency_features(test_d1["text"], token_freq_d1)

# 6. Scale numerical features (scalers are fitted on train, applied to test)
d1_num_scaler = MaxAbsScaler()
train_num_feats_scaled_d1 = d1_num_scaler.fit_transform(num_feats_train_d1)
test_num_feats_scaled_d1 = d1_num_scaler.transform(num_feats_test_d1)

d1_freq_scaler = MaxAbsScaler()
train_freq_feats_scaled_d1 = d1_freq_scaler.fit_transform(train_freq_feats_d1)
test_freq_feats_scaled_d1 = d1_freq_scaler.transform(test_freq_feats_d1)

# 7. Concatenate all features for train and test separately
# Active features: raw n-gram counts + the scaled numeric features.
X_train_final_d1 = hstack([
    X_d1_bow,
    # X_train_d1_tfidf,
    train_num_feats_scaled_d1,
    # train_freq_feats_scaled_d1
]).tocsr()

X_test_final_d1 = hstack([
    test_d1_bow,
    # X_test_d1_tfidf,
    test_num_feats_scaled_d1,
    # test_freq_feats_scaled_d1
]).tocsr()

# Target of the domain-1 label classifier.
y_d1 = train_d1["label"]

In [18]:
## out of fold predictions
# The TTA helpers draw from NumPy's global RNG; seed it here so that this
# cell produces the same predictions every time it is run.
np.random.seed(42)

oof_d1_preds = np.zeros(len(y_d1), dtype=int)   # hard out-of-fold label predictions
oof_d1_probs = np.zeros(len(y_d1))              # out-of-fold P(label == 1)

test_d1_preds = np.zeros(len(test_d1))          # running sum of per-fold test probabilities

FOLDS = 20
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
for train_index, val_index in skf.split(X_train_final_d1, y_d1):
    X_train = X_train_final_d1[train_index]
    X_val = X_train_final_d1[val_index]

    y_train = y_d1.iloc[train_index]
    y_val = y_d1.iloc[val_index]

    # Soft-voting ensemble of two linear models. Both are seeded: without
    # random_state SGDClassifier shuffles differently on every run.
    # No resampling is applied in this cell.
    clf1 = LogisticRegression(max_iter=1000, C=0.1, solver='liblinear', random_state=42)
    clf2 = SGDClassifier(max_iter=8000, tol=1e-4, loss="modified_huber", alpha=0.1,
                         penalty='l2', random_state=42)
    eclf = VotingClassifier(estimators=[('lr', clf1), ('sgd', clf2)], voting='soft')

    eclf.fit(X_train, y_train)

    # VALIDATION: average the class-1 probability over 2 augmented copies
    X_val_aug_list = apply_tta(X_val, n_augmentations=2)

    val_probs = np.zeros(X_val.shape[0])
    for X_val_aug in X_val_aug_list:
        val_probs += eclf.predict_proba(X_val_aug)[:, 1]
    val_probs /= len(X_val_aug_list)

    oof_d1_probs[val_index] = val_probs
    oof_d1_preds[val_index] = (val_probs > 0.5).astype(int)

    # TEST: same procedure, again with 2 augmented copies
    X_test_aug_list = apply_tta(X_test_final_d1, n_augmentations=2)

    test_probs = np.zeros(len(test_d1))
    for X_test_aug in X_test_aug_list:
        test_probs += eclf.predict_proba(X_test_aug)[:, 1]
    test_probs /= len(X_test_aug_list)

    # accumulate this fold's test probabilities
    test_d1_preds += test_probs

test_d1_preds /= FOLDS # Average across folds

test_d1_labels = (test_d1_preds > 0.5).astype(int)
test_d1_labels

array([0, 1, 0, ..., 1, 0, 0])

In [19]:
# Accuracy of the out-of-fold label predictions on the domain-1 training data.
acc = accuracy_score(y_d1, oof_d1_preds)
print(f"Overall OOF accuracy with model pred: {acc:.4f}")

Overall OOF accuracy with model pred: 0.9550


In [20]:
# Per-class precision / recall / F1 of the out-of-fold domain-1 label predictions.
print(classification_report(y_d1, oof_d1_preds))

              precision    recall  f1-score   support

           0       0.94      0.97      0.96       500
           1       0.97      0.94      0.95       500

    accuracy                           0.95      1000
   macro avg       0.96      0.95      0.95      1000
weighted avg       0.96      0.95      0.95      1000



### D2 Training

In [21]:
# Token frequencies are counted on the domain-2 training data only, to avoid data leakage.
token_freq_d2 = Counter()
for tokens in train_d2["text"]:
    token_freq_d2.update(tokens)

In [22]:
# 0. Concatenate all texts (domain-2 train rows first, then predicted-domain-2 test rows)
d2_texts = pd.concat([train_d2["text"], test_d2["text"]], ignore_index=True)

# 1. Build the n-gram BoW for train and test in one call so both share columns.
# NOTE(review): tokens_to_ngram_bow applies its min_freq filter to counts over
# everything it is given, so test texts contribute to which n-grams are kept.
X_d2_all = tokens_to_ngram_bow(d2_texts, vocab_size, n_list=(1,2,3), min_freq=3)

# 2. Split back into train / test
X_d2_bow   = X_d2_all[:len(train_d2)]
test_d2_bow = X_d2_all[len(train_d2):]

# 3. Apply TF-IDF transformation (fitted on the train rows only).
# The TF-IDF matrices are not part of the final matrices built in step 7,
# where they are commented out.
d2_tfidf_transformer = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)
X_train_d2_tfidf = d2_tfidf_transformer.fit_transform(X_d2_bow)
X_test_d2_tfidf = d2_tfidf_transformer.transform(test_d2_bow)

# 4. Seq length and token diversity features
num_feats_train_d2 = build_numeric_features(train_d2["text"], token_freq_d2)   # ndarray
num_feats_test_d2  = build_numeric_features(test_d2["text"],  token_freq_d2)

# 5. Token frequency percentiles and variance
# (also left out of the final matrices in step 7)
train_freq_feats_d2 = build_frequency_features(train_d2["text"], token_freq_d2)
test_freq_feats_d2 = build_frequency_features(test_d2["text"], token_freq_d2)

# 6. Scale numerical features (scalers are fitted on train, applied to test)
d2_num_scaler = MaxAbsScaler()
train_num_feats_scaled_d2 = d2_num_scaler.fit_transform(num_feats_train_d2)
test_num_feats_scaled_d2 = d2_num_scaler.transform(num_feats_test_d2)

d2_freq_scaler = MaxAbsScaler()
train_freq_feats_scaled_d2 = d2_freq_scaler.fit_transform(train_freq_feats_d2)
test_freq_feats_scaled_d2 = d2_freq_scaler.transform(test_freq_feats_d2)

# 7. Concatenate all features for train and test separately
# Active features: raw n-gram counts + the scaled numeric features.
X_train_final_d2 = hstack([
    X_d2_bow,
    # X_train_d2_tfidf,
    train_num_feats_scaled_d2,
    # train_freq_feats_scaled_d2
]).tocsr()

X_test_final_d2 = hstack([
    test_d2_bow,
    # X_test_d2_tfidf,
    test_num_feats_scaled_d2,
    # test_freq_feats_scaled_d2
]).tocsr()

# Target of the domain-2 label classifier.
y_d2 = train_d2["label"]

In [23]:
## out of fold predictions
# The TTA helpers draw from NumPy's global RNG; seed it here so that this
# cell produces the same predictions every time it is run.
np.random.seed(42)

oof_d2_preds = np.zeros(len(y_d2), dtype=int)   # hard out-of-fold label predictions
oof_d2_probs = np.zeros(len(y_d2))              # out-of-fold P(label == 1)

test_d2_preds = np.zeros(len(test_d2))          # running sum of per-fold test probabilities

FOLDS = 20
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
for train_index, val_index in skf.split(X_train_final_d2, y_d2):
    X_train = X_train_final_d2[train_index]
    X_val = X_train_final_d2[val_index]

    y_train = y_d2.iloc[train_index]
    y_val = y_d2.iloc[val_index]

    # Oversample the minority class on the training fold only, so the
    # validation fold keeps its original class balance.
    smote = SMOTE(random_state=42, sampling_strategy=0.5)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Soft-voting ensemble of two linear models. Both are seeded: without
    # random_state SGDClassifier shuffles differently on every run.
    clf1 = LogisticRegression(max_iter=1000, C=0.1, solver='liblinear', random_state=42)
    clf2 = SGDClassifier(max_iter=8000, tol=1e-4, loss="modified_huber", alpha=0.1,
                         penalty='l2', random_state=42)
    eclf = VotingClassifier(estimators=[('lr', clf1), ('sgd', clf2)], voting='soft')

    eclf.fit(X_train_resampled, y_train_resampled)

    # VALIDATION: average the class-1 probability over 2 augmented copies
    X_val_aug_list = apply_tta(X_val, n_augmentations=2)

    val_probs = np.zeros(X_val.shape[0])
    for X_val_aug in X_val_aug_list:
        val_probs += eclf.predict_proba(X_val_aug)[:, 1]
    val_probs /= len(X_val_aug_list)

    oof_d2_probs[val_index] = val_probs
    oof_d2_preds[val_index] = (val_probs > 0.5).astype(int)

    # TEST: same procedure, again with 2 augmented copies
    X_test_aug_list = apply_tta(X_test_final_d2, n_augmentations=2)

    test_probs = np.zeros(len(test_d2))
    for X_test_aug in X_test_aug_list:
        test_probs += eclf.predict_proba(X_test_aug)[:, 1]
    test_probs /= len(X_test_aug_list)

    # accumulate this fold's test probabilities
    test_d2_preds += test_probs

test_d2_preds /= FOLDS # Average across folds

test_d2_labels = (test_d2_preds > 0.5).astype(int)
test_d2_labels



array([1, 0, 1, ..., 1, 0, 1])

In [24]:
# Accuracy of the out-of-fold label predictions on the domain-2 training data.
acc = accuracy_score(y_d2, oof_d2_preds)
print(f"Overall OOF accuracy with model pred: {acc:.4f}")

Overall OOF accuracy with model pred: 0.9382


In [25]:
# Per-class precision / recall / F1 of the out-of-fold domain-2 label predictions.
print(classification_report(y_d2, oof_d2_preds))

              precision    recall  f1-score   support

           0       0.44      0.91      0.60       250
           1       0.99      0.94      0.97      4750

    accuracy                           0.94      5000
   macro avg       0.72      0.92      0.78      5000
weighted avg       0.97      0.94      0.95      5000



### Submission

In [26]:
# Attach each domain model's predicted labels to its own test subset.
test_d1["label"] = test_d1_labels
test_d2["label"] = test_d2_labels

# Recombine both subsets and order the rows by `id`.
test_preds_df = pd.concat([test_d1, test_d2]).sort_values('id')
test_preds_df.head()

Unnamed: 0,text,id,text_length,unique_tokens,token_diversity,domain_pred,label
0,"[9159, 3048, 238, 276, 162, 286, 305, 22, 36, ...",0,251,90,0.358566,1,1
0,"[64, 5039, 1275, 6, 0, 871, 139, 270, 327, 237...",1,199,114,0.572864,0,0
1,"[327, 618, 76, 650, 121, 274, 1025, 0, 12207, ...",2,355,204,0.574648,1,0
1,"[6, 12, 609, 11905, 4, 879, 677, 78, 13352, 60...",3,102,69,0.676471,0,1
2,"[1, 5504, 55, 22, 101, 3783, 139, 2664, 4, 1, ...",4,144,95,0.659722,0,0


In [27]:
# Fill the sample submission with the predicted labels and write it out.
# NOTE(review): labels are assigned positionally (.values), so this assumes the
# rows of sample.csv are in the same order as test_preds_df sorted by `id` - confirm.
sub = pd.read_csv("sample.csv")
sub["class"] = test_preds_df["label"].values
sub.to_csv("submission.csv", index=False)