In [None]:
# Version history:
# 2022-12-05: v6: added function set_seed
# 2022-12-04: v5: CatBoost and transformers are added
# 2022-12-03: v4: added random_state=42 to make_label_distribution_equal. Fixed f-string in "Unexpected model tag: {model_tag}". Metrics: 0.497 +- 0.002, 0.502 +- 0.003
# 2022-12-03: v4: added function make_label_distribution_equal, added np.std to final output
# 2022-12-03: v3: added colab section. Metrics: 0.507, 0.497

# ===== Part0 - env preparation =====
## Name, paths

In [None]:
# Google Drive mount point used by drive.mount() further below
PATH_MOUNT = "/content/drive"
# Project root on the mounted drive (the trailing slash is kept for backward compatibility)
PATH_MAIN_DIR = f"{PATH_MOUNT}/MyDrive/nlp_final_prj/"
# PATH_MAIN_DIR already ends with "/", so strip it before appending to avoid ".../nlp_final_prj//defs"
PATH_DEFS = f"{PATH_MAIN_DIR.rstrip('/')}/defs"

## System info

In [None]:
# Print system id
!nvidia-smi
!hostname
!uname -a
!df -kh /tmp

Sat Dec  3 22:32:11 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P0    25W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!python -V  # If version < 3.9 then some f-string features may not work

Python 3.8.15


## Mount drive

In [None]:
from google.colab import drive
drive.mount(PATH_MOUNT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
assert os.path.isdir(PATH_MAIN_DIR)

In [None]:
%cd $PATH_MAIN_DIR
!pwd

/content/drive/MyDrive/nlp_final_prj
/content/drive/MyDrive/nlp_final_prj


# ===== Part 1: prepare dataset =====

## Imports 

In [None]:
import numpy as np
import os
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import time
import torch

## Paths and settings

In [None]:
# Files and folders

# Folder with the input CSV files. The path is relative, so it is resolved against the current
# working directory; the notebook changes into PATH_MAIN_DIR (via %cd) before this cell is run.
DIR_DATA_SRC = r'040_output__nb010_v1'
# Alternative input sets are kept commented out; exactly one FNAMES assignment should be active.
# NOTE(review): the VIX variant below lists the same file twice - confirm this is intended.
#FNAMES = ['VIX_RmSW=0_RmRep=0_1y_top10.csv', 'VIX_RmSW=0_RmRep=0_1y_top10.csv' ]  # Loads in <1 sec
FNAMES = ['AMZN_RmSW=0_RmRep=0_1y.csv.gz', 'NFLX_RmSW=0_RmRep=0_1y.csv.gz', ]  # Loads in ~4-6 sec per file (see the recorded output of the load cell)
#FNAMES = ['AAPL_RmSW=0_RmRep=0_1y.csv.gz', ]  # Loads in 20-30 sec

# Fail fast, before any loading starts, if the folder or one of the files is missing
assert os.path.isdir(DIR_DATA_SRC), f"Folder not found: {DIR_DATA_SRC}"
for f in FNAMES:
    assert os.path.isfile(os.path.join(DIR_DATA_SRC, f)), f"File not found: {f}"

AssertionError: Folder not found: 040_output__nb010_v1

In [None]:
# Dataset preparation settings

# Rows whose 'Date' is <= this value are removed by drop_old_dates_inplace (string comparison)
DROP_RECORDS_BEFORE_DATE_INCLUSIVE = '2019-07-20'  # Last date in datasets is 2020-07-21
# Key that selects the labelling rule; generate_labels_and_pcr_list lists the supported values
LABEL_GEN_STRATEGY = "d1_C=d1_O=0.5%=2cls"  # This string is a "key", see function generate_labels_and_pcr_list for explanations
# Columns copied from the raw data into the final dataset by do_feature_selection
COL_FEATURES = ['symbol', 'message', 'datetime', 'user', 'message_id', 'Date']  #, 'Time']
# Name of the target column added to the final dataset
COL_LABEL = 'label'
# Name of the column holding the relative price change in percent; it is derived from the same
# prices as the label, so the model functions assert that it is absent from the training features
COL_PCR = 'price_change_ratio'

# SPLIT_SHUFFLING_SEED = 42  # If None, then no shuffling is done
# Fraction of rows put into the test part by train_test_split
TEST_SIZE = 0.15
# NOTE(review): TRAIN_SIZE is not referenced in the visible part of the notebook
TRAIN_SIZE = 1.0 - TEST_SIZE

## Defs
Here are "pure" functions.

In [None]:
# More info: https://pytorch.org/docs/stable/notes/randomness.html
def init_seeds(seed=42):
    """Seed every random number generator used by the notebook.

    Seeds Python's `random`, NumPy's legacy global RNG and torch (CPU), and
    stores the seed in the PYTHONHASHSEED environment variable. When CUDA is
    available, the CUDA generators are seeded as well and cuDNN is switched
    to deterministic, non-benchmarking mode.

    Args:
        seed: integer seed applied to all generators.
    """
    # Python and CPU-related entropy
    os.environ["PYTHONHASHSEED"] = str(seed)
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)
    # torch.use_deterministic_algorithms(True)   # Raises a CUBLAS error on some cases
    # os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # Does not help for the error above

    # GPU-related entropy: nothing more to do on CPU-only machines
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # all visible GPUs
    torch.backends.cudnn.benchmark = False  # benchmark mode may select different kernels between runs
    torch.backends.cudnn.deterministic = True

In [None]:
# Function for the "worker_init_fn" param of torch DataLoader
# More info: https://pytorch.org/docs/stable/notes/randomness.html
def seed_worker(worker_id):
    """Seed `random` and NumPy from torch's initial seed.

    Intended for the `worker_init_fn` parameter of torch's DataLoader.

    Args:
        worker_id: worker index supplied by the caller; not used here.
    """
    # NumPy's legacy seeding only accepts 32-bit values, so keep the low 32 bits
    derived_seed = torch.initial_seed() & 0xFFFFFFFF
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(derived_seed)

In [None]:
def print_df_details(df: pd.DataFrame):
    """Print a quick overview of a DataFrame to stdout.

    The sections appear in this order: head, tail, `info()` and
    `describe(include='all')`.
    """
    for title, preview in (("Head", df.head()), ("Tail", df.tail())):
        print(f"\n{title}:\n", preview)

    print("\nInfo:")
    df.info()  # info() writes its report itself and returns None

    summary = df.describe(include="all")  # , datetime_is_numeric=True) - to suppress warnings
    print("\nDescribe:\n", summary)

In [None]:
def load_pandas_file(file_path: str, verbose=True):
    """Read a CSV file into a DataFrame and report shape and load time.

    Args:
        file_path: path of the CSV file, passed unchanged to `pd.read_csv`.
        verbose: when true, the loaded frame is also dumped with `print_df_details`.

    Returns:
        The loaded DataFrame.

    Raises:
        AssertionError: if `file_path` is not an existing file.
    """
    cwd = os.getcwd()
    assert os.path.isfile(file_path), f"Cannot find file: '{file_path}', cur folder: '{cwd}'"
    print("Loading data from: ", file_path)

    # Time only the read itself
    t0 = time.time()
    loaded = pd.read_csv(file_path)
    elapsed = time.time() - t0
    print(f"Success. Shape: {loaded.shape}, elapsed seconds: {elapsed:.2f}")

    if verbose:
        print_df_details(loaded)
    return loaded

In [None]:
def merge_dfs(df_list: list, verbose=True) -> pd.DataFrame:
    """Concatenate DataFrames row-wise under a fresh 0..n-1 index.

    Args:
        df_list: DataFrames to stack, in order.
        verbose: when true, print the input shapes followed by the result
            shape on a single line.

    Returns:
        The concatenated DataFrame.
    """
    if verbose:
        # Every input shape is followed by ';' and the line is left open
        print(*(f"{part.shape};" for part in df_list), sep="", end="")

    merged = pd.concat(df_list, ignore_index=True)

    if verbose:
        print("->", merged.shape)
    return merged

In [None]:
def drop_old_dates_inplace(df: pd.DataFrame, drop_date_inclusive: str, verbose=True) -> None:
    """Remove, in place, all rows whose 'Date' is on or before a cut-off date.

    The comparison is done between the 'Date' column and the cut-off string
    with `<=`, so both are expected to share a format that sorts
    chronologically (the notebook uses 'YYYY-MM-DD').

    Args:
        df: frame to modify; it must contain a 'Date' column.
        drop_date_inclusive: cut-off date as a string; rows with this exact
            date are dropped as well.
        verbose: when true, the resulting frame is dumped with `print_df_details`.

    Returns:
        None - the frame is modified in place. (The annotation used to claim
        a DataFrame, but nothing was ever returned.)

    Raises:
        AssertionError: if `drop_date_inclusive` is not a string.
    """
    assert isinstance(drop_date_inclusive, str)
    old_shape = df.shape
    stale_index = df.index[df['Date'] <= drop_date_inclusive]
    df.drop(index=stale_index, inplace=True)
    print(f"Old dates dropped. Shape before: {old_shape}, after: {df.shape}")
    if verbose:
        print_df_details(df)

In [None]:
def get_label(ch):
    """Map a price change in percent to a class label.

    Returns 1 for a change above +0.5, -1 for a change below -0.5 and 0 for
    everything in between, including both boundaries.
    """
    return 1 if ch > 0.5 else (-1 if ch < -0.5 else 0)


def generate_labels_and_pcr_list(df: pd.DataFrame, strategy_str: str) -> tuple:
    """Compute class labels and the price change ratio (pcr) for every row.

    Supported strategies:
        "d1_C=d1_O=0.5%=2cls": pcr is the relative change from 'd1_O' to
        'd1_C' in percent. The label is 1 when pcr > 0.5, -1 when pcr < -0.5
        and 0 otherwise (same thresholds as `get_label`).

    Args:
        df: frame with strictly positive 'd1_O' and 'd1_C' columns.
        strategy_str: key selecting the labelling rule.

    Returns:
        A tuple `(labels, pcr_list)` of two plain Python lists in row order:
        integer labels and float percentages.

    Raises:
        AssertionError: if the strategy is unknown or a price is not > 0.
    """
    if strategy_str != "d1_C=d1_O=0.5%=2cls":
        # Raised explicitly so that the check also survives `python -O`
        raise AssertionError(f"Unexpected strategy_str: {strategy_str!r}")

    assert (df['d1_O'] > 0.0).all()  # Prices must be > 0
    assert (df['d1_C'] > 0.0).all()  # Prices must be > 0

    # price_change_ratio = pcr, expressed in percent
    rel_change_perc = (df['d1_C'] / df['d1_O'] - 1.0) * 100.0

    # Convert from percentages to labels -1, 0, 1 in one vectorized pass
    labels = np.select([rel_change_perc > 0.5, rel_change_perc < -0.5], [1, -1], default=0)
    return labels.tolist(), rel_change_perc.to_list()

In [None]:
def do_feature_selection(df: pd.DataFrame):
    """Return an independent copy of `df` restricted to the COL_FEATURES columns.

    The selected column index is printed for reference.
    """
    selected = df[COL_FEATURES].copy()
    print(f"Selected cols: {selected.columns}")
    return selected

In [None]:
def do_label_transformation(df: pd.DataFrame):
    """Turn the three-class labels (-1, 0, 1) into a binary target.

    Rows labelled 0 (neutral) are removed and the remaining -1 labels are
    recoded to 0, so the result only holds the labels 0 and 1. The input
    frame is left untouched.

    Returns:
        A new DataFrame with the neutral rows removed and labels recoded.
    """
    neutral_index = df.index[df[COL_LABEL] == 0]
    kept = df.drop(index=neutral_index)  # drop() without inplace returns a new frame
    # Assign the result back instead of calling replace(..., inplace=True) on
    # the column selection: the chained in-place form modifies a temporary
    # object and no longer updates the frame under pandas Copy-on-Write.
    kept[COL_LABEL] = kept[COL_LABEL].replace({-1: 0})
    return kept

In [None]:
def calc_real_profit_perc(y_pred, pcr_list) -> float:
    """Placeholder for a trading-profit metric; currently always returns NaN.

    The draft implementation below the early return is intentionally
    disabled: it compounds profit over individual messages, whereas the
    predictions first have to be aggregated by date and ticker.

    Args:
        y_pred: predicted labels (1 = long, 0 = short).
        pcr_list: price change ratios in percent, aligned with `y_pred`.

    Returns:
        NaN until the TODO is resolved.
    """
    # `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0
    return np.nan  # TODO: This function is not correct, as it's necessary to aggregate predictions by date and ticker

    # ---- Disabled draft, kept as a starting point for the TODO above ----
    profit_ratio = 1.0
    assert len(y_pred) == len(pcr_list), f"{len(y_pred)}, {len(pcr_list)}"
    for i, (pred, pcr) in enumerate(zip(y_pred, pcr_list)):
        price_ratio = (pcr / 100.0 + 1.0)  # Convert from percents [-5% .. 5%] -> [-0.05 .. 0.05] -> [0.95 .. 1.05]
        assert 0.0 < price_ratio < np.inf, f"{i}, {price_ratio}"
        if pred == 1:
            # Long
            profit_ratio *= price_ratio
        elif pred == 0:
            # Short
            profit_ratio /= price_ratio
        else:
            assert False, "Unexpected label"
    return (profit_ratio - 1.0) * 100.0  # Profit in percents (0% - nothing changed)

In [None]:
def calc_hash_for_seq(values, hash_len=6):
    """Return a short fingerprint of a sequence.

    The elements are packed into a tuple, hashed with the built-in `hash`,
    and the last `hash_len` characters of the decimal hash are returned.
    Equal content yields the same fingerprint for a list, an ndarray and a
    Series, as the smoke checks below show.

    Raises:
        AssertionError: if `values` is not a list, ndarray or Series.
    """
    assert isinstance(values, (list, np.ndarray, pd.Series))
    digits = str(hash(tuple(values)))
    return digits[-hash_len:]

# Smoke checks: all three container types must print the same fingerprint
print(calc_hash_for_seq([1, 2, 3]))
print(calc_hash_for_seq(np.array([1, 2, 3])))
print(calc_hash_for_seq(pd.Series([1, 2, 3])))

497451
497451
497451


In [None]:
def make_label_distribution_equal(df: pd.DataFrame) -> pd.DataFrame:
    """Balance a binary dataset by down-sampling the larger class.

    Randomly selected rows of the more frequent label are removed (fixed
    random_state=42) until both labels occur equally often. The input frame
    is not modified.

    Args:
        df: frame with a 'label' column holding exactly the labels 0 and 1.

    Returns:
        A new DataFrame with equal counts of label 0 and label 1.

    Raises:
        AssertionError: if the 'label' column holds anything other than
            exactly the two labels 0 and 1.
    """
    counts = df['label'].value_counts()
    # The previous check `len(counts == 2)` measured the length of a boolean
    # Series, so it passed for any non-empty input
    assert set(counts.index) == {0, 1}, f"Expected exactly the labels 0 and 1, got: {list(counts.index)}"

    bigger_label = 0 if counts[0] > counts[1] else 1
    diff = int(abs(counts[0] - counts[1]))

    # Pick the surplus rows of the bigger class and drop them by index
    surplus_rows = df[df['label'] == bigger_label].sample(n=diff, replace=False, random_state=42)
    return df.drop(index=surplus_rows.index)

## Do prepare datasets

In [None]:
init_seeds(42)  # May be useful if torch DataLoader is used, etc.

In [None]:
# Load raw data, dropping old dates
# Each file is loaded on its own and trimmed right away, so that only the rows
# after DROP_RECORDS_BEFORE_DATE_INCLUSIVE are kept in memory for the merge.
df_list = []
for fname in FNAMES:
    full_name = os.path.join(DIR_DATA_SRC, fname)
    # load_pandas_file performs the same check; here the bare path is used as the message
    assert os.path.isfile(full_name), full_name
    df_temp = load_pandas_file(full_name, verbose=False)
    # Modifies df_temp in place (returns nothing)
    drop_old_dates_inplace(df_temp, DROP_RECORDS_BEFORE_DATE_INCLUSIVE, verbose=False)
    df_list.append(df_temp)

Loading data from:  040_output__nb010_v1/AMZN_RmSW=0_RmRep=0_1y.csv.gz
Success. Shape: (450379, 52), elapsed seconds: 3.90
Old dates dropped. Shape before: (450379, 52), after: (100280, 52)
Loading data from:  040_output__nb010_v1/NFLX_RmSW=0_RmRep=0_1y.csv.gz
Success. Shape: (666002, 52), elapsed seconds: 5.54
Old dates dropped. Shape before: (666002, 52), after: (110002, 52)


In [None]:
# Concat loaded parts to one dataframe
df_raw = merge_dfs(df_list)

(100280, 52);(110002, 52);-> (210282, 52)


In [None]:
# Choose columns for final dataset
df_final = do_feature_selection(df_raw)

Selected cols: Index(['symbol', 'message', 'datetime', 'user', 'message_id', 'Date'], dtype='object')


In [None]:
# Append the target column
# The labels are computed from df_raw (which still has the price columns) and come back
# as plain lists, so they are assigned to df_final by position. This relies on df_final
# still having exactly the rows of df_raw in the same order, which holds because
# df_final was produced by do_feature_selection(df_raw) (column selection only).
labels, pcr_list = generate_labels_and_pcr_list(df_raw, strategy_str=LABEL_GEN_STRATEGY)
df_final[COL_LABEL] = labels
df_final[COL_PCR] = pcr_list

In [None]:
# Drop labels for neutral class
df_final = do_label_transformation(df_final)

In [None]:
# print_df_details(df_final)

In [None]:
df_final[COL_LABEL].value_counts()

0    64504
1    56793
Name: label, dtype: int64

In [None]:
# Making labels distribution equal
df_final = make_label_distribution_equal(df_final)
df_final[COL_LABEL].value_counts()

0    56793
1    56793
Name: label, dtype: int64

In [None]:
df_final

Unnamed: 0,symbol,message,datetime,user,message_id,Date,label,price_change_ratio
0,AMZN,amzn believe we see a msft reaction,2020-07-22 22:36:53+00:00,1138814,230085465,2020-07-22,0,-3.602092
1,AMZN,ba tsla amzn ge googl new ceo of boeing david ...,2020-07-22 22:36:44+00:00,3433309,230085430,2020-07-22,0,-3.602092
2,AMZN,amzn should be back to 3300 after earning mar...,2020-07-22 22:33:29+00:00,1791337,230084558,2020-07-22,0,-3.602092
3,AMZN,amzn aapl anti trust congressional hearings st...,2020-07-22 22:33:09+00:00,767238,230084470,2020-07-22,0,-3.602092
4,AMZN,tsla musk wants tsla to be quot slightly profi...,2020-07-22 22:29:24+00:00,909664,230083378,2020-07-22,0,-3.602092
...,...,...,...,...,...,...,...,...
210152,NFLX,nflx has one of the better altman z scores in ...,2019-07-22 01:03:58+00:00,47688,171262286,2019-07-22,0,-1.329309
210155,NFLX,just charted nflx gld st ne in the snm room gt...,2019-07-22 00:40:18+00:00,186026,171261315,2019-07-22,0,-1.329309
210156,NFLX,nflx pivot point 318 39 hourly interesting t...,2019-07-22 00:36:45+00:00,2121786,171261168,2019-07-22,0,-1.329309
210157,NFLX,spy fb aapl amzn nflx googl amzn is a beast so...,2019-07-22 00:28:08+00:00,1323307,171260827,2019-07-22,0,-1.329309


# ===== Part 2: Model execution and scoring =====

## Imports (part 2)

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix

## Defs (part 2)

In [None]:
def train_model_and_get_predictions__sklearn_classifier(model_tag: str, 
    X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.Series, seed: int) -> pd.Series:
    """Fit a scikit-learn baseline classifier and predict labels for `X_test`.

    Args:
        model_tag: "dummy__most_frequent" or "dummy__uniform"; selects the
            DummyClassifier strategy.
        X_train: training features; must not contain the COL_PCR column.
        y_train: training labels.
        X_test: features to predict for.
        seed: passed to the classifier as `random_state`.

    Returns:
        Whatever the fitted classifier's `predict(X_test)` returns.

    Raises:
        AssertionError: if COL_PCR is among the training columns or the
            model tag is unknown.
    """
    # The price change ratio is derived from the same prices as the label: never train on it
    assert COL_PCR not in X_train.columns  # To avoid data leaks

    # Supported tags and the DummyClassifier strategy each one stands for
    strategy_by_tag = {
        "dummy__most_frequent": "most_frequent",
        "dummy__uniform": "uniform",
    }
    if model_tag not in strategy_by_tag:
        assert False, f"Unexpected model tag: {model_tag}"

    classifier = DummyClassifier(strategy=strategy_by_tag[model_tag], random_state=seed)
    classifier.fit(X_train, y_train)
    return classifier.predict(X_test)

In [None]:
!pip install catboost
from catboost import CatBoostClassifier

def train_model_and_get_predictions__catboost(model_tag: str, 
    X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.Series, seed: int) -> pd.Series:
    """Train a CatBoost text classifier on the 'message' column and predict.

    Note: the notebook defines a function with this same name again in the
    following cell; whichever cell is executed last wins.

    Args:
        model_tag: unused; kept for a uniform signature across model functions.
        X_train: training features; only the 'message' column is used and
            COL_PCR must not be present.
        y_train: training labels.
        X_test: test features; only the 'message' column is used.
        seed: random seed passed to CatBoost.

    Returns:
        Whatever the fitted model's `predict` returns for the test messages.

    Raises:
        AssertionError: if COL_PCR is among the training columns.
    """
    # Initial checks
    assert COL_PCR not in X_train.columns  # To avoid data leaks

    model = CatBoostClassifier(
        n_estimators=300,
        max_depth=8,
        task_type='GPU',
        verbose=0,
        random_seed=seed,  # the seed argument used to be ignored
    )

    # Train the model.
    # `.loc[:, ['message']]` selects the column as a one-column DataFrame;
    # the former `.loc['message']` was a row lookup and raised KeyError.
    model.fit(X_train.loc[:, ['message']], y_train, text_features=['message'])

    # Get predictions from the same one-column layout that was used for training
    y_pred = model.predict(X_test.loc[:, ['message']])

    return y_pred

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp38-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 1.2 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [None]:
!pip install catboost

from catboost import CatBoostClassifier

def train_model_and_get_predictions__catboost(model_tag: str, 
    X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.Series, seed: int) -> pd.Series:
    """Train a CatBoost text classifier on the 'message' column and predict.

    Args:
        model_tag: unused; kept for a uniform signature across model functions.
        X_train: training features; only the 'message' column is used and
            COL_PCR must not be present.
        y_train: training labels.
        X_test: test features; only the 'message' column is used.
        seed: random seed passed to CatBoost.

    Returns:
        Whatever the fitted model's `predict` returns for the test messages.

    Raises:
        AssertionError: if COL_PCR is among the training columns.
    """
    # Initial checks
    assert COL_PCR not in X_train.columns  # To avoid data leaks

    model = CatBoostClassifier(
        n_estimators=300,
        max_depth=8,
        task_type='GPU',
        verbose=0,
        random_seed=seed,  # the seed argument used to be ignored
    )

    # Train the model on the message text only (one-column DataFrame)
    model.fit(X_train.loc[:, ['message']], y_train, text_features=['message'])

    # Get predictions
    y_pred = model.predict(X_test.loc[:, ['message']])

    return y_pred

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install wandb
!pip install simpletransformers

import torch
from simpletransformers.classification import ClassificationModel, ClassificationArgs

def train_model_and_get_predictions__transformer(model_tag: str, 
    X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.Series, seed: int) -> pd.Series:
    """Fine-tune distilroberta-base on the 'message' column and predict.

    Args:
        model_tag: unused; kept for a uniform signature across model functions.
        X_train: training features; only the 'message' column is used and
            COL_PCR must not be present.
        y_train: training labels (two classes).
        X_test: test features; only the 'message' column is used.
        seed: random seed handed to simpletransformers via `manual_seed`.

    Returns:
        The predictions, i.e. the first element of what `model.predict` returns.

    Raises:
        AssertionError: if COL_PCR is among the training columns.
    """
    # Initial checks
    assert COL_PCR not in X_train.columns  # To avoid data leaks

    is_cuda = torch.cuda.is_available()

    model_args = ClassificationArgs()
    model_args.num_train_epochs = 2
    model_args.regression = False
    model_args.use_multiprocessing = is_cuda
    model_args.use_multiprocessing_for_evaluation = is_cuda
    model_args.overwrite_output_dir = True
    model_args.train_batch_size = 128
    model_args.manual_seed = seed  # the seed argument used to be ignored

    model = ClassificationModel(
        "roberta",
        "distilroberta-base",
        num_labels=2,
        use_cuda=is_cuda,
        args=model_args,
    )

    # Train the model; the frame is built with the column names "text" and "labels"
    train_df = pd.concat([X_train['message'], y_train], axis=1)
    train_df.columns = ["text", "labels"]
    model.train_model(train_df)

    # Get predictions; the second element of the returned pair is not needed here
    y_pred, _ = model.predict(list(X_test['message']))

    return y_pred

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Launch split-train-predict-metrics cycle for several seeds
def get_model_score_distribution(model_tag: str, df: pd.DataFrame, launch_cnt: int = 5, verbose=True):
    """Run the split / train / predict / score cycle once per seed.

    Seeds 42, 43, ... (launch_cnt of them) are used both for the shuffled
    train/test split and for the model function.

    Args:
        model_tag: a tag starting with 'dummy_' (scikit-learn baselines),
            'catboost' or 'transformer'.
        df: dataset holding the COL_FEATURES columns, COL_PCR and COL_LABEL.
        launch_cnt: number of seeds / repetitions.
        verbose: when true, also print the confusion matrix and the
            classification report of every run.

    Returns:
        List with one accuracy score per seed, in seed order.

    Raises:
        AssertionError: if the model tag is unknown.
    """
    result = []
    print("Legend: seed; X_train shape; X_test_shape; y_train shape,hash,sum; y_test shape,hash,sum")
    for seed in range(42, 42 + launch_cnt):
        X_train, X_test, y_train, y_test = train_test_split(
            df[COL_FEATURES + [COL_PCR]], df[COL_LABEL],
            # stratify=df[COL_LABEL], # Note: stratification leads to the same test set (though shuffled)
            shuffle=True,
            random_state=seed,
            test_size=TEST_SIZE
        )
        # Note: equal hash means binary equality, equal sum means the same rows but shuffled
        # (the shapes printed here still include the COL_PCR column)
        print(f"After split: {seed}, {X_train.shape}; {X_test.shape}; {y_train.shape},{calc_hash_for_seq(y_train)},{sum(y_train)};"
              + f" {y_test.shape},{calc_hash_for_seq(y_test)},{sum(y_test)}")

        # Separate price_change_ratio from the data.
        # drop() without inplace returns new frames, so the split results are
        # never modified in place (no SettingWithCopyWarning).
        # pcr_train / pcr_test are only needed by the disabled profit metric below.
        pcr_train = X_train[COL_PCR]
        X_train = X_train.drop(columns=[COL_PCR])
        pcr_test = X_test[COL_PCR]
        X_test = X_test.drop(columns=[COL_PCR])

        # Launch model-specific method
        if model_tag.startswith('dummy_'):
            y_pred = train_model_and_get_predictions__sklearn_classifier(model_tag, X_train, y_train, X_test, seed)
        elif model_tag == 'catboost':
            y_pred = train_model_and_get_predictions__catboost(model_tag, X_train, y_train, X_test, seed)
        elif model_tag == 'transformer':
            y_pred = train_model_and_get_predictions__transformer(model_tag, X_train, y_train, X_test, seed)
        else:
            # Raised explicitly so that the check also survives `python -O`
            raise AssertionError(f"Unexpected model tag: {model_tag}")

        # Calc score
        score1 = accuracy_score(y_test, y_pred)
        #score2 = calc_real_profit_perc(y_pred, pcr_test)
        #score3 = calc_real_profit_perc(y_train[:100], pcr_train[:100])
        #result.append(f"{score1:.5f}, {score2:.2f}%, {score3:.2f}%")
        result.append(score1)

        if verbose:
            print(confusion_matrix(y_test, y_pred))
            print(classification_report(y_test, y_pred, digits=3))

    return result

## Launch the model training/estimation

In [None]:
# Baseline 1: always predict the most frequent training label
model_tag = "dummy__most_frequent"
# launch_cnt is left at its default (5 seeds)
results = get_model_score_distribution(model_tag, df_final, verbose=False)
print("Sorted results (accuracy):", sorted(results))
# Mean and standard deviation (np.std) of the accuracy over the seeds
print(f"Mean accuracy: {np.mean(results):.3f} +- {np.std(results):.3f}")

Legend: seed; X_train shape; X_test_shape; y_train shape,hash,sum; y_test shape,hash,sum
After split: 42, (96548, 7); (17038, 7); (96548,),077111,48332; (17038,),802379,8461
After split: 43, (96548, 7); (17038, 7); (96548,),325290,48367; (17038,),784870,8426
After split: 44, (96548, 7); (17038, 7); (96548,),158103,48299; (17038,),603758,8494
After split: 45, (96548, 7); (17038, 7); (96548,),225964,48292; (17038,),863571,8501
After split: 46, (96548, 7); (17038, 7); (96548,),678696,48321; (17038,),537076,8472
Sorted results (accuracy): [0.4945416128653598, 0.4965958445826975, 0.49724146026528937, 0.49853269163047303, 0.4989435379739406]
Mean accuracy: 0.497 +- 0.002


In [None]:
# Baseline 2: predict labels uniformly at random
model_tag = "dummy__uniform"
results = get_model_score_distribution(model_tag, df_final, launch_cnt = 5, verbose=False)
print("Sorted results (accuracy):", sorted(results))
# Mean and standard deviation (np.std) of the accuracy over the seeds
print(f"Mean accuracy: {np.mean(results):.3f} +- {np.std(results):.3f}")

Legend: seed; X_train shape; X_test_shape; y_train shape,hash,sum; y_test shape,hash,sum
After split: 42, (96548, 7); (17038, 7); (96548,),077111,48332; (17038,),802379,8461
After split: 43, (96548, 7); (17038, 7); (96548,),325290,48367; (17038,),784870,8426
After split: 44, (96548, 7); (17038, 7); (96548,),158103,48299; (17038,),603758,8494
After split: 45, (96548, 7); (17038, 7); (96548,),225964,48292; (17038,),863571,8501
After split: 46, (96548, 7); (17038, 7); (96548,),678696,48321; (17038,),537076,8472
Sorted results (accuracy): [0.49812184528700554, 0.5003521540086865, 0.5013499236999648, 0.5018781547129945, 0.5072778495128536]
Mean accuracy: 0.502 +- 0.003


In [None]:
# CatBoost classifier trained on the message text
model_tag = "catboost"
results = get_model_score_distribution(model_tag, df_final, launch_cnt = 5, verbose=False)
print("Sorted results (accuracy):", sorted(results))
# Mean and standard deviation (np.std) of the accuracy over the seeds
print(f"Mean accuracy: {np.mean(results):.3f} +- {np.std(results):.3f}")

Legend: seed; X_train shape; X_test_shape; y_train shape,hash,sum; y_test shape,hash,sum
After split: 42, (96548, 7); (17038, 7); (96548,),077111,48332; (17038,),802379,8461
After split: 43, (96548, 7); (17038, 7); (96548,),325290,48367; (17038,),784870,8426
After split: 44, (96548, 7); (17038, 7); (96548,),158103,48299; (17038,),603758,8494
After split: 45, (96548, 7); (17038, 7); (96548,),225964,48292; (17038,),863571,8501
After split: 46, (96548, 7); (17038, 7); (96548,),678696,48321; (17038,),537076,8472
Sorted results (accuracy): [0.555992487381148, 0.5587510271158587, 0.5610987205071017, 0.5623312595375044, 0.5633877215635638]
Mean accuracy: 0.560 +- 0.003


In [None]:
# Fine-tuned distilroberta-base; this cell trains one model per seed
model_tag = "transformer"
results = get_model_score_distribution(model_tag, df_final, launch_cnt = 5, verbose=False)
print("Sorted results (accuracy):", sorted(results))
# Mean and standard deviation (np.std) of the accuracy over the seeds
print(f"Mean accuracy: {np.mean(results):.3f} +- {np.std(results):.3f}")

Legend: seed; X_train shape; X_test_shape; y_train shape,hash,sum; y_test shape,hash,sum
After split: 42, (96548, 7); (17038, 7); (96548,),077111,48332; (17038,),802379,8461


Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight'

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

  0%|          | 0/96548 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/755 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/755 [00:00<?, ?it/s]

  0%|          | 0/17038 [00:00<?, ?it/s]

  0%|          | 0/2130 [00:00<?, ?it/s]

After split: 43, (96548, 7); (17038, 7); (96548,),325290,48367; (17038,),784870,8426


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight'

  0%|          | 0/96548 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/755 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/755 [00:00<?, ?it/s]

  0%|          | 0/17038 [00:00<?, ?it/s]

  0%|          | 0/2130 [00:00<?, ?it/s]

After split: 44, (96548, 7); (17038, 7); (96548,),158103,48299; (17038,),603758,8494


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight'

  0%|          | 0/96548 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/755 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/755 [00:00<?, ?it/s]

  0%|          | 0/17038 [00:00<?, ?it/s]

  0%|          | 0/2130 [00:00<?, ?it/s]

After split: 45, (96548, 7); (17038, 7); (96548,),225964,48292; (17038,),863571,8501


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight'

  0%|          | 0/96548 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/755 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/755 [00:00<?, ?it/s]

  0%|          | 0/17038 [00:00<?, ?it/s]

  0%|          | 0/2130 [00:00<?, ?it/s]

After split: 46, (96548, 7); (17038, 7); (96548,),678696,48321; (17038,),537076,8472


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight'

  0%|          | 0/96548 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/755 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/755 [00:00<?, ?it/s]

  0%|          | 0/17038 [00:00<?, ?it/s]

  0%|          | 0/2130 [00:00<?, ?it/s]

Sorted results (accuracy): [0.5419063270336894, 0.5433736354032164, 0.5434910200727785, 0.5463082521422702, 0.5488320225378566]
Mean accuracy: 0.545 +- 0.002
