In [1]:
from transformers import AutoTokenizer, AutoModel, pipeline
import torch
import numpy as np
import pandas as pd
import pickle
from pathlib import Path

In [2]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")

In [5]:
DATA_PATH_IN = '../../data/'
DATA_PATH_OUT = '../data/Experiment_with_chunking/'

In [6]:
with open(DATA_PATH_IN + 'text_dataset.pkl', 'rb') as f:
    text_data = pickle.load(f)

In [7]:
text_data.head()

Unnamed: 0,cell-type,source,title,tag
0,code,import pandas as pd import numpy as np import ...,0-9-try-better-parameters-better-score.ipynb,regression
1,markdown,try to overfit more pls upvote if you fork lik...,0-9-try-better-parameters-better-score.ipynb,regression
2,markdown,sub1 0 869,0-9-try-better-parameters-better-score.ipynb,regression
3,markdown,credit very simple code with score 0 886 by ay...,0-9-try-better-parameters-better-score.ipynb,regression
4,code,import pandas as pd import numpy as np from sk...,0-9-try-better-parameters-better-score.ipynb,regression


In [8]:
text_data.shape

(332354, 4)

*Credits: https://towardsdatascience.com/how-to-do-average-and-max-word-embedding-for-long-sentences-f3531e99d998*

In [9]:
def chunking(max_len, sent):
    """because the embedding function is trained on dim 512, so we have to limit the size of the sentences using max_len so the final chunked sentences wont exceed length 510
    Args:
        max_len (int): maximum number of tokens for each chunk
        sent (str): input sentence
    Returns:
        sent_chunks (List(str)): list of tokenized and chunked sentences
    """

    bert_tokenized_text = tokenizer.tokenize(sent)


    if len(bert_tokenized_text) > max_len:
        # using list comprehension to divide the sequence
        final = [
            bert_tokenized_text[i * max_len : (i + 1) * max_len]
            for i in range((len(bert_tokenized_text) + max_len - 1) // max_len)
        ]

        # join back to sentences for each of the chunks
        sent_chunks = []
        for item in final:
            try:
                # make sure the len(items) > 1 or else some of the embeddings will appear as len 1 instead of 768.
                assert len(item) > 1
            except Exception as e:
                print(item, e)
            sent_chunks.append(item)
        return sent_chunks

    else : return sent

In [10]:
len(chunking(5, """def embed_single_row_chunking(chunk): tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
    row_embeddings = model(torch.tensor(tokens_ids)[None,:])[0]"""))

14

In [11]:
def embed_single_row(text):

    '''
    gets a tokenized text chunk (code or md) and returns embeddings tensor
    
    '''
    bert_tokenized_text = tokenizer.tokenize(text)

    tokens = [tokenizer.cls_token] + bert_tokenized_text + [tokenizer.sep_token]

    tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
    row_embeddings = model(torch.tensor(tokens_ids)[None,:])[0]

    if row_embeddings is not None:
        return row_embeddings


In [12]:
def embed_single_chunk(chunk):

    '''
    gets a tokenized text chunk (code or md) and returns embeddings tensor
    
    '''
    
    tokens = [tokenizer.cls_token] + chunk + [tokenizer.sep_token]

    tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
    row_embeddings = model(torch.tensor(tokens_ids)[None,:])[0]

    if row_embeddings is not None:
        return row_embeddings


In [13]:
def average_embeddings_on_chunks(embedding_tensor):

    '''
    gets the returned embeddings tensor of a sequence and averages it 
    to return one 768 dim vector for each row in the dataframe

    '''

    avg_rep = np.empty(1)
    if embedding_tensor is not None:
        if type(embedding_tensor) == torch.Tensor:
            # convert tensor into np array
            tensor_np = embedding_tensor.cpu().detach().numpy()
            # average of embeddings of the tokens in the sequence
            avg_rep = np.mean(tensor_np[0], axis=0)
        else: 
            tensor_np = embedding_tensor
            # average of embeddings of the tokens in the sequence
            avg_rep = np.mean(tensor_np, axis=0)
        # return average representation of a sequence of tokens
        return avg_rep

In [14]:
row_avg_embedding = []
for row in text_data.index[260000:]:
    chunks = chunking(510, text_data.loc[row, 'source'])

    # the row is under 512
    if type(chunks) == str:
        embedding = embed_single_row(chunks)
        row_avg_embedding.append((average_embeddings_on_chunks(embedding),  text_data.loc[row, 'title'], text_data.loc[row, 'tag']))

    # the row is over 512 and has been chunked
    elif type(chunks) == list:
        for chunk in chunks:
            embedding = embed_single_chunk(chunk)
            row_avg_embedding.append((average_embeddings_on_chunks(embedding),  text_data.loc[row, 'title'], text_data.loc[row, 'tag']))

    print(row)
# build intermediary dataframe of averaged row vectors, titles and tags 
interm_df01 = pd.DataFrame(row_avg_embedding, columns = ['row_vector', 'title', 'tag'])

270281
270282
270283
270284
270285
270286
270287
270288
270289
270290
270291
270292
270293
270294
270295
270296
270297
270298
270299
270300
270301
270302
270303
270304
270305
270306
270307
270308


Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors


270309
270310
270311
270312
270313
270314
270315
270316
270317
270318
270319
270320
270321
270322
270323
270324
270325
270326
270327
270328
270329
270330
270331
270332
270333
270334
270335
270336
270337
270338
270339
270340
270341
270342
270343
270344
270345
270346
270347
270348
270349
270350
270351
270352
270353
270354
270355
270356
270357
270358
270359
270360
270361
270362
270363
270364
270365
270366
270367
270368
270369
270370
270371
270372
270373
270374
270375
270376
270377
270378
270379
270380
270381
270382
270383
270384
270385
270386
270387
270388
270389
270390
270391
270392
270393
270394
270395
270396
270397
270398
270399
270400
270401
270402
270403
270404
270405
270406
270407
270408
270409
270410
270411
270412
270413
270414
270415
270416
270417
270418
270419
270420
270421
270422
270423
270424
270425
270426
270427
270428
270429
270430
270431
270432
270433
270434
270435
270436
270437
270438
270439
270440
270441
270442
270443
270444
270445
270446
270447
270448
270449
270450
270451

In [None]:
interm_df01.head()

Unnamed: 0,row_vector,title,tag
0,"[-0.013696747, 0.30964553, 0.24072942, 0.18001...",siim-covid19-fastai-efficientnetv2-timm-models...,computer vision
1,"[-0.36897576, 0.33527824, 0.31178835, 0.125471...",siim-covid19-fastai-efficientnetv2-timm-models...,computer vision
2,"[-0.048591033, 0.31964386, 0.23311147, 0.14736...",siim-covid19-fastai-efficientnetv2-timm-models...,computer vision
3,"[-0.33506474, 0.2142307, 0.21149494, 0.1118682...",siim-covid19-fastai-efficientnetv2-timm-models...,computer vision
4,"[0.020527132, 0.15290816, 0.25919163, 0.374079...",siim-covid19-fastai-efficientnetv2-timm-models...,computer vision


In [None]:
interm_df01.tag.unique()

array(['computer vision', 'nlp'], dtype=object)

In [None]:
interm_df01.shape

(12007, 3)

In [None]:
interm_df01.to_pickle(DATA_PATH_OUT + 'interm_df_225000_235000.pkl')

Now let's construct the dataset of vectorized notebooks

In [None]:
pickles_dir = Path(DATA_PATH_OUT).glob('*')
vect_data = pd.DataFrame()

vect_data = pd.concat([pd.read_pickle(DATA_PATH_OUT + filename.name) for filename in pickles_dir])
vect_data.head()

Unnamed: 0,row_vector,title,tag
0,"[-0.4467445, 0.36630133, 0.3792921, -0.0237043...",learn-machine-learning-faster-1.ipynb,classification
1,"[-0.17873357, 0.394243, 0.26591498, 0.12528272...",learn-machine-learning-faster-1.ipynb,classification
2,"[0.06498895, 0.32523704, 0.13176449, 0.2967523...",learn-machine-learning-faster-1.ipynb,classification
3,"[-0.3805052, 0.32720983, 0.3625231, 0.11565907...",learn-machine-learning-faster-1.ipynb,classification
4,"[-0.49421173, 0.38245204, 0.38850525, -0.00256...",learn-machine-learning-faster-1.ipynb,classification


In [None]:
vect_data.shape

(232630, 3)

In [None]:
vect_data.reset_index(drop=True, inplace=True)

In [None]:
vect_data.index.duplicated().sum()

0

In [None]:
vect_data.shape

(232630, 3)

In [None]:
vect_data.isna().sum()

row_vector    0
title         0
tag           0
dtype: int64

In [None]:
def group_and_avg(df):
    grp_avg_embedding = []

    grp_by_title = df.groupby('title').groups
    grp_titles = list(grp_by_title.keys())
    for grp_title in grp_titles:
        # get indices range for each title
        grp_indices = grp_by_title[grp_title]
        # get row embeddings for each notebook 
        grp_vecs = df.loc[grp_indices, 'row_vector']
        # get group tag
        grp_tag = df.loc[grp_indices[0], 'tag']
        # call average method
        grp_avg_embedding.append((list(average_embeddings_on_chunks(grp_vecs)), grp_tag)) 
        
    vect_data = pd.DataFrame(grp_avg_embedding, columns = ['notebook_vector', 'tag'])
    return vect_data

In [None]:
vect_data_final = group_and_avg(vect_data)
vect_data_final.head()

Unnamed: 0,notebook_vector,tag
0,"[-0.34790626, 0.29865852, 0.30619365, 0.055174...",computer vision
1,"[-0.3819649, 0.31716973, 0.33679396, 0.0810977...",clustering
2,"[-0.33451593, 0.2831545, 0.29143128, 0.0433948...",computer vision
3,"[-0.27561176, 0.26736438, 0.2665188, 0.0765375...",nlp
4,"[-0.24364452, 0.28803557, 0.2663721, 0.1090390...",classification


In [None]:
vect_data_final.shape

(6260, 2)

In [None]:
vect_data_final.to_csv(DATA_PATH_OUT + 'vect_data_final.csv')