In [17]:
from transformers import AutoTokenizer, AutoModel, pipeline
import torch
import numpy as np
import pandas as pd
import pickle
from pathlib import Path

In [2]:
# Both the tokenizer and the encoder come from the same pretrained checkpoint.
CODEBERT_CHECKPOINT = "microsoft/codebert-base"

tokenizer = AutoTokenizer.from_pretrained(CODEBERT_CHECKPOINT)
model = AutoModel.from_pretrained(CODEBERT_CHECKPOINT)

In [3]:
# Input data directory (kept as a plain string with a trailing separator because
# later cells build file paths with `+`).
DATA_PATH_IN = 'C:\\Users\\dmasrour\\Documents\\CodeDoc_Generation\\Tasks\\Classification_Task\\data\\'
# Output directory for this experiment: a sub-folder of the input directory.
DATA_PATH_OUT = DATA_PATH_IN + 'Experiment_with_chunking\\'

In [4]:
# Load the pickled text dataset.
# NOTE: pickle can execute arbitrary code on load; only use this on files you produced yourself.
dataset_file = Path(DATA_PATH_IN + 'text_dataset.pkl')
with dataset_file.open('rb') as pkl_handle:
    text_data = pickle.load(pkl_handle)

In [5]:
# Preview the first rows of the loaded dataset.
text_data.head()

Unnamed: 0,cell-type,source,title,tag
0,code,import pandas as pd import numpy as np import ...,0-9-try-better-parameters-better-score.ipynb,regression
1,markdown,try to overfit more pls upvote if you fork lik...,0-9-try-better-parameters-better-score.ipynb,regression
2,markdown,sub1 0 869,0-9-try-better-parameters-better-score.ipynb,regression
3,markdown,credit very simple code with score 0 886 by ay...,0-9-try-better-parameters-better-score.ipynb,regression
4,code,import pandas as pd import numpy as np from sk...,0-9-try-better-parameters-better-score.ipynb,regression


In [6]:
# (rows, columns) of the loaded dataset.
text_data.shape

(332354, 4)

*Credits: https://towardsdatascience.com/how-to-do-average-and-max-word-embedding-for-long-sentences-f3531e99d998*

In [7]:
def chunking(max_len, sent):
    """Split a sentence into chunks of at most ``max_len`` tokenizer tokens.

    The text is tokenized with the module-level ``tokenizer``. The embedding
    loop in this notebook calls this with ``max_len=510`` so that a chunk still
    fits the 512-token model input once the CLS and SEP tokens are added.

    Args:
        max_len (int): maximum number of tokens per chunk; must be positive.
        sent (str): input sentence.

    Returns:
        str | list[list[str]]: ``sent`` itself, unchanged, when it tokenizes to
        ``max_len`` tokens or fewer; otherwise a list of token lists, each
        holding at most ``max_len`` tokens (only the last one can be shorter).
        Callers distinguish the two cases by the returned type.

    Raises:
        ValueError: if ``max_len`` is not positive.
    """
    if max_len <= 0:
        raise ValueError(f"max_len must be a positive integer, got {max_len!r}")

    tokens = tokenizer.tokenize(sent)

    # Short enough: hand the raw text back so the caller can embed it directly.
    if len(tokens) <= max_len:
        return sent

    # Step through the token list in strides of max_len.
    sent_chunks = [
        tokens[start:start + max_len]
        for start in range(0, len(tokens), max_len)
    ]

    # Very short chunks are still returned, but reported so they can be
    # inspected. An explicit check is used instead of `assert`, which is
    # stripped when Python runs with -O.
    for chunk in sent_chunks:
        if len(chunk) <= 1:
            print(chunk, "chunk has fewer than 2 tokens")

    return sent_chunks

In [12]:
# Quick check of chunking(): number of 5-token chunks produced for a short code snippet.
demo_snippet = """def embed_single_row_chunking(chunk): tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
    row_embeddings = model(torch.tensor(tokens_ids)[None,:])[0]"""
demo_chunks = chunking(5, demo_snippet)
len(demo_chunks)

14

In [8]:
def embed_single_row(text):

    '''
    Embed one raw (not yet tokenized) text with the module-level tokenizer and model.

    The text is tokenized, wrapped in the tokenizer's CLS/SEP tokens, converted
    to ids and sent through the model as a batch of one sequence.

    Args:
        text (str): code or markdown source short enough for the model
            (the embedding loop only passes texts of at most 510 tokens).

    Returns:
        The first element of the model output for that single-sequence batch
        (presumably the per-token hidden states, shape (1, n_tokens + 2, hidden)
        - TODO confirm against the model's output type).
    '''
    token_seq = [tokenizer.cls_token, *tokenizer.tokenize(text), tokenizer.sep_token]

    # [None, :] adds the leading batch dimension the model expects.
    input_ids = torch.tensor(tokenizer.convert_tokens_to_ids(token_seq))[None, :]

    # Inference only: without no_grad every call keeps an autograd graph alive,
    # which wastes memory over hundreds of thousands of rows.
    with torch.no_grad():
        return model(input_ids)[0]


In [9]:
def embed_single_chunk(chunk):

    '''
    Embed one already-tokenized chunk with the module-level tokenizer and model.

    The chunk is wrapped in the tokenizer's CLS/SEP tokens, converted to ids
    and sent through the model as a batch of one sequence.

    Args:
        chunk (list[str]): tokenizer tokens, e.g. one element of the list
            returned by chunking().

    Returns:
        The first element of the model output for that single-sequence batch
        (presumably the per-token hidden states, shape (1, len(chunk) + 2, hidden)
        - TODO confirm against the model's output type).
    '''
    token_seq = [tokenizer.cls_token, *chunk, tokenizer.sep_token]

    # [None, :] adds the leading batch dimension the model expects.
    input_ids = torch.tensor(tokenizer.convert_tokens_to_ids(token_seq))[None, :]

    # Inference only: without no_grad every call keeps an autograd graph alive,
    # which wastes memory over many chunks.
    with torch.no_grad():
        return model(input_ids)[0]


In [10]:
def average_embeddings_on_chunks(embedding_tensor):

    '''
    Average a sequence of embedding vectors into a single vector.

    Args:
        embedding_tensor: either a torch tensor whose first axis is a batch of
            one sequence (only element 0 of that axis is used), or any
            array-like of vectors that numpy can average along axis 0.
            ``None`` is passed through.

    Returns:
        numpy.ndarray | None: the mean over axis 0 of the sequence's vectors
        (768-dim for the model used in this notebook, per the original note),
        or ``None`` when the input is ``None``.
    '''
    if embedding_tensor is None:
        return None

    # isinstance (not type ==) so Tensor subclasses such as nn.Parameter take
    # the tensor path instead of falling through to the array-like branch.
    if isinstance(embedding_tensor, torch.Tensor):
        # Drop the batch axis: the embed_* helpers always send one sequence.
        token_vectors = embedding_tensor.detach().cpu().numpy()[0]
    else:
        token_vectors = embedding_tensor

    # Average over the vectors of the sequence.
    return np.mean(token_vectors, axis=0)

In [58]:
# Embed every row of the 225000:235000 slice. A short row yields one record;
# a long row yields one record per chunk.
row_avg_embedding = []
for row in text_data.index[225000:235000]:
    chunks = chunking(510, text_data.loc[row, 'source'])
    row_title = text_data.loc[row, 'title']
    row_tag = text_data.loc[row, 'tag']

    if isinstance(chunks, str):
        # the row is under 512: chunking() handed the raw text back
        pending_embeddings = (embed_single_row(piece) for piece in [chunks])
    elif isinstance(chunks, list):
        # the row is over 512 and has been chunked
        pending_embeddings = (embed_single_chunk(piece) for piece in chunks)
    else:
        pending_embeddings = ()

    # Generators keep embedding and averaging interleaved, one piece at a time.
    for embedding in pending_embeddings:
        row_avg_embedding.append((average_embeddings_on_chunks(embedding), row_title, row_tag))

    print(row)

# build intermediary dataframe of averaged row vectors, titles and tags
interm_df01 = pd.DataFrame(row_avg_embedding, columns = ['row_vector', 'title', 'tag'])

234347
234348
234349
234350
234351
234352
234353
234354
234355
234356
234357
234358
234359
234360
234361
234362
234363
234364
234365
234366
234367
234368
234369
234370
234371
234372
234373
234374
234375
234376
234377
234378
234379
234380
234381
234382
234383
234384
234385
234386
234387
234388
234389
234390
234391
234392
234393
234394
234395
234396
234397
234398
234399
234400
234401
234402
234403
234404
234405
234406
234407
234408
234409
234410
234411
234412
234413
234414
234415
234416
234417
234418
234419
234420
234421
234422
234423
234424
234425
234426
234427
234428
234429
234430
234431
234432
234433
234434
234435
234436
234437
234438
234439
234440
234441
234442
234443
234444
234445
234446
234447
234448
234449
234450
234451
234452
234453
234454
234455
234456
234457
234458
234459
234460
234461
234462
234463
234464
234465
234466
234467
234468
234469
234470
234471
234472
234473
234474
234475
234476
234477
234478
234479
234480
234543
234544
234545
234546
234547
234548
234549
234550
234551

In [59]:
# Preview the averaged row/chunk vectors built by the embedding loop.
interm_df01.head()

Unnamed: 0,row_vector,title,tag
0,"[-0.013696747, 0.30964553, 0.24072942, 0.18001...",siim-covid19-fastai-efficientnetv2-timm-models...,computer vision
1,"[-0.36897576, 0.33527824, 0.31178835, 0.125471...",siim-covid19-fastai-efficientnetv2-timm-models...,computer vision
2,"[-0.048591033, 0.31964386, 0.23311147, 0.14736...",siim-covid19-fastai-efficientnetv2-timm-models...,computer vision
3,"[-0.33506474, 0.2142307, 0.21149494, 0.1118682...",siim-covid19-fastai-efficientnetv2-timm-models...,computer vision
4,"[0.020527132, 0.15290816, 0.25919163, 0.374079...",siim-covid19-fastai-efficientnetv2-timm-models...,computer vision


In [60]:
# Distinct tags present in this slice.
interm_df01.tag.unique()

array(['computer vision', 'nlp'], dtype=object)

In [61]:
# Long rows contribute one record per chunk, so the record count can exceed the number of rows in the slice.
interm_df01.shape

(12007, 3)

In [62]:
# Persist this slice's vectors; the file name carries the 225000:235000 row range hard-coded in the embedding loop.
interm_df01.to_pickle(DATA_PATH_OUT + 'interm_df_225000_235000.pkl')

Now let's construct the dataset of vectorized notebooks

In [42]:
# Reload every intermediate slice saved in the output directory and stack them.
# Only *.pkl files are read, so any other file placed in that directory cannot
# break read_pickle; sorting makes the row order reproducible between runs.
pickle_paths = sorted(Path(DATA_PATH_OUT).glob('*.pkl'))

vect_data = pd.concat([pd.read_pickle(pickle_path) for pickle_path in pickle_paths])
vect_data.head()

Unnamed: 0,row_vector,title,tag
0,"[-0.4467445, 0.36630133, 0.3792921, -0.0237043...",learn-machine-learning-faster-1.ipynb,classification
1,"[-0.17873357, 0.394243, 0.26591498, 0.12528272...",learn-machine-learning-faster-1.ipynb,classification
2,"[0.06498895, 0.32523704, 0.13176449, 0.2967523...",learn-machine-learning-faster-1.ipynb,classification
3,"[-0.3805052, 0.32720983, 0.3625231, 0.11565907...",learn-machine-learning-faster-1.ipynb,classification
4,"[-0.49421173, 0.38245204, 0.38850525, -0.00256...",learn-machine-learning-faster-1.ipynb,classification


In [43]:
# (rows, columns) of the concatenated slices.
vect_data.shape

(232630, 3)

In [44]:
# Each slice carries its own 0..n index, so renumber the rows after concatenation.
# Rebinding instead of inplace=True keeps the cell free of hidden in-place mutation.
vect_data = vect_data.reset_index(drop=True)

In [45]:
# Sanity check: number of duplicated index labels (group_and_avg looks rows up by label).
vect_data.index.duplicated().sum()

0

In [46]:
# Shape re-checked after the index reset.
vect_data.shape

(232630, 3)

In [47]:
# Count missing values per column.
vect_data.isna().sum()

row_vector    0
title         0
tag           0
dtype: int64

In [None]:
def group_and_avg(df):
    """Collapse row-level vectors into one averaged vector per notebook.

    Rows are grouped by ``title``; within each group the ``row_vector`` entries
    are averaged element-wise and the tag of the group's first row is kept.

    Args:
        df (pd.DataFrame): frame with ``row_vector`` (equal-length numeric
            vectors), ``title`` and ``tag`` columns.

    Returns:
        pd.DataFrame: one row per distinct title, in sorted title order, with
        columns ``notebook_vector`` (list of floats) and ``tag``.
    """
    notebook_records = []

    for _, notebook_rows in df.groupby('title'):
        # Stack the group's vectors into a 2-D array and average over rows.
        stacked_vectors = np.stack(notebook_rows['row_vector'].to_list())
        notebook_vector = stacked_vectors.mean(axis=0)
        # Positional access: works even if the index has duplicate labels.
        notebook_tag = notebook_rows['tag'].iloc[0]
        notebook_records.append((list(notebook_vector), notebook_tag))

    return pd.DataFrame(notebook_records, columns = ['notebook_vector', 'tag'])

In [None]:
# Build the notebook-level dataset: one row per notebook title.
vect_data_final = group_and_avg(vect_data)
vect_data_final.head()

Unnamed: 0,notebook_vector,tag
0,"[-0.34790626, 0.29865852, 0.30619365, 0.055174...",computer vision
1,"[-0.3819649, 0.31716973, 0.33679396, 0.0810977...",clustering
2,"[-0.33451593, 0.2831545, 0.29143128, 0.0433948...",computer vision
3,"[-0.27561176, 0.26736438, 0.2665188, 0.0765375...",nlp
4,"[-0.24364452, 0.28803557, 0.2663721, 0.1090390...",classification


In [None]:
# (notebooks, columns) of the notebook-level dataset.
vect_data_final.shape

(6260, 2)

In [None]:
# DATA_PATH is not defined in the visible cells (NameError on a fresh kernel);
# this experiment's outputs live under DATA_PATH_OUT.
# NOTE(review): the cell that reloads the intermediate pickles reads from this
# same directory - make sure it does not try to load this CSV on a re-run.
vect_data_final.to_csv(DATA_PATH_OUT + 'vect_data_final.csv')