# Introduction

In this notebook , we will proceed by chunking before embeding part since codeBert can only accept 512 tokens .

## Importing librairies

In [2]:
# Used to import codebert model and tokenizer
from transformers import AutoTokenizer, AutoModel, pipeline
import torch

# Used to work with arrays
import numpy as np

# Used to manipule and analyse data
import pandas as pd

# Used to manipule and analyse pickle files
import pickle

# Used to deal with paths
from pathlib import Path

## Embedding the data using codeBERT

Load cobeBERT base model and tokenizer

In [3]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")

Path to Data

In [4]:
DATA_PATH_IN = "../data"
DATA_PATH_OUT = "../data"

In [5]:
with open(DATA_PATH_IN + 'text_dataset.pkl', 'rb') as f:
    text_data = pickle.load(f)

In [6]:
text_data.head()

Unnamed: 0,cell-type,source,title,tag
0,code,import pandas as pd import numpy as np import ...,0-9-try-better-parameters-better-score.ipynb,regression
1,markdown,try to overfit more pls upvote if you fork lik...,0-9-try-better-parameters-better-score.ipynb,regression
2,markdown,sub1 0 869,0-9-try-better-parameters-better-score.ipynb,regression
3,markdown,credit very simple code with score 0 886 by ay...,0-9-try-better-parameters-better-score.ipynb,regression
4,code,import pandas as pd import numpy as np from sk...,0-9-try-better-parameters-better-score.ipynb,regression


In [7]:
text_data.shape

(332354, 4)

## Chunking

*Credits: https://towardsdatascience.com/how-to-do-average-and-max-word-embedding-for-long-sentences-f3531e99d998*

In [8]:
def chunking(max_len, sent):
    """because the embedding function is trained on dim 512, so we have to limit the size of the sentences using max_len so the final chunked sentences wont exceed length 510
    Args:
        max_len (int): maximum number of tokens for each chunk
        sent (str): input sentence
    Returns:
        sent_chunks (List(str)): list of tokenized and chunked sentences
    """

    bert_tokenized_text = tokenizer.tokenize(sent)


    if len(bert_tokenized_text) > max_len:
        # using list comprehension to divide the sequence
        final = [
            bert_tokenized_text[i * max_len : (i + 1) * max_len]
            for i in range((len(bert_tokenized_text) + max_len - 1) // max_len)
        ]

        # join back to sentences for each of the chunks
        sent_chunks = []
        for item in final:
            try:
                # make sure the len(items) > 1 or else some of the embeddings will appear as len 1 instead of 768.
                assert len(item) > 1
            except Exception as e:
                print(item, e)
            sent_chunks.append(item)
        return sent_chunks

    else : return sent

In [9]:
len(chunking(5, """def embed_single_row_chunking(chunk): tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
    row_embeddings = model(torch.tensor(tokens_ids)[None,:])[0]"""))

14

## Embedding

In [10]:
def embed_single_row(text):

    '''
    gets a tokenized text chunk (code or md) and returns embeddings tensor
    
    '''
    bert_tokenized_text = tokenizer.tokenize(text)

    tokens = [tokenizer.cls_token] + bert_tokenized_text + [tokenizer.sep_token]

    tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
    row_embeddings = model(torch.tensor(tokens_ids)[None,:])[0]

    if row_embeddings is not None:
        return row_embeddings


In [11]:
def embed_single_chunk(chunk):

    '''
    gets a tokenized text chunk (code or md) and returns embeddings tensor
    
    '''
    
    tokens = [tokenizer.cls_token] + chunk + [tokenizer.sep_token]

    tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
    row_embeddings = model(torch.tensor(tokens_ids)[None,:])[0]

    if row_embeddings is not None:
        return row_embeddings


In [12]:
def average_embeddings_on_chunks(embedding_tensor):

    '''
    gets the returned embeddings tensor of a sequence and averages it 
    to return one 768 dim vector for each row in the dataframe

    '''

    avg_rep = np.empty(1)
    if embedding_tensor is not None:
        if type(embedding_tensor) == torch.Tensor:
            # convert tensor into np array
            tensor_np = embedding_tensor.cpu().detach().numpy()
            # average of embeddings of the tokens in the sequence
            avg_rep = np.mean(tensor_np[0], axis=0)
        else: 
            tensor_np = embedding_tensor
            # average of embeddings of the tokens in the sequence
            avg_rep = np.mean(tensor_np, axis=0)
        # return average representation of a sequence of tokens
        return avg_rep

In [13]:
row_avg_embedding = []
for row in text_data.index[255000:260000]:
    chunks = chunking(510, text_data.loc[row, 'source'])

    # the row is under 512
    if type(chunks) == str:
        embedding = embed_single_row(chunks)
        row_avg_embedding.append((average_embeddings_on_chunks(embedding),  text_data.loc[row, 'title'], text_data.loc[row, 'tag']))

    # the row is over 512 and has been chunked
    elif type(chunks) == list:
        for chunk in chunks:
            embedding = embed_single_chunk(chunk)
            row_avg_embedding.append((average_embeddings_on_chunks(embedding),  text_data.loc[row, 'title'], text_data.loc[row, 'tag']))

    print(row)
# build intermediary dataframe of averaged row vectors, titles and tags 
interm_df01 = pd.DataFrame(row_avg_embedding, columns = ['row_vector', 'title', 'tag'])

Token indices sequence length is longer than the specified maximum sequence length for this model (1322 > 512). Running this sequence through the model will result in indexing errors


265183
265184
265185
265186
265187
265188
265189
265190
265191
265192
265193
265194
265195
265196
265197
265198
265199
265200
265201
265202
265203
265204
265205
265206
265207
265208
265209
265210
265211
265212
265213
265214
265215
265216
265217
265218
265219
265220
265221
265222
265223
265224
265225
265226
265227
265228
265229
265230
265231
265232
265233
265234
265235
265236
265237
265238
265239
265240
265241
265242
265243
265244
265245
265246
265247
265248
265249
265250
265251
265252
265253
265254
265255
265256
265257
265258
265259
265260
265261
265262
265263
265264
265265
265266
265267
265268
265269
265270
265271
265272
265273
265274
265275
265276
265277
265278
265279
265280
265281
265282
265283
265284
265285
265286
265287
265288
265289
265290
265291
265292
265293
265294
265295
265296
265297
265298
265299
265300
265301
265302
265303
265304
265305
265306
265307
265308
265309
265310
265311
265312
265313
265314
265315
265316
265317
265318
265319
265320
265321
265322
265323
265324
265325

In [14]:
interm_df01.head()

Unnamed: 0,row_vector,title,tag
0,"[-0.09458804, 0.13385853, 0.22589089, 0.055878...",data-science-in-2021-adaptation-or-adoption.ipynb,nlp
1,"[-0.041827433, 0.10072717, 0.2360467, 0.083392...",data-science-in-2021-adaptation-or-adoption.ipynb,nlp
2,"[-0.01881382, 0.07125372, 0.23361225, 0.025474...",data-science-in-2021-adaptation-or-adoption.ipynb,nlp
3,"[-0.42020383, 0.16479528, 0.2807469, 0.1474800...",data-science-in-2021-adaptation-or-adoption.ipynb,nlp
4,"[-0.363763, 0.32521477, 0.3240124, 0.015629904...",data-science-in-2021-adaptation-or-adoption.ipynb,nlp


In [15]:
interm_df01.tag.unique()

array(['nlp'], dtype=object)

In [16]:
interm_df01.shape

(5406, 3)

## Save Data

In [17]:
interm_df01.to_pickle(DATA_PATH_OUT + 'interm_df_25000_260000.pkl')

In [18]:
pickles_dir = Path(DATA_PATH_OUT).glob('*')
vect_data = pd.DataFrame()

vect_data = pd.concat([pd.read_pickle(DATA_PATH_OUT + filename.name) for filename in pickles_dir])
vect_data.head()

Unnamed: 0,row_vector,title,tag
0,"[-0.4467445, 0.36630133, 0.3792921, -0.0237043...",learn-machine-learning-faster-1.ipynb,classification
1,"[-0.17873357, 0.394243, 0.26591498, 0.12528272...",learn-machine-learning-faster-1.ipynb,classification
2,"[0.06498895, 0.32523704, 0.13176449, 0.2967523...",learn-machine-learning-faster-1.ipynb,classification
3,"[-0.3805052, 0.32720983, 0.3625231, 0.11565907...",learn-machine-learning-faster-1.ipynb,classification
4,"[-0.49421173, 0.38245204, 0.38850525, -0.00256...",learn-machine-learning-faster-1.ipynb,classification


In [19]:
vect_data.shape

(361802, 3)

In [20]:
vect_data.reset_index(drop=True, inplace=True)

In [21]:
vect_data.index.duplicated().sum()

0

In [22]:
vect_data.shape

(361802, 3)

In [23]:
vect_data.isna().sum()

row_vector    0
title         0
tag           0
dtype: int64

## Average embeddings per title

In [24]:
def group_and_avg(df):
    grp_avg_embedding = []

    grp_by_title = df.groupby('title').groups
    grp_titles = list(grp_by_title.keys())
    for grp_title in grp_titles:
        # get indices range for each title
        grp_indices = grp_by_title[grp_title]
        # get row embeddings for each notebook 
        grp_vecs = df.loc[grp_indices, 'row_vector']
        # get group tag
        grp_tag = df.loc[grp_indices[0], 'tag']
        # call average method
        grp_avg_embedding.append((list(average_embeddings_on_chunks(grp_vecs)), grp_tag)) 
        
    vect_data = pd.DataFrame(grp_avg_embedding, columns = ['notebook_vector', 'tag'])
    return vect_data

In [25]:
vect_data_final = group_and_avg(vect_data)
vect_data_final.head()

Unnamed: 0,notebook_vector,tag
0,"[-0.35992935, 0.2910624, 0.30088913, 0.0578330...",computer vision
1,"[-0.37886396, 0.31178004, 0.33021998, 0.095746...",clustering
2,"[-0.33451593, 0.2831545, 0.29143128, 0.0433948...",computer vision
3,"[-0.3596119, 0.28623548, 0.29093447, 0.0510302...",nlp
4,"[-0.24906486, 0.28819692, 0.26886925, 0.107111...",classification


In [26]:
vect_data_final.shape

(6260, 2)

## Save Data

In [27]:
vect_data_final.to_csv(DATA_PATH_OUT + 'vect_data_final_chunking.csv')