# Train seperate embedding for python keywords

Trained a separate embedding layer for python keywords using glove so that model understands and pays special attention to whitespaces, colon and other things (like comma etc)

- Used CoNala dataset from [CoNaLa: The Code/Natural Language Challenge](https://conala-corpus.github), to train the glove model.
- Faced some problem using the glove model directly into Pytorch model, so had ro convert glove model to word2vec using glove2word2vec library available in gensim
- Glove model is trained for 50 epochs and to generate embedding vectors of 256 dimension



In [20]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
! pip install glove-python-binary



In [22]:
from glove import Corpus, Glove
import numpy as np
import io
from gensim.models import KeyedVectors
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
import csv
import time
import random
import re
import os
import pickle
from io import open
import itertools
import math
import pandas as pd
import torch
import json

BASE_DIR='/content/drive/MyDrive/seq2py'
#https://stackoverflow.com/questions/62922640/calling-a-function-in-a-different-python-file-using-google-colab
%cd $BASE_DIR/utils
import preprocess, helper

/content/drive/MyDrive/seq2py/utils


In [23]:
#Then set a random seed for deterministic results/reproducability.
SEED = 2345
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'Tesla T4'

#Load source data


In [24]:
datasets = preprocess.read_data(file_name=os.path.join(BASE_DIR,'data/english_python_data_cleaned.txt'))
df_original = preprocess.tranform_to_dataframe(datasets)
df_original.head()

Unnamed: 0,Description,Code
1,write a python program to add two numbers,\nnum1 = 1.5\nnum2 = 6.3\nsum = num1 + num2\np...
2,write a python program to subtract two numbers,\nnum1 = 6\nnum2 = 3\ndiff = num1 - num2\nprin...
3,write a python function to add two user provid...,"def add_two_numbers(num1, num2):\n sum = nu..."
4,write a program to find and print the largest ...,num1 = 10\nnum2 = 12\nnum3 = 14\nif (num1 >= n...
5,write a program to find and print the smallest...,num1 = 10\nnum2 = 12\nnum3 = 14\nif (num1 <= n...


# Download CoNala corpus

In [25]:
!unzip /content/drive/MyDrive/seq2py/conala-corpus-v1.1.zip

Archive:  /content/drive/MyDrive/seq2py/conala-corpus-v1.1.zip
replace conala-corpus/conala-mined.jsonl? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: conala-corpus/conala-mined.jsonl  
  inflating: conala-corpus/conala-train.json  
  inflating: conala-corpus/conala-test.json  


In [26]:
def orginize_data(data_type):
  json_data = 'conala-corpus/' + data_type  
  path = open(json_data, "r")
  data = json.load(path)
  pairs=[]
  for dic in data:
      if dic["rewritten_intent"] is None:
          continue
      pairs.append([dic["rewritten_intent"], dic["snippet"]])
  return pairs

In [27]:
pairs= orginize_data('conala-train.json')
test_pairs=orginize_data('conala-test.json')

raw_data_train = {'Description' : [x[0] for x in pairs], 'Code': [x[1] for x in pairs]}
raw_data_test = {'Description' : [x[0] for x in test_pairs], 'Code': [x[1] for x in test_pairs]}

# raw_data = {**raw_data_train, **raw_data_test}

df_additional = pd.DataFrame(raw_data_train, columns=["Description", "Code"])
df_additional1 = pd.DataFrame(raw_data_test, columns=["Description", "Code"])

df = pd.concat([df_original, df_additional,df_additional1], ignore_index=True)

# Let's train glove model

Glove model is trained for 50 epochs and to generate embedding vectors of 256 dimension

In [31]:
def create_glove_embeddings(input):
  #Creating a corpus object
  corpus = Corpus() 

  #Training the corpus to generate the co occurence matrix which is used in GloVe
  corpus.fit(lines, window=10)
  glove = Glove(no_components=257, learning_rate=0.0005) 
  glove.fit(corpus.matrix, epochs=50, no_threads=4, verbose=True)
  glove.add_dictionary(corpus.dictionary)
  glove.save(os.path.join(BASE_DIR,'model/glove.model'))

  with io.open(os.path.join(BASE_DIR,'model/glove.model'),"w",encoding='utf-8') as f:
    for key,val in glove.dictionary.items():
      f.write(key+ " "+ re.sub("(\\n|\[|\])", "",np.array2string(glove.word_vectors[val])) +"\n")

In [32]:
#parse the source data and train glove model
lines = [preprocess.tokenize_python_code(i) for i in df['Code'].tolist()]
create_glove_embeddings(lines)


#convert glove to word2vec 
_ = glove2word2vec(glove_input_file=os.path.join(BASE_DIR,'model/glove.model'), word2vec_output_file=os.path.join(BASE_DIR,"model/emb_word2vec_format_bin.txt"))
emb_model = KeyedVectors.load_word2vec_format(os.path.join(BASE_DIR,"model/emb_word2vec_format_bin.txt"),binary=True)
word2index = {token: token_index for token_index, token in enumerate(emb_model.index2word)} 
emb_model.save_word2vec_format(os.path.join(BASE_DIR,'model/emb_word2vec_format.txt'),binary=False)

Performing 50 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49


In [30]:
cat ../model/emb_word2vec_format.txt | head -10

2128 255
 6.336922e-10 6.555213e-10 1.5603998e-19 6.3369293e-10 4.1957e-08 1.5177845e-19 6.3369293e-10 2.6077607e-09 1.501042e-19 6.3369293e-10 4.2248714e-05 1.5349568e-19 6.3369293e-10 0.00017187449 1.518015e-19 6.336922e-10 1.0255965e-08 1.5179823e-19 6.3369293e-10 1.062178e-05 1.5686395e-19 6.3369293e-10 1.1038549e-05 1.5095125e-19 6.336922e-10 1.0722533e-08 1.5179158e-19 6.336922e-10 0.00017663914 1.5602681e-19 6.3369293e-10 6.676012e-07 1.5602676e-19 6.336922e-10 0.0001690023 1.5010751e-19 6.3369293e-10 6.446926e-10 1.5264523e-19 6.336922e-10 1.6691116e-07 1.5179821e-19 6.336922e-10 2.6806337e-09 1.3569139e-19 6.3369293e-10 1.0257556e-08 1.5010085e-19 6.336922e-10 4.3914846e-05 1.5094458e-19 6.336922e-10 4.3201442e-05 1.5262546e-19 6.336922e-10 4.2007498e-05 1.526387e-19 6.3369293e-10 4.2193466e-08 1.5517314e-19 6.3369293e-10 0.00017089472 1.5178166e-19 6.3369293e-10 6.563672e-07 1.357146e-19 6.3369293e-10 1.6875204e-07 1.5263862e-19 6.3369293e-10 6.592019e-10 1.5601684e-19 6.3369

In [18]:
cat emb_word2vec_format1.txt | head -10

1402 255
 6.336922e-10 1.6877381e-07 1.5433268e-19 6.3369293e-10 1.0431038e-08 1.5263541e-19 6.3369293e-10 1.7062918e-07 1.5603669e-19 6.3369293e-10 4.102748e-08 1.5347578e-19 6.336922e-10 1.0980104e-05 1.5516978e-19 6.3369293e-10 4.148769e-08 1.3570461e-19 2.5637625e-09 1.0373285e-08 1.3571451e-19 1.025505e-08 4.1720533e-08 1.3569798e-19 2.5637625e-09 2.5785418e-09 9.107649e-12 2.5637625e-09 2.7302433e-06 9.107645e-12 6.4094063e-10 1.0490386e-08 9.106986e-12 6.4094063e-10 1.0780746e-08 4.00463e-11 1.715423e-07 2.6556195e-06 4.0057846e-11 4.200748e-05 1.06063505e-08 4.0057846e-11 0.00017470564 6.6009216e-07 4.0057846e-11 1.025596e-08 4.2193452e-08 4.0057846e-11 1.0501637e-05 4.2655493e-08 4.00463e-11 4.2485248e-05 4.1722338e-08 4.00463e-11 0.00016898732 2.5785998e-09 4.0057846e-11 2.6949003e-09 0.00017378188 4.0057846e-11 4.172142e-08 1.6874472e-07 4.00463e-11 4.14895e-08 1.0622479e-05 4.00463e-11 2.6221407e-09 6.787768e-07 4.00463e-11 1.07213936e-08 2.5933216e-09 4.00463e-11 1.0779601

https://www.tutorialexample.com/best-practice-to-create-word-embeddings-using-glove-deep-learning-