In [1]:
import pandas as pd
import numpy as np

In [2]:
data_path = 'csv/testdex.csv'

df = pd.read_csv(data_path)

In [3]:
df = df[['tag','word']]

In [4]:
#Convert CSV to nltk word ,tag form
corpus= []
temp = []
for word,tag in zip(df['word'],df['tag']):
    if word != '.':
#         word = stemmer.stem(str(word))
        temp.append((word,tag))
    else:
        corpus.append(temp)
        temp=[]
print("Sample document: ",corpus[0])

Sample document:  [('महिला', 'NN'), ('समालोचक', 'NN'), ('र', 'CC'), ('नेपाली', 'JX'), ('समालोचना', 'NN'), ('।', 'YF')]


In [5]:
#Seperate word and tags into attributes and labels
sentences = []
sentence_tags = []

for sentence in corpus:
    if len(sentence)<200:
        x=[]
        y=[]
        for word in sentence:
            x.append(word[0])
            y.append(word[1])
        if len(x) > 0:
            sentences.append(x)
            sentence_tags.append(y)
print("Sample sentence: ",sentences[10])
print("Sample sentence tags: ",sentence_tags[10])

Sample sentence:  ['समालोचनालेखन', 'का', 'लागि', 'पर्याप्त', 'अध्ययन', 'को', 'आवश्यकता', 'मात्र', 'ले', 'पुग्दैन', ',', 'विवेचना', 'र', 'विश्लेषण', 'गर्ने', 'क्षमता', 'को', 'पनि', 'आवश्यकता', 'पर्दछ', '।', '।']
Sample sentence tags:  ['NN', 'IKO', 'II', 'JX', 'NN', 'IKM', 'NN', 'TT', 'IE', 'VVYN1', 'YM', 'NN', 'CC', 'NN', 'VN', 'NN', 'IKM', 'TT', 'NN', 'VVYN1', 'YF', 'YF']


In [6]:
#Convert labels to numbers
labels = set()
for sentence in sentence_tags:
    for tag in sentence:
        labels.add(tag)
        
tag2index = {t: i + 1 for i, t in enumerate(list(labels))}
print("Total number of tags: ",labels)

Total number of tags:  {'JX', 'IKM', 'VOYN1', 'DGF', 'DKM', 'VDF', 'NN', 'PXR', 'PMXKX', 'PRFKM', 'MLX', 'DDX', 'RK', 'DKX', 'DGM', 'VDX', 'VOMX2', 'VI', 'VDM', 'PMXKF', 'DGX', 'IH', 'VVYX2', 'PTMKM', 'VCN', 'DGO', 'PRFKF', 'CSB', 'IKO', 'MLO', 'JT', 'VDO', 'DDM', 'VCM', 'PRFKX', 'MOX', 'PMXKO', 'FF', 'JM', 'RJ', 'DJM', 'TT', 'FZ', 'MM', 'PMX', 'VVTX2', 'FU', 'PXH', 'NP', 'DKF', 'PMXKM', 'QQ', 'IKX', 'DKO', 'RR', 'UU', 'YB', 'PTH', 'YQ', 'RD', 'PTMKF', 'MLF', 'VR', 'PRF', 'PRFKO', 'FS', 'PTM', 'VVTN1', 'VVMX2', 'YF', 'FO', 'YM', 'PTNKX', 'VOMX1', 'II', 'FB', 'CSA', 'VE', 'VVYN1', 'VVYM1F', 'VVYN1F', 'IA', 'PTN', 'PTMKX', 'VN', 'CC', 'VCH', 'IKF', 'DJX', 'JF', 'IE', 'DDO', 'PTMKO', 'PTNKM', 'VVMX1', 'JO', 'VS', 'VOYX2', 'VVTN1F', 'MOM', 'VQ'}


In [7]:
def tagsent2int(sent_tag):
    
    return [tag2index[tag] for tag in sent_tag]

sentence_tags = list(map(tagsent2int,sentence_tags))

In [8]:
#Convert label nums to categorical
def to_categorical(sequences,categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)

In [9]:
#Preprocess text to numbers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
t = Tokenizer(lower=False, oov_token='-PAD-')
t.fit_on_texts(sentences)
vocab_size = len(t.word_index) + 1

encoded_docs = t.texts_to_sequences(sentences)

print("Sample encoded doc 0 : ",encoded_docs[:2]," Shape: " ,np.asarray(encoded_docs).shape)

max_length = len(max(sentences,key=len))

padded_docs = pad_sequences(encoded_docs[:2], maxlen=max_length, padding='post')


Using TensorFlow backend.


Sample encoded doc 0 :  [[129, 811, 8, 27, 127, 2], [2530, 1788, 2]]  Shape:  (16633,)


In [10]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('glove.6B.100d.txt')
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 1193514 word vectors.


In [20]:
new embeddings_index.keys()

TypeError: 'dict_keys' object is not subscriptable

In [18]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in t.word_index.items():
#     print(word)
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [13]:
embedding_matrix[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])