In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/NepaliWord2vec

/content/drive/MyDrive/NepaliWord2vec


## Importing the Dataset

In [6]:
import string
import time
import re
import snowballstemmer
import gensim

import setting as cfg

## Reading the File

In [4]:
start = time.process_time() 
print("Reading the file .......")
f = open(cfg.raw_text_files , encoding= 'utf-8' , buffering= 10000)
lines = f.read().strip().split(u"।")
sentences = [sentence.translate(str.maketrans('', '', string.punctuation)) for sentence in lines]
# f.close()
print(f"Total number of lines in text file {len(sentences)}")
print(f"Time required to read the file {time.process_time() - start}")

Reading the file .......
Total number of lines in text file 5891518
Time required to read the file 68.18159049600001


In [5]:
mainlist = list()

class Main_Data_list:
    def __init__(self, dataset):
        self.dataset = dataset
        self.stop_word_list = []
        self.mainlist = []
        
        a_file = open('input/stopwords/stopwords.txt', "r" ,encoding= 'utf-8')
        for line in a_file:
            stripped_line = line.strip()
            self.stop_word_list.append(stripped_line)
        a_file.close()
        
        self.stemmer = snowballstemmer.NepaliStemmer()
        
        
    def simple_tokenizer(self,text) -> list:
        
        line = re.sub('[।]',"", text)
        
        devanagari_range = r'[\u0900-\u097F\\]'
        def getDevanagariCharCount(token):
            return len(list(filter(lambda char: re.match(devanagari_range, char), (char for char in token))))
        def isDevanagari(token):
            return True if getDevanagariCharCount(token) >= len(token)/2 else False 

        tokens = list(filter(lambda t: isDevanagari(t), line.split(" ")))
        return tokens

    def get(self):
        for i,line in enumerate(self.dataset[0:2000000]):
            
            wordsList = self.simple_tokenizer(line)
            words = [w for w in wordsList if not w in self.stop_word_list]
            words  = self.stemmer.stemWords(words)
            if len(words) > 3:
                self.mainlist.append(words)
            if i % 100000 == 0:
                print(f"DONE FOR {i/100000} LAKHS LINES")
        return self.mainlist
                
final = Main_Data_list(sentences)
mainlist = final.get()

DONE FOR 0.0 LAKHS LINES
DONE FOR 1.0 LAKHS LINES
DONE FOR 2.0 LAKHS LINES
DONE FOR 3.0 LAKHS LINES
DONE FOR 4.0 LAKHS LINES
DONE FOR 5.0 LAKHS LINES
DONE FOR 6.0 LAKHS LINES
DONE FOR 7.0 LAKHS LINES
DONE FOR 8.0 LAKHS LINES
DONE FOR 9.0 LAKHS LINES
DONE FOR 10.0 LAKHS LINES
DONE FOR 11.0 LAKHS LINES
DONE FOR 12.0 LAKHS LINES
DONE FOR 13.0 LAKHS LINES
DONE FOR 14.0 LAKHS LINES
DONE FOR 15.0 LAKHS LINES
DONE FOR 16.0 LAKHS LINES
DONE FOR 17.0 LAKHS LINES
DONE FOR 18.0 LAKHS LINES
DONE FOR 19.0 LAKHS LINES


## Training

In [10]:
model = gensim.models.Word2Vec(
    window=  5,
    min_count=2,
    workers= 4,
    size = 200,
)

model.build_vocab(mainlist, progress_per=1000 )

model.train(mainlist, total_examples= model.corpus_count, epochs= model.epochs)

(103822451, 122202960)

## Testing

In [11]:
model.wv.most_similar('जमल')

[('पुतलीसडक', 0.8554033637046814),
 ('बानेश्वर', 0.8238137364387512),
 ('बागबजार', 0.8135662078857422),
 ('मैतीदेवी', 0.7989574074745178),
 ('गोंगबु', 0.7972407937049866),
 ('कलं', 0.7865052223205566),
 ('त्रिपुरेश्वर', 0.785507321357727),
 ('लैनचौर', 0.7765045166015625),
 ('तीनकु', 0.776380717754364),
 ('लाजिम्पाट', 0.7671419382095337)]

In [12]:
model.wv.most_similar('काठमाडौं')

[('काठमाडांै', 0.6255369186401367),
 ('काठमाडौ', 0.6137765645980835),
 ('काठमाडाैं', 0.5868496894836426),
 ('पोखरा', 0.5453355312347412),
 ('ललितपुर', 0.5321687459945679),
 ('भलामस्थित', 0.5107089281082153),
 ('स्वात्त', 0.5079240798950195),
 ('विराटनगर', 0.5032570362091064),
 ('काठमाडौं\nकाठमाडौँ', 0.49517786502838135),
 ('भरतपुर', 0.4644666910171509)]

In [15]:
model.wv.most_similar('सुनिल')

[('सुनील', 0.8596311211585999),
 ('मनिष', 0.7828516364097595),
 ('बिक्रम', 0.782561182975769),
 ('प्रदिप', 0.7773913145065308),
 ('सुरज', 0.7743733525276184),
 ('समिर', 0.7712180614471436),
 ('धिरज', 0.7703405618667603),
 ('रवि', 0.766444206237793),
 ('दिपेश', 0.7663419246673584),
 ('सुशिल', 0.7656646966934204)]

In [16]:
model.save("nepaliW2V_5Million.model")