In [1]:
!pip install gensim --upgrade

Collecting gensim
  Downloading gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 1.6 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.1.2


In [2]:
import os
from google.colab import drive
# Mount Google Drive at /content/drive (the google.colab module is Colab-specific).
drive.mount('/content/drive')
# Work from the project folder so the relative model/encoder paths used below resolve.
os.chdir("/content/drive/MyDrive/NLP/03")
# List the folder to confirm the expected model artifacts are present.
!ls

Mounted at /content/drive
ann_word2vec_12.h5	  word2vec.model
dataset-1.csv		  word2vec.model.syn1neg.npy
dataset-wiki.csv	  word2vec.model.wv.vectors.npy
label_encoder.pickle	  word2vec_wiki_12.model
lang-detect2.ipynb	  word2vec_wiki_12.model.syn1neg.npy
lang-detect3.ipynb	  word2vec_wiki_12.model.wv.vectors.npy
lang-detect4.ipynb	  word2vec-wiki-ann.h5
lang-detect-clean.ipynb   word2vec_wiki_ann.model
lang-detect.ipynb	  word2vec_wiki_lstm
tfidf-lstm-13.pickle	  word2vec-wiki-lstm.h5
tfidf-lstm-model-13.h5	  word2vec_wiki_lstm.model
tfidf-nb-14-model.pickle  word2vec_wiki.model.syn1neg.npy
tfidf-nb-14.pickle	  word2vec_wiki.model.wv.vectors.npy


In [11]:
# NOTE(review): this cell's execution count (In [11]) is out of sequence and nothing
# else in the visible notebook reads from this folder; it looks like a leftover check.
!ls /content/drive/MyDrive/NLP/Tubes

bin  data


In [3]:
# load label encoder
import pickle
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that come from a trusted source.
with open('label_encoder.pickle', 'rb') as handle:
    # lang_encoder is used later, via inverse_transform(), to map one-hot rows back to labels.
    lang_encoder = pickle.load(handle)

In [4]:
# load w2v
from gensim.models import Word2Vec
# Load the saved Word2Vec model from the working directory; its `.wv` lookup
# supplies the per-token embeddings that word_vector() averages.
w2v_lang_model = Word2Vec.load("word2vec_wiki_12.model")

In [5]:
# load clf
import tensorflow as tf
# Load the saved Keras classifier that predict() feeds the averaged sentence vector into.
lang_model = tf.keras.models.load_model('ann_word2vec_12.h5')
# Print the architecture. In the recorded summary the output layer has 10 units and the
# first Dense layer has 100,500 params = (200 + 1) * 500, i.e. a 200-dimensional input.
lang_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 500)               100500    
                                                                 
 dense_1 (Dense)             (None, 500)               250500    
                                                                 
 dense_2 (Dense)             (None, 250)               125250    
                                                                 
 dense_3 (Dense)             (None, 10)                2510      
                                                                 
Total params: 478,760
Trainable params: 478,760
Non-trainable params: 0
_________________________________________________________________


In [6]:
# preprocess - 1 - regex cleanup + lowercase
import re
def regex_filter(text):
  """Replace punctuation, digits, brackets and line breaks with spaces, then lower-case.

  Args:
    text: Raw input string.

  Returns:
    The cleaned, lower-cased string. Every removed character becomes exactly one
    space, so word boundaries are preserved for the tokenizer.
  """
  # Fixes relative to the previous pattern:
  #  * the class contained a bare `n` (presumably a newline escape that lost its
  #    backslash), which blanked out every lowercase letter "n"; it is `\n` now;
  #  * `[[]]` only matched the literal two-character sequence "[]"; both brackets
  #    are now escaped inside the single character class.
  # NOTE(review): if the saved models were trained on text cleaned with the old
  # pattern, re-validate (or retrain) them against this corrected preprocessing.
  text = re.sub(r'[!@#$(),\n"%^*?:;~`0-9.†\[\]]', ' ', text)
  return text.lower()

In [7]:
# preprocess - 2 - tokenizer
import nltk
from nltk.tokenize import word_tokenize
# Fetch the 'punkt' tokenizer data used by word_tokenize.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
# preprocess - 3 - vectorize
import numpy as np
def word_vector(tokens, size):
  """Average the Word2Vec embeddings of `tokens` into one row vector.

  Tokens missing from the vocabulary of the module-level `w2v_lang_model` are
  skipped. If no token is found at all, a warning is printed and the all-zero
  vector is returned.

  Args:
    tokens: Iterable of token strings.
    size: Length each embedding is reshaped to (and the width of the result).

  Returns:
    A NumPy array of shape (1, size).
  """
  total = np.zeros((1, size))
  hits = 0
  for token in tokens:
    try:
      embedding = w2v_lang_model.wv[token].reshape((1, size))
    except KeyError:
      # Out-of-vocabulary token: leave it out of the average.
      continue
    total += embedding
    hits += 1

  if hits == 0:
    print('WARNING: all OOV')
    return total

  total /= hits
  return total

In [9]:
# combine
def preprocess(sentence):
  """Turn a raw sentence into its averaged embedding row.

  Runs the three steps defined above in order: regex_filter() cleanup,
  word_tokenize() tokenization, and word_vector() averaging with size 200.

  Args:
    sentence: Raw input string.

  Returns:
    Whatever word_vector() returns for the tokens, i.e. an array of shape (1, 200).
  """
  tokens = word_tokenize(regex_filter(sentence))
  return word_vector(tokens, 200)

In [10]:
def predict(text):
  """Predict the language label for `text`.

  Args:
    text: Raw input string.

  Returns:
    The first element of `lang_encoder.inverse_transform(...)` applied to the
    one-hot encoding of the top-scoring class ('ind' and 'eng' in the recorded runs).
  """
  # Delegate to preprocess() instead of repeating the clean -> tokenize -> vectorize
  # steps, so the pipeline and its vector size are defined in exactly one place.
  features = preprocess(text)
  scores = lang_model.predict(features)
  # One-hot encode the arg-max class of each row before handing it to the encoder.
  one_hot = np.zeros_like(scores)
  one_hot[np.arange(len(scores)), scores.argmax(axis=1)] = 1
  return lang_encoder.inverse_transform(one_hot)[0]

In [13]:
# Smoke test: mixed Indonesian/English input.
predict("nama saya adalah my name is")

'ind'

In [14]:
# Smoke test: plain English input.
predict("this movie is good")

'eng'