<a href="https://colab.research.google.com/github/dude123studios/AdvancedDeepLearning/blob/main/Item2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import gensim
import logging
import numpy as np
import os
import shutil
import tensorflow as tf

from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity 

# Configure the root logger: INFO level, with each record rendered as
# "<timestamp> : <level> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [None]:
# Base directory that the generated file paths in this notebook are joined to.
DATA_DIR = "./data"

# Location of the CSV passed to download_and_read.
UCI_DATA_URL = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases"
    "/00371/NIPS_1987-2015.csv"
)

def download_and_read(url):
  """Download a CSV of integer counts and load it as a sparse matrix.

  The file is fetched with ``tf.keras.utils.get_file`` (called with
  ``cache_dir='.'``) and then parsed line by line. On every data line the
  first comma-separated field is discarded and the remaining fields are
  parsed as integers. Lines whose first field is empty once surrounding
  double quotes are removed (the header row, and blank lines) are skipped.

  Args:
    url: location of the CSV file; its last path component is used as the
      local file name.

  Returns:
    A ``scipy.sparse.csr_matrix`` with one row per data line and one column
    per count field, storing only the non-zero counts.

  Raises:
    ValueError: if the file contains no data rows, or a count field is not
      an integer.
  """
  local_file = url.split('/')[-1]
  p = tf.keras.utils.get_file(local_file, url, cache_dir='.')
  row_ids, col_ids, data = [], [], []
  rid = 0
  num_cols = None
  # The context manager guarantees the handle is closed even when a
  # malformed line makes int() raise.
  with open(p, 'r') as f:
    for line in f:
      fields = line.strip().split(',')
      # Header row / blank line: nothing usable in the label column.
      if not fields[0].strip('"'):
        continue
      counts = np.array([int(x) for x in fields[1:]])
      # Keep only the non-zero cells of this row, in COO triplet form.
      nz_col_ids = np.nonzero(counts)[0]
      row_ids.extend([rid] * len(nz_col_ids))
      col_ids.extend(nz_col_ids.tolist())
      data.extend(counts[nz_col_ids].tolist())
      num_cols = counts.shape[0]
      rid += 1
  if num_cols is None:
    raise ValueError('no data rows found in {:s}'.format(p))
  TD = csr_matrix(
      (np.array(data), (np.array(row_ids), np.array(col_ids))),
      shape=(rid, num_cols))
  return TD

# Load the sparse count matrix from the UCI CSV.
TD = download_and_read(UCI_DATA_URL)

# Square matrix over TD's columns: entry (i, j) is the inner product of
# column i and column j of TD.
E = TD.transpose().dot(TD)

# Binarize: every positive entry becomes 1.
E[E > 0] = 1

In [None]:
# Text file the generated random walks are written to (one walk per line).
RANDOM_WALKS = os.path.join(DATA_DIR, "random-walks.txt")

# Arguments handed to construct_random_walks.
RESTART_CHANCE = 0.15      # per-step chance of ending a walk that has > 5 vertices
MAX_LEN = 40               # maximum number of steps taken in one walk
NUM_WALKS_PER_VERTEX = 32  # walks started from every vertex
def construct_random_walks(E, n, alpha, l, ofile):
  """Generate random walks over a graph and write them to a text file.

  Each output line is one walk: the space-separated vertex indices visited,
  beginning with the start vertex. If ``ofile`` already exists nothing is
  generated.

  Args:
    E: square 2-D matrix whose row ``i`` has non-zero entries at the columns
      reachable from vertex ``i``. Row indexing must yield a 2-D result
      (e.g. a scipy sparse matrix), because the column indices are taken
      from the second array returned by ``np.nonzero``.
    n: number of walks started from every vertex.
    alpha: once a walk holds more than 5 vertices, the probability of
      ending it before each further step.
    l: maximum number of steps per walk (a walk has at most ``l + 1``
      vertices).
    ofile: path of the output text file; its parent directory is created
      if missing.

  Returns:
    None.
  """
  if os.path.exists(ofile):
    print('random walks already generated....')
    return

  # open(..., 'w') does not create missing parent directories.
  out_dir = os.path.dirname(ofile)
  if out_dir:
    os.makedirs(out_dir, exist_ok=True)

  num_vertices = E.shape[0]
  with open(ofile, 'w') as f:
    for i in range(num_vertices):
      if i % 100 == 0:
        print('{:d} random walks generated from {:d} vertices'.format(n*i, i))
      # Neighbours of the start vertex are the same for all n walks.
      start_targets = np.nonzero(E[i])[1]
      for _ in range(n):
        walk = [i]
        target_nodes = start_targets
        for _ in range(l):
          if np.random.random() < alpha and len(walk) > 5:
            break
          # Dead end: no outgoing edge, so the walk cannot continue.
          if len(target_nodes) == 0:
            break
          curr = np.random.choice(target_nodes)
          walk.append(curr)
          target_nodes = np.nonzero(E[curr])[1]
        f.write('{:s}\n'.format(' '.join([str(x) for x in walk])))
  print('{:d} random walks generated from {:d} vertices, COMPLETE'.format(
      n*num_vertices, num_vertices))

# Generating the walks takes a very long time, so this step is normally
# skipped; the function returns immediately when the walks file already exists.
construct_random_walks(
    E,
    n=NUM_WALKS_PER_VERTEX,
    alpha=RESTART_CHANCE,
    l=MAX_LEN,
    ofile=RANDOM_WALKS,
)

In [None]:
# Path the trained Word2Vec model is saved to.
W2V_MODEL_FILE = os.path.join(DATA_DIR, "w2v-neurips-papers.model")
class Documents:
  """Re-iterable view of a text file as lists of whitespace-separated tokens.

  Every iteration re-opens ``input_file`` and yields one token list per
  line, so the same object can be traversed several times.
  """

  def __init__(self, input_file):
    # Path of the text file to read; one sequence per line.
    self.input_file = input_file

  def __iter__(self):
    """Yield each line's tokens, logging progress every 1000 lines."""
    with open(self.input_file, 'r') as handle:
      line_no = 0
      for raw in handle:
        if line_no % 1000 == 0:
          logging.info('{:d} random walks extracted'.format(line_no))
        tokens = raw.strip().split()
        yield tokens
        line_no += 1

In [3]:
def train_word2vec_model(random_walks_file, model_file):
  """Train a skip-gram Word2Vec model on a walks file and save it.

  Does nothing if ``model_file`` already exists.

  Args:
    random_walks_file: text file with one whitespace-separated sequence per
      line; it is streamed through ``Documents``.
    model_file: path the trained model is saved to; its parent directory is
      created if missing.

  Returns:
    None.
  """
  if os.path.exists(model_file):
    print('Model file {:s} already exists, skipping training ...'.format(model_file))
    return

  # Create the output directory up front so a long training run cannot be
  # lost to a failing save().
  out_dir = os.path.dirname(model_file)
  if out_dir:
    os.makedirs(out_dir, exist_ok=True)

  docs = Documents(random_walks_file)
  # gensim 4.0 renamed the embedding-dimension keyword from `size` to
  # `vector_size`; choose whichever the installed version accepts.
  gensim_major = int(gensim.__version__.split('.')[0])
  dim_kwarg = {'vector_size': 128} if gensim_major >= 4 else {'size': 128}
  # NOTE(review): per the gensim docs the constructor already trains when it
  # is given a corpus, so the train() call below adds further epochs on top
  # of that -- confirm this is intended.
  model = gensim.models.Word2Vec(docs, window=10, sg=1, min_count=2,
                                 workers=4, **dim_kwarg)

  # The keyword is `epochs`; the previous `epoch=50` was not a valid
  # argument, so no epoch count reached train().
  model.train(docs, total_examples=model.corpus_count,
              epochs=50)
  model.save(model_file)

# Train on the generated walks; skipped when the model file already exists.
train_word2vec_model(
    random_walks_file=RANDOM_WALKS,
    model_file=W2V_MODEL_FILE,
)