In [3]:
import os
import numpy as np
import pandas as pd

from url_sequences.sequence_manager import *
from url_sequences.sequence_plotter import *
from url_sequences.sequence_handler import *
from url_sequences.clustering_metrics import *

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

from sklearn.cluster import KMeans
from hdbscan import HDBSCAN

from gensim.models import Word2Vec

('Path of real membership file: ', '/home/chris/workspace/jupyter-notebook/url2vec/../dataset/manual-membership/urlToMembership.txt')


In [49]:
class Url2Vec(object):
    """Builds per-page feature vectors from a vertex file, a URL map and a sequences file.

    Two kinds of vectors are available for every code in ``codecontent_map``:

    * ``word_embedding`` -- Word2Vec vectors trained on the sequences read
      from a sequences file;
    * ``tfidf`` -- TF-IDF (or plain TF) vectors of the content mapped to each
      code, reduced with truncated SVD.

    ``train`` concatenates whichever of the two are requested. Every per-page
    result follows the iteration order of ``self.codecontent_map``.
    """

    # Real Membership default file path

    # doesn't work on ipython because of __file__
    #_rmfile = os.path.abspath(os.path.dirname(__file__)) + "/dataset/manual-membership/urlToMembership.txt"
    # NOTE(review): machine-specific absolute path; pass ``realmembership_file``
    # explicitly when running anywhere else.
    _rmfile = "/home/chris/workspace/jupyter-notebook/url2vec/dataset/manual-membership/urlToMembership.txt"

    def __init__(self, map_file, vertex_file, realmembership_file=_rmfile):
        """Load the content map, the URL map and the real membership labels.

        Parameters
        ----------
        map_file : str
            Path handed to ``get_urlmap``; the result is indexed by page code
            to obtain the URL (presumably a dict code -> URL; verify in
            ``url_sequences``).
        vertex_file : str
            Path handed to ``get_content_map``; the result is iterated to get
            page codes and indexed by code to get the page content.
        realmembership_file : str, optional
            Path handed to ``RealMembership``. Defaults to ``_rmfile``.
        """
        self.codecontent_map = get_content_map(vertex_file)
        self.codeurl_map = get_urlmap(map_file)
        rm = RealMembership(fpath=realmembership_file)
        # One integer label per page, in codecontent_map iteration order.
        self.real_membership_list = [
            int(rm.get_membership(self.codeurl_map[code]))
            for code in self.codecontent_map
        ]

    def word_embedding(self, sequences_file, vecs_length=48):
        """Train Word2Vec on ``sequences_file`` and return one vector per page code.

        Parameters
        ----------
        sequences_file : str
            Path handed to ``get_seq``.
        vecs_length : int, optional
            Dimensionality of the embedding (``size`` argument of Word2Vec).

        Returns
        -------
        numpy.ndarray
            One row per code in ``self.codecontent_map``, in iteration order.
            Looking up a code the model does not know will fail inside
            ``w2v_model[code]``.
        """
        # The file is read twice, once for the vocabulary pass and once for
        # training -- presumably get_seq returns a one-shot iterator; TODO confirm.
        # The meaning of the second argument (1) is defined by get_seq.
        vocab_sequences = get_seq(sequences_file, 1)
        train_sequences = get_seq(sequences_file, 1)

        w2v_model = Word2Vec(min_count=1, negative=5, size=vecs_length)
        w2v_model.build_vocab(vocab_sequences)
        w2v_model.train(train_sequences)
        return np.array([w2v_model[code] for code in self.codecontent_map])

    def tfidf(self, vecs_length=50, tfidf=True):
        """Vectorize the page contents and reduce them with truncated SVD.

        Side effect: (re)sets ``self.pages_content``, ``self.codes`` and
        ``self.longurls``, three lists aligned with ``self.codecontent_map``.

        Parameters
        ----------
        vecs_length : int, optional
            Number of SVD components, i.e. the length of each returned vector.
        tfidf : bool, optional
            Forwarded as ``use_idf`` to ``TfidfVectorizer``; ``False`` disables
            the IDF reweighting.

        Returns
        -------
        numpy.ndarray
            The SVD-transformed matrix, one row per page.
        """
        self.codes = list(self.codecontent_map)
        self.pages_content = [self.codecontent_map[code] for code in self.codes]
        self.longurls = [self.codeurl_map[code] for code in self.codes]

        tfidf_vectorizer = TfidfVectorizer(
            max_df=0.8,
            max_features=200000,
            min_df=0.1,
            stop_words='english',
            use_idf=tfidf,
            tokenizer=tokenize_and_stem,
            ngram_range=(1, 3)
        )
        tfidf_matrix = tfidf_vectorizer.fit_transform(self.pages_content)
        # Fixed random_state keeps the reduction reproducible between runs.
        svd = TruncatedSVD(n_components=vecs_length, algorithm="arpack", random_state=1)
        return svd.fit_transform(tfidf_matrix)

    def train(self, algorithm="kmeans", use_w2v=True, use_tfidf=True, w2v_size=48, tfidf_size=50, sequences_file=""):
        """Build the combined feature vector of every page.

        Parameters
        ----------
        algorithm : str, optional
            Currently unused; kept so existing callers keep working.
        use_w2v : bool, optional
            Include the Word2Vec vectors (requires ``sequences_file``).
        use_tfidf : bool, optional
            Include the reduced TF-IDF vectors.
        w2v_size : int, optional
            Length of the Word2Vec part.
        tfidf_size : int, optional
            Length of the TF-IDF part.
        sequences_file : str, optional
            Path of the sequences file used by ``word_embedding``.

        Returns
        -------
        list of numpy.ndarray
            One 1-D array per page: the Word2Vec part followed by the TF-IDF
            part. A disabled part contributes zero elements.

        Raises
        ------
        ValueError
            If ``use_w2v`` is true and ``sequences_file`` is empty.
        """
        if use_w2v and not sequences_file:
            raise ValueError("sequences_file is required when use_w2v is True")

        n_pages = len(self.codecontent_map)
        # Zero-width placeholder so a disabled part adds nothing when appended.
        empty_array = np.empty((n_pages, 0))

        if use_w2v:
            w2v_vecs = self.word_embedding(sequences_file, vecs_length=w2v_size)
        else:
            w2v_vecs = empty_array

        if use_tfidf:
            tfidf_vecs = self.tfidf(vecs_length=tfidf_size)
        else:
            tfidf_vecs = empty_array

        return [np.append(w2v_vecs[i], tfidf_vecs[i]) for i in range(n_pages)]

    def test(self):
        """Placeholder; does nothing and returns None."""
        pass

In [50]:
# Dataset directory, resolved against the current working directory.
nocostraint_path   = os.getcwd() + "/dataset/new/cs.illinois.eduNoConstraint.words1000.depth.10/"

# Input files inside that directory (the directory string already ends with "/").
vertex_nc_path     = os.path.join(nocostraint_path, "vertex.txt")
map_nc_path        = os.path.join(nocostraint_path, "urlsMap.txt")
sequences_nc       = os.path.join(nocostraint_path, "sequenceIDs.txt")

# Real membership file is left at the class default.
u2v = Url2Vec(map_file=map_nc_path, vertex_file=vertex_nc_path)

In [54]:
# SVD-reduced TF-IDF vectors with the method defaults (vecs_length=50, tfidf=True).
vec = u2v.tfidf()
# Word2Vec vectors trained on the sequences file with the default vecs_length=48.
vec2 = u2v.word_embedding(sequences_nc)

728

In [46]:
# Scratch check: appending a zero-length row to a real row must not change
# the real row's length.
full = np.array([[5, 6], [7, 8]])

# One empty row per row of `full`.
empty = np.array([[] for i in range(full.shape[0])])
np.append(full[0], empty[0]).shape[0]

2