In [73]:
import json
import spacy

import numpy as np

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer

# Module-level cache slots for the loaded spaCy pipelines (one per language).
# Both start empty and are filled on first use by get_spacy_model().
nlp_spacy_en = nlp_spacy_es = None

def get_spacy_model(lang="en"):
    """Return the spaCy pipeline for ``lang``, loading it on first use.

    The loaded pipeline is stored in the module-level ``nlp_spacy_en`` /
    ``nlp_spacy_es`` globals so that repeated calls reuse the same object
    instead of calling ``spacy.load`` again.

    Args:
        lang: Language code; only ``"en"`` and ``"es"`` are supported.

    Returns:
        The object returned by ``spacy.load(lang)`` for that language.

    Raises:
        ValueError: If ``lang`` is neither ``"en"`` nor ``"es"``. (Previously
            the function fell through and returned ``None``, which made the
            caller fail later with an unrelated error.)
    """
    global nlp_spacy_en
    global nlp_spacy_es
    # NOTE(review): loading by bare language code relies on how the installed
    # spaCy version resolves model names -- confirm against the environment.
    if lang == "en":
        if nlp_spacy_en is None:
            nlp_spacy_en = spacy.load(lang)
        return nlp_spacy_en
    if lang == "es":
        if nlp_spacy_es is None:
            nlp_spacy_es = spacy.load(lang)
        return nlp_spacy_es
    raise ValueError(
        "Unsupported language {!r}; expected 'en' or 'es'".format(lang)
    )

In [74]:
class Dataset:
    """Intent-classification corpus loaded from JSON, with stratified splits.

    The JSON file is expected to contain a top-level ``"sentences"`` list whose
    items each carry a ``"text"`` and an ``"intent"`` key. On construction the
    file is read and ``n_splits`` stratified train/test splits are computed.

    Args:
        dataset_path: Path of the JSON corpus file.
        n_splits: Number of shuffled train/test splits to generate.
        ratio: Test-set size, forwarded as ``test_size`` to the splitter.
        augment: Stored on the instance; not used by any code in this class yet.
    """

    def __init__(self, dataset_path, n_splits=3, ratio=0.3, augment=False):
        self.dataset_path = dataset_path
        self.augment = augment
        self.n_splits = n_splits
        self.ratio = ratio
        self.X, self.y = self.load()
        self.splits = self.stratified_split(self.X, self.y, self.n_splits, self.ratio)

    def load(self):
        """Read the corpus file and return ``(texts, intents)`` as two lists."""
        # Decode explicitly as UTF-8 rather than with the platform's locale
        # default, so non-ASCII sentences load the same way everywhere.
        with open(self.dataset_path, "r", encoding="utf-8") as f:
            dataset = json.load(f)
        sentences = dataset["sentences"]
        X = [sample["text"] for sample in sentences]
        y = [sample["intent"] for sample in sentences]
        return X, y

    def stratified_split(self, X, y, n_splits=10, test_size=0.2):
        """Build ``n_splits`` stratified shuffle splits of ``X`` / ``y``.

        Args:
            X: List of samples.
            y: List of labels, aligned with ``X``; used for stratification.
            n_splits: Number of splits to generate.
            test_size: Forwarded to ``StratifiedShuffleSplit``.

        Returns:
            A list of dicts of the form
            ``{"train": {"X": [...], "y": [...]}, "test": {"X": [...], "y": [...]}}``.
        """
        # Fixed random_state keeps the splits reproducible across runs.
        splitter = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size,
                                          random_state=42)
        splits = []
        for train_index, test_index in splitter.split(X, y):
            X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
            y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]
            # add augmentation code here
            splits.append({"train": {"X": X_train, "y": y_train},
                           "test": {"X": X_test, "y": y_test}})
        return splits

    def get_splits(self):
        """Return the list of splits computed at construction time."""
        return self.splits

In [75]:
dataset = Dataset("/home/dash/projects/imli/data/datasets/AskUbuntuCorpus.json")
splits = dataset.get_splits()
# Sanity check: show the first two samples and labels of each split.
for split in splits:
    train_part, test_part = split["train"], split["test"]
    print("X train", train_part["X"][:2])
    print("y train", train_part["y"][:2])
    print("X test", test_part["X"][:2])
    print("y test", test_part["y"][:2])

X train ['Are there any hardware diagnostic tools?', 'Is there a tool like wifi analyzer for ubuntu?']
y train ['Software Recommendation', 'Software Recommendation']
X test ['How can I shutdown the computer when a certain process ends?', 'What are some good PHP editors?']
y test ['Shutdown Computer', 'Software Recommendation']
X train ["What does my computer do when I click 'Shut Down'?", 'Torrent client for the command-line?']
y train ['Shutdown Computer', 'None']
X test ['Cannot setup HP All in one DJ3630', 'How to record my screen?']
y test ['Setup Printer', 'None']
X train ['How is rm command different from the delete button?', 'Cannot install printer driver epson l210']
y train ['None', 'Setup Printer']
X test ['How to partially upgrade Ubuntu 11.10 from Ubuntu 11.04?', 'Is there any program for fuzzy string matching which provides a match score?']
y test ['Make Update', 'Software Recommendation']


In [76]:
def find_ngrams(input_list, n):
    """Return an iterator over the consecutive ``n``-item tuples of ``input_list``.

    Built by zipping ``n`` copies of the list, each shifted one position
    further; zip stops at the shortest copy, so only complete n-grams appear.
    """
    shifted_views = (input_list[offset:] for offset in range(n))
    return zip(*shifted_views)

def semhash_tokenizer(text):
    """Split ``text`` on single spaces and return character trigrams per word.

    Each word is wrapped as ``#word#`` before the trigrams are extracted, so
    the markers show where a word begins and ends. Trigrams of all words are
    returned in one flat list, in order.
    """
    trigrams = []
    for word in text.split(" "):
        wrapped = "#{}#".format(word)
        trigrams.extend(''.join(chars) for chars in find_ngrams(list(wrapped), 3))
    return trigrams

class SemhashFeaturizer:
    """TF-IDF featurizer over the character trigrams from ``semhash_tokenizer``."""

    def __init__(self):
        self.vectorizer = self.get_vectorizer()

    def get_vectorizer(self):
        """Create the ``TfidfVectorizer`` used by this featurizer.

        ``min_df=1`` keeps every term seen in at least one document. That is
        the same vocabulary the previous integer ``min_df=0`` selected, but
        newer scikit-learn releases reject an integer ``min_df`` below 1.
        """
        return TfidfVectorizer(norm='l2', min_df=1, use_idf=True, smooth_idf=False,
                               sublinear_tf=True, tokenizer=semhash_tokenizer)

    def fit(self, X, *args, **kwargs):
        """Fit the vectorizer on the texts in ``X``; extra arguments are ignored.

        Returns:
            ``self``, so that calls can be chained.
        """
        self.vectorizer.fit(X)
        return self

    def transform(self, X):
        """Return the TF-IDF features of the texts in ``X`` as a dense array."""
        return self.vectorizer.transform(X).toarray()

In [77]:
# Smoke test: fit the semhash featurizer on two toy sentences and print the
# resulting feature matrix.
X = ["hello", "I am a boy"]
y = ["A", "B"]

semhash_featurizer = SemhashFeaturizer()
semhash_featurizer.fit(X, y)
X_ = semhash_featurizer.transform(X)
print(X_)

[[0.         0.         0.         0.4472136  0.         0.
  0.         0.4472136  0.4472136  0.4472136  0.4472136  0.        ]
 [0.37796447 0.37796447 0.37796447 0.         0.37796447 0.37796447
  0.37796447 0.         0.         0.         0.         0.37796447]]


In [80]:
class W2VFeaturizer:
    """Featurizer that maps each sentence to its spaCy document vector."""

    def __init__(self, lang):
        # Language code handed to get_spacy_model() when transforming.
        self.lang = lang

    def fit(self, X, *args, **kwargs):
        """No-op: there is nothing to fit; all arguments are ignored."""
        return None

    def transform(self, x):
        """Return an array holding one ``.vector`` per sentence in ``x``."""
        vectors = []
        for sentence in x:
            nlp = get_spacy_model(self.lang)
            vectors.append(nlp(sentence).vector)
        return np.array(vectors)

In [82]:
# Smoke test: embed two toy sentences with the spaCy-based featurizer and
# print the shape of the resulting matrix.
X = ["hello", "I am a boy"]
y = ["A", "B"]
glove_path = ""  # not referenced anywhere in this cell
w2v_featurizer = W2VFeaturizer("en")
w2v_featurizer.fit(X, y)
X_ = w2v_featurizer.transform(X)
print(X_.shape)

(2, 384)
