In [8]:
import json

from sklearn.model_selection import StratifiedShuffleSplit

In [17]:
class Dataset():
    """Intent-classification corpus loaded from JSON and pre-split for evaluation.

    The JSON file is expected to hold a top-level ``"sentences"`` list whose
    items each provide a ``"text"`` and an ``"intent"`` entry.  On construction
    the file is read and ``n_splits`` stratified train/test partitions are
    computed and cached in ``self.splits``.
    """

    def __init__(self, dataset_path, n_splits=3, ratio=0.3, random_state=42):
        """Load the corpus at ``dataset_path`` and build the splits.

        Args:
            dataset_path: Path to the JSON corpus file.
            n_splits: Number of shuffled train/test partitions to generate.
            ratio: Test-set size, forwarded as ``test_size`` to
                ``StratifiedShuffleSplit``.
            random_state: Seed for the shuffling; the default of 42 keeps the
                previously hard-coded behaviour.
        """
        self.dataset_path = dataset_path
        self.n_splits = n_splits
        self.ratio = ratio
        self.random_state = random_state
        self.X, self.y = self.load()
        self.splits = self.stratified_split(
            self.X, self.y, self.n_splits, self.ratio, self.random_state
        )

    def load(self):
        """Read the corpus file and return ``(texts, intents)`` as parallel lists.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError: If ``"sentences"``, ``"text"`` or ``"intent"`` is missing.
        """
        # Decode explicitly as UTF-8 so non-ASCII sentences load the same way
        # regardless of the platform's locale encoding.
        with open(self.dataset_path, "r", encoding="utf-8") as f:
            dataset = json.load(f)
        sentences = dataset["sentences"]
        X = [sample["text"] for sample in sentences]
        y = [sample["intent"] for sample in sentences]
        return X, y

    def stratified_split(self, X, y, n_splits=10, test_size=0.2, random_state=42):
        """Create stratified, shuffled train/test partitions of ``X`` and ``y``.

        Args:
            X: Sequence of samples.
            y: Sequence of labels, parallel to ``X``; used for stratification.
            n_splits: Number of partitions to generate.
            test_size: Test-set size passed to ``StratifiedShuffleSplit``.
            random_state: Seed passed to ``StratifiedShuffleSplit``.

        Returns:
            A list with one dict per partition, shaped as
            ``{"train": {"X": [...], "y": [...]}, "test": {"X": [...], "y": [...]}}``.
        """
        splitter = StratifiedShuffleSplit(
            n_splits=n_splits, test_size=test_size, random_state=random_state
        )
        splits = []
        for train_index, test_index in splitter.split(X, y):
            # X and y may be plain lists, so select rows by explicit indexing
            # rather than array-style fancy indexing.
            splits.append({
                "train": {"X": [X[i] for i in train_index],
                          "y": [y[i] for i in train_index]},
                "test": {"X": [X[i] for i in test_index],
                         "y": [y[i] for i in test_index]},
            })
        return splits

    def get_splits(self):
        """Return the list of train/test partitions computed at construction."""
        return self.splits

In [22]:
# Build the AskUbuntu dataset and preview the first two samples and labels
# of the train and test portions of every split.
dataset = Dataset("/home/dash/projects/imli/data/datasets/AskUbuntuCorpus.json")
splits = dataset.get_splits()
for split in splits:
    for label, part, field in (
        ("X train", "train", "X"),
        ("y train", "train", "y"),
        ("X test", "test", "X"),
        ("y test", "test", "y"),
    ):
        print(label, split[part][field][:2])

X train ['Are there any hardware diagnostic tools?', 'Is there a tool like wifi analyzer for ubuntu?', 'Cannot install printer driver epson l210', 'How to add a network printer on Ubuntu 15.10?', 'What IDEs are available for Ubuntu?']
y train ['Software Recommendation', 'Software Recommendation', 'Setup Printer', 'Setup Printer', 'Software Recommendation']
X test ['How can I shutdown the computer when a certain process ends?', 'What are some good PHP editors?', 'How to shutdown computer when users are logged on?', 'Do I need to reinstall everything to upgrade from 13.10 to 14.10?', 'upgrading to Ubuntu 13.10 from Ubuntu 13.04']
y test ['Shutdown Computer', 'Software Recommendation', 'Shutdown Computer', 'Make Update', 'Make Update']
X train ["What does my computer do when I click 'Shut Down'?", 'Torrent client for the command-line?', 'Is there software that can view .dwg files?', 'How to install a Lexmark z600 series printer?', 'Problem upgrading Ubuntu 9.10']
y train ['Shutdown Comput