In [5]:
from __future__ import print_function

from collections import Counter
import itertools
import numpy as np
import tensorflow as tf
import re
import os
from keras.utils.np_utils import to_categorical

In [12]:
def load_data_and_labels_from_many_files(data_folder, data_files):
    """
    Loads sentences from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    print("Loading data...")
    x_text = []
    y = []

    for i, data_file in enumerate(data_files):

        sentences = list(open(data_folder + "/" + data_file, "r").readlines())
        sentences = [s.strip() for s in sentences]
        # Split by words
        # sentences = [clean_str(s) for s in sentences]
        sentences = [s.split() for s in sentences]
        x_text += sentences
        # Labels as numbers
        labels = [i for s in sentences]
        y += labels

    # Generate one-hot labels
    y = to_categorical(y, num_classes=len(data_files))

    return x_text, y

def load_data_and_labels_from_one_file(data_folder, data_file):
    """
    Loads sentences from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    print("Loading data...")
    x_text = []
    y = []

    lines = list(open(data_folder + "/" + data_file, "r").readlines())
    print(lines)
    print(l.split("\t") for l in lines)
    sentences, labels = [l.split("\t") for l in lines]
    print(sentences)
    print(labels)
    # Split by words
    # sentences = [clean_str(s) for s in sentences]
    sentences = [s.strip() for s in sentences]
    x_text += sentences
    # Labels as numbers
    y += labels

    # Generate one-hot labels
    y = to_categorical(y, num_classes=len(set(y)))

    return x_text, y


def pad_sentences(sentences, padding_word=""):
    """
    Pads all sentences to be the length of the longest sentence.
    Returns padded sentences.
    """
    print("Padding sentences...")
    sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(new_sentence)
        
    return padded_sentences


def build_vocab(sentences):
    """
    Builds a vocabulary mapping from token to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    print("Building word vocabulary...")
    word_counts = Counter(itertools.chain(*sentences))
    
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    
    return vocabulary, vocabulary_inv


def build_input_data(sentences, labels, vocabulary):
    """
    Maps sentences and labels to vectors based on a vocabulary.
    """
    print("Converting to ids...")
    x = np.array([
            [vocabulary[word] for word in sentence]
            for sentence in sentences])
    y = np.array(labels)
    
    return x, y


In [13]:
data_folder = "../data/vardial2017-sample"
data_files = ["EGY", "GLF", "LAV", "MSA", "NOR"]
sentences, labels = load_data_and_labels_from_many_files(data_folder, data_files)
print(len(sentences))
print(sentences[0])

Loading data...
50
['tthdm', 'AlmsAjd', 'fy', "synA'", 'wAl>bAt$y', 'sxryp', 'mn', 'Alhjrp', 'Alnbwyp', 'fy', 'Aljrydp', 'Alrsmyp', 'lmA', 'mAdty', 'Altrbyp', 'Al<slAmyp', 'mn', 'AlmdArs']


In [14]:
data_folder = "../data/vardial2018-sample"
train_file = "train.words"
dev_file = "dev.words"
# Step 1: Read in data
sentences_train, labels_train = load_data_and_labels_from_one_file(data_folder, train_file)
sentences_dev, labels_dev = load_data_and_labels_from_one_file(data_folder, train_file)
sentences = sentences_train + sentences_dev
labels = labels_train + labels_dev
print(len(sentences))
print(sentences[0])

Loading data...
['AlkAmyrwn AlkAmlp\tGLF\n', '>ryd >n >sjl b>n AljmyE wAfq ElY h*A AlqrAr lmSlHp AlErAq\tMSA\n', "mA fy$ mA fy$ kfAyp yEny kyf mmkn yEny Alywm >nt $Ayf mwqf AlqDAp wmjls AlqDA' Al>ElY mvlA lA nwAfq ElY Al<$rAf nAdy AlqDAp lw mwqf mtHfZ hnAk <$kAlyp Hwl h*A Aldwr wHwl mwqf AlqDAp Emlyp AlAstftA' ElY Aldstwr\tGLF\n", 'tnTlq vAnwyp\tGLF\n', 'mA fy >y wAHd mn wSlwA <lY mrHlp AltElym yEny >kvr w<HnA nwASl AlmrHlp Al<EdAdyp\tGLF\n', 'k$f llm$Akl Aldynyp wAlvqAfyp kAml kAn yEy$ AltDArbAt yEny mn nAHyp mn nAHyp mn nAHyp AlxdmAt mn nAHyp HyAthA\tNOR\n', 'dktwr Ez Aldyn fy AlqAhrp br>yk mA Al*y syxtlf bynmA kAn Elyh AltEAwn >w AltEATy Al>myrky AlmSry fy Ehd mbArk wmA hw Elyh Al|n mA Al*y sybqY wmA Al*y syxtlf\tMSA\n', '<n h*A TbEA\tGLF\n', "fy AlmTAlb Alm$rwEp bAlnsbp lkm lkn rAEwA <n fy nAs brDh m$ EAyz >qwl bywthA ElY w$k >nhA tqf Alty qAlt twqf Aln$AT AlryADy lkn >qwl lhm rEb Al$hdA' m$ HnnsY Ally HSl m$ HnnsY swA' lEybp swA' mjls <dArp swA' jmhwr swA' wkl HAjp lkn ll>ltrAs lk

ValueError: too many values to unpack (expected 2)