In [18]:
import csv
import operator
import os, math, sys, re
import pickle
import random
import string
import time
from multiprocessing import Pool, cpu_count

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.dicts.emoticons import emoticons
from torch.autograd import Variable
from torch.nn import functional as F
from torchtext import data
from torchtext import datasets
from torchtext import vocab

# Import model and model helper functions
sys.path.append("..")
import src.fasttext as ft
import src.fasttext_utils as ftu

# Directory (relative to the notebook) that the model and data files are read from.
data_dir = '../data'
# Use the first CUDA device when PyTorch reports one, otherwise run on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [19]:
# Load the previously saved model and the pickled TEXT object whose
# `vocab.stoi` mapping the prediction helpers use to turn tokens into ids
# (presumably the torchtext field used at training time -- confirm).
#
# map_location remaps the checkpoint's tensors onto `device`, so a model that
# was saved from a GPU session still loads on a CPU-only machine and ends up
# on the same device the input tensors are moved to.
#
# NOTE(security): both torch.load and pickle.load unpickle arbitrary Python
# objects and can execute code while doing so; only load files you trust.
model = torch.load(
    os.path.join(data_dir, 'model/NN_fasttext_model.pt.bak'),
    map_location=device,
)
model.eval()  # inference mode (disables dropout / batch-norm updates)
with open(os.path.join(data_dir, 'model/NN_fasttext_data.pkl'), 'rb') as field_file:
    TEXT = pickle.load(field_file)

In [31]:
# Text-cleaning pipeline used both for single-sentence prediction and for the
# bulk preprocessing of the shuffled dataset.
ekphrasis_processor = TextPreProcessor(
    # --- tokenization ---
    tokenizer=ftu.reg_tokenize,  # project tokenizer applied to the cleaned text
    dicts=[emoticons],  # token-replacement dictionaries (here: the emoticon table)
    # --- normalization ---
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'date', 'number'],  # term categories to normalize
    fix_html=True,  # repair HTML tokens
    # --- segmentation and spelling ---
    segmenter="english",  # corpus statistics used for word segmentation
    corrector="english",  # corpus statistics used for spell correction
    unpack_hashtags=True,  # split hashtags into their component words
    unpack_contractions=True,  # expand contractions
    spell_correct_elong=False,  # no spell correction of elongated words
)

def _predict_label(net, tokens):
    """Run ``net`` on one already-tokenized example and return the rounded score.

    Each token is mapped to an id through ``TEXT.vocab.stoi``; the ids are
    packed into a ``(len(tokens), 1)`` LongTensor on ``device`` (presumably
    sequence-length x batch-of-one -- confirm against the model's forward).

    Args:
        net: The model to evaluate; it is called with the id tensor.
        tokens: Iterable of token strings.

    Returns:
        float: ``round(sigmoid(net(ids)))`` as a Python number (0.0 or 1.0).
    """
    net.eval()
    token_ids = [TEXT.vocab.stoi[tok] for tok in tokens]
    ids = torch.LongTensor(token_ids).to(device).unsqueeze(1)
    # Inference only: skip autograd bookkeeping so no graph is built or kept.
    with torch.no_grad():
        prediction = torch.round(torch.sigmoid(net(ids)))
    return prediction.item()


def predict_from_sentence(model, sentence):
    """Classify a raw sentence with ``model``.

    The sentence is cleaned and tokenized by ``ekphrasis_processor``, passed
    through ``ftu.generate_bigrams``, and then scored.

    Args:
        model: The model to evaluate.
        sentence: Raw input text.

    Returns:
        float: The rounded sigmoid output (0.0 or 1.0).
    """
    tokenized = ftu.generate_bigrams(list(ekphrasis_processor.pre_process_doc(sentence)))
    return _predict_label(model, tokenized)


def predict_from_batch(sentence):
    """Classify an already-tokenized sentence with the notebook-level ``model``.

    Unlike ``predict_from_sentence`` this performs no preprocessing and no
    bigram generation, and it reads the global ``model`` rather than taking
    one as an argument.

    Args:
        sentence: Iterable of token strings.

    Returns:
        float: The rounded sigmoid output (0.0 or 1.0).
    """
    return _predict_label(model, sentence)

Reading english - 1grams ...
Reading english - 2grams ...
Reading english - 1grams ...


In [38]:
# Quick check of the loaded model on a single one-word input.
predict_from_sentence(model, 'Help')

0.0

In [None]:
def process_applied(start_row, chunksize=100000):
    """Preprocess one slice of the shuffled dataset and write it to its own CSV.

    Reads ``chunksize`` rows of ``main_data_shuffled.csv`` starting after the
    first ``start_row`` lines, runs ``ekphrasis_processor.pre_process_doc`` on
    every ``body`` value, and writes the result to
    ``split/<NNNN>_preprocessed_chunk.csv`` under ``data_dir``, where ``NNNN``
    is ``start_row // chunksize`` zero-padded to four digits.

    The ``index`` column is used as the DataFrame index on read and is not
    written back (``index=False``), so the output has the columns
    id, score, body, label and no header row.

    Args:
        start_row: Number of leading lines of the input file to skip.
        chunksize: Number of rows to read and process. Defaults to 100000.
    """
    # Floor division keeps the chunk id an exact integer (no float round-trip).
    chunk_id = start_row // chunksize
    shuffled_data = os.path.join(data_dir, "main_data_shuffled.csv")
    split_dir = os.path.join(data_dir, "split")
    # to_csv does not create missing directories.
    os.makedirs(split_dir, exist_ok=True)

    chunk = pd.read_csv(
        shuffled_data, index_col=0, skiprows=start_row, nrows=chunksize,
        names=["index", "id", "score", "body", "label"],
        dtype={"index": np.int64, "id": str, "score": str, "body": str, "label": int},
    )
    chunk["body"] = chunk.body.map(ekphrasis_processor.pre_process_doc)
    chunk.to_csv(
        os.path.join(split_dir, str(chunk_id).zfill(4) + "_preprocessed_chunk.csv"),
        quoting=csv.QUOTE_NONNUMERIC,
        header=False, index=False
    )

# Fan the preprocessing out over worker processes, skipping every chunk whose
# output file already exists so an interrupted run can be resumed.
train_len = 229603257  # upper bound for chunk start rows (presumably the dataset's row count)
chunk_rows = 100000  # must match the chunk size used by process_applied
split_dir = os.path.join(data_dir, 'split')
os.makedirs(split_dir, exist_ok=True)  # listdir fails on a missing directory

# Output files are named "<NNNN>_preprocessed_chunk.csv"; ignore anything else
# in the directory (e.g. hidden or temporary files) instead of crashing on int().
completed = [
    int(name[0:4]) * chunk_rows
    for name in os.listdir(split_dir)
    if name.endswith("_preprocessed_chunk.csv") and name[0:4].isdigit()
]
all_jobs = list(range(0, train_len, chunk_rows))
jobs = sorted(set(all_jobs) - set(completed))
print(jobs)

# Leave one core free, but never ask for fewer than one worker: Pool(0) raises.
# The context manager shuts the workers down once map() has returned.
with Pool(max(1, cpu_count() - 1)) as pool:
    pool.map(process_applied, jobs)