# Text Classification

### AutoKeras

CodeFest, May 2019

In [1]:
import os
import re
import boto3
import urllib
import zipfile
import tarfile
import numpy as np
import pandas as pd
import tensorflow as tf

from keras.datasets import imdb
from autokeras.utils import read_tsv_file
from autokeras.text.text_supervised import TextClassifier

np.random.seed(21)

Using TensorFlow backend.


/Users/bmcmahon/.pytorch_pretrained_bert
Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [None]:
def download_url_to_filepath(outpath, url):
    """Download ``url`` to ``outpath`` unless the file is already there.

    Args
    :outpath: destination file path; missing parent directories are created
    :url: address of the resource to fetch
    Returns the destination path (``outpath``)
    """
    # A bare ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    out_dir = os.path.dirname(outpath)
    # ``out_dir`` is empty when ``outpath`` is a bare file name; there is
    # no directory to create in that case.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # An existing file is treated as a cached download and left untouched.
    if not os.path.exists(outpath):
        urllib.request.urlretrieve(url, outpath)
    return outpath

def unzip_file(in_path, out_dir):
    """Extract a ``zip`` or ``tar.gz`` archive into ``out_dir``.

    Extraction only happens when ``out_dir`` does not exist yet; an existing
    directory is treated as an already-extracted archive. A path with any
    other ending is not extracted (``out_dir`` is still created).

    Args
    :in_path: path to the archive
    :out_dir: directory to extract into
    Returns ``out_dir``
    """
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
        if in_path.endswith("zip"):
            with zipfile.ZipFile(in_path, 'r') as archive:
                archive.extractall(out_dir)
        elif in_path.endswith("tar.gz"):
            # NOTE: extractall trusts the member paths stored in the archive;
            # only use this on archives from a trusted source.
            with tarfile.open(in_path) as archive:
                archive.extractall(path=out_dir)
    return out_dir

def load_data(path):
    """Fetch the Keras IMDB dataset and hand back its train and test splits.

    Per the Keras dataset description, reviews are encoded as integers ranked
    by word frequency (1 being the most frequent).

    Args
    :path: path/to/keras/dataset
    Returns ``(x_train, y_train), (x_test, y_test)``
    """
    train_split, test_split = imdb.load_data(path=path)
    train_x, train_y = train_split
    test_x, test_y = test_split
    return (train_x, train_y), (test_x, test_y)

def convert_labels_to_one_hot(labels, num_labels):
    """Turn a sequence of integer class labels into a one-hot matrix.

    Args
    :labels: integer class index for each sample
    :num_labels: number of classes (width of the output)
    Returns an array of shape ``(len(labels), num_labels)`` holding 1 in each
    row's label column and 0 everywhere else
    """
    n_samples = len(labels)
    encoded = np.zeros((n_samples, num_labels))
    row_index = np.arange(n_samples)
    # Pair every row with its label column and switch that cell on.
    encoded[row_index, labels] = 1
    return encoded

def convert_int_to_word(integer):
    """Return the first column of row ``integer`` in the module-level ``vocab`` table."""
    vocab_row = vocab.values[integer]
    return vocab_row[0]

def convert_int_to_str_array(array):
    """Decode each integer sequence in ``array`` into one space-separated string.

    Every integer is mapped through ``convert_int_to_word``; the result is a
    NumPy array with one decoded string per input sequence.
    """
    sentences = []
    for sequence in array:
        words = [str(convert_int_to_word(token)) for token in sequence]
        sentences.append(" ".join(words))
    return np.array(sentences)

### Load Data

Vocabulary and raw reviews from [Stanford](http://ai.stanford.edu/~amaas/data/sentiment/)

Preprocessed data from [Keras](https://keras.io/datasets/)

In [None]:
# Download the Stanford IMDB archive to /tmp (skipped when the file already
# exists), extract it, and load the vocabulary file into a DataFrame.
fp = download_url_to_filepath("/tmp/imdb.tar.gz","http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz")
fd = unzip_file(fp,"/tmp/imdb")
# NOTE(review): read_csv infers a header from the first line by default; if
# imdb.vocab has no header row, its first token becomes the column name and
# every row index shifts by one — confirm and pass header=None if so.
vocab = pd.read_csv(os.path.join(fd,"aclImdb/imdb.vocab"))

In [None]:
# Sanity check: print the vocabulary table's dimensions and show its first rows.
print(vocab.shape)
vocab.head()

In [None]:
# List the file names in the negative training-review directory.
os.listdir(os.path.join(fd,"aclImdb/train/neg/"))

In [None]:
# Exploratory peek at one raw review through read_csv, transposed for display.
# NOTE(review): the file is free text, not CSV, so the default comma splitting
# will break the review apart — treat this as a rough view only.
pd.read_csv(os.path.join(fd,"aclImdb/train/neg/1821_4.txt")).T

In [None]:
# Read every line of one raw review; the `with` block closes the file handle
# as soon as the lines have been read.
with open(os.path.join(fd,"aclImdb/train/neg/1821_4.txt"),"r") as review_file:
    File_object = review_file.readlines()

In [None]:
# Display the list of lines read from the sample review file.
File_object

In [None]:
def prep_dataframe(in_dir, label):
    """Build a labelled DataFrame from a directory of review text files.

    Only the first line of each file is kept as the review text.

    Args
    :in_dir: directory containing the review files
    :label: value written to the ``sentiment`` column of every row
    Returns a DataFrame with columns ``review`` and ``sentiment``
    """
    reviews = []
    # Sort so the row order does not depend on the arbitrary order in which
    # os.listdir returns entries.
    for fil in sorted(os.listdir(in_dir)):
        # The context manager closes each handle instead of leaking it, and
        # the explicit encoding avoids depending on the platform default.
        with open(os.path.join(in_dir, fil), 'r', encoding="utf-8") as handle:
            # readline() yields "" for an empty file rather than raising the
            # IndexError that readlines()[0] would.
            reviews.append(handle.readline())
    df = pd.DataFrame(reviews, columns=["review"])
    df['sentiment'] = label
    return df

def prep_sample(in_dir, sample_type):
    """Assemble a shuffled review/label sample from the extracted dataset.

    Args
    :in_dir: directory that contains the ``aclImdb`` folder
    :sample_type: sub-folder of ``aclImdb`` to read (each holds ``neg``/``pos``)
    Returns ``(X, Y)``: arrays of review texts and of sentiment labels
    (0 for ``neg``, 1 for ``pos``), in matching shuffled order
    """
    negative_dir = os.path.join(in_dir, f"aclImdb/{sample_type}/neg/")
    positive_dir = os.path.join(in_dir, f"aclImdb/{sample_type}/pos/")
    frames = [prep_dataframe(negative_dir, 0), prep_dataframe(positive_dir, 1)]
    # frac=1 draws every row, i.e. a full shuffle of the combined frame.
    shuffled = pd.concat(frames).sample(frac=1)
    return np.array(shuffled['review']), np.array(shuffled['sentiment'])

In [None]:
# Build shuffled (reviews, labels) arrays from the extracted train and test folders.
x_train, y_train = prep_sample(fd,'train')
x_test, y_test = prep_sample(fd,'test')

In [None]:
# neg = prep_dataframe(os.path.join(fd,"aclImdb/train/neg/"),0)
# pos = prep_dataframe(os.path.join(fd,"aclImdb/train/pos/"),1)
# df = pd.concat([neg,pos])
# df = df.sample(frac=1)
# x_train = np.array(df['review'])
# y_train = np.array(df['sentiment'])

### Load Preprocessed Keras Data

In [None]:
# (x_train, y_train), (x_test, y_test) = load_data(path="imdb.npz")
# Report, for each split: number of samples, length of the first sample, and
# number of labels. With the Keras load above commented out, the samples are
# the raw review strings from prep_sample, so the second figure is a
# character count.
print(f"X,Y Train: {len(x_train),len(x_train[0])},{len(y_train)}")
print(f"X,Y Test: {len(x_test),len(x_test[0])},{len(y_test)}")

In [None]:
# x_train, y_train = (x_train, y_train)
# x_test, y_test = (x_test, y_test)

In [None]:
# x_train = convert_int_to_str_array(x_train)
# x_test = convert_int_to_str_array(x_test)

In [None]:
# Replace the 0/1 sentiment labels with two-column one-hot rows.
# Re-running this cell on already-converted labels will fail or give wrong
# results, since it overwrites y_train / y_test in place.
y_train = convert_labels_to_one_hot(y_train, num_labels=2)
y_test = convert_labels_to_one_hot(y_test, num_labels=2)

In [None]:
# x_train[:2]

In [None]:
# Fit the AutoKeras text classifier on the training split and report its
# score on the test split.
clf = TextClassifier(verbose=True)
# time_limit evaluates to 43200 — presumably seconds (12 hours); confirm
# against the AutoKeras version in use.
clf.fit(x=x_train, y=y_train, time_limit=12 * 60 * 60)
# The evaluate() result is multiplied by 100 and printed as a percentage.
print("Classification accuracy is : ", 100 * clf.evaluate(x_test, y_test), "%")

In [4]:
def download_s3_dir(bucket, dir_key, file_list, out_dir):
    """Download the listed files from an S3 key prefix into a local directory.

    Files already present in ``out_dir`` are skipped, so a directory left
    incomplete by an interrupted run is filled in on the next call, and a
    complete directory causes no S3 access at all.

    Args
    :bucket: str s3 bucket
    :dir_key: str key prefix of the directory; each file name is appended to
        it directly, so it should end with "/"
    :file_list: list files in directory to download
    :out_dir: path/to/output (created if missing)
    """
    os.makedirs(out_dir, exist_ok=True)
    missing = [fil for fil in file_list
               if not os.path.exists(os.path.join(out_dir, fil))]
    if not missing:
        return
    s3_bucket = boto3.resource("s3").Bucket(bucket)
    for fil in missing:
        print(f"Downloading {fil}")
        # os.path.join makes the local path correct whether or not out_dir
        # ends with a path separator.
        s3_bucket.download_file(dir_key + fil, os.path.join(out_dir, fil))
    print(f"Files saved to {out_dir}")

In [5]:
# Fetch the two listed files from S3 into /tmp/.pytorch_pretrained_bert/.
# The key prefix ends with "/" because download_s3_dir appends each file
# name to it directly.
file_list = ["mbbu.pth","vbbu.txt"]
download_s3_dir("nucleus-chc-preprod-datasciences",
                "users/bmcmahon/nas/pytorch_pretrained_bert/",
                file_list,
                os.path.join("/tmp",
                             ".pytorch_pretrained_bert/"))

In [None]:
# import subprocess

# op = os.path.join(os.path.expanduser("~"),".pytorch_pretrained_bert/")
# subprocess.run(["aws","s3","cp","s3://nucleus-chc-preprod-datasciences/users/bmcmahon/nas/pytorch_pretrained_bert",op,"--recursive"])

In [None]:
# download_s3_path("s3://nucleus-chc-preprod-datasciences/users/bmcmahon/nas/pytorch_pretrained_bert",
#                  os.path.join(os.path.expanduser("~"),".pytorch_pretrained_bert/"))

In [None]:
if not os.path.exists(os.path.join(os.path.expanduser("~"),".pytorch_pretrained_bert/")):
    # TODO: the body of this check was never written; `pass` keeps the cell
    # syntactically valid until the intended action is filled in.
    pass

In [None]:
def get_directory(bucket_name, directory_path, download_path, exclude_file_names):
    """Download every object under an S3 key prefix into a local directory.

    Credentials come from boto3's default configuration, as in the other S3
    helpers in this notebook.

    Args
    :bucket_name: str s3 bucket
    :directory_path: str key prefix; every object whose key starts with it
        is considered
    :download_path: local directory to write into (created if missing)
    :exclude_file_names: collection of full object keys to skip
    """
    bucket = boto3.resource('s3').Bucket(bucket_name)
    os.makedirs(download_path, exist_ok=True)

    # objects.filter pages through the listing and simply yields nothing for
    # an empty prefix.
    for s3_object in bucket.objects.filter(Prefix=directory_path):
        s3_key = s3_object.key
        file_name = s3_key.split('/')[-1]
        # Keys ending in "/" are folder placeholders with no file name.
        if s3_key in exclude_file_names or not file_name:
            continue
        bucket.download_file(s3_key, os.path.join(download_path, file_name))

In [None]:
import os
import boto3

# Initiate the S3 resource.
s3 = boto3.resource('s3')

# Select the bucket. The name matches the one passed to download_s3_dir
# ("preprod"); the earlier spelling "prepod" pointed at a different bucket.
my_bucket = s3.Bucket('nucleus-chc-preprod-datasciences')

# Download every object in the bucket into the current directory.
for s3_object in my_bucket.objects.all():
    # download_file needs a local file name, so keep only the last path
    # component of the key.
    filename = os.path.basename(s3_object.key)
    # Keys ending in "/" are folder placeholders with no file name to write.
    if not filename:
        continue
    my_bucket.download_file(s3_object.key, filename)