In [28]:
%matplotlib inline
import pandas as pd
import io
import boto3
from boto3.dynamodb.conditions import Key
import os
from google_images_download import google_images_download
import glob
import PIL
import json
import numpy as np

import pickle

import textacy
import en_core_web_sm

from IPython.display import SVG

from IPython.display import Image
 

# Create Training data for text to image model

## Inputs

In [5]:
tblName = "ResourceDocuments"
nodeIdentifierName = "photosynthesis-photosynthesis-photosynthesis-biology"

txtToImage_data_dir = 'data/photosynthesis'
imageLog_fir='logs'

resourceDbName = 'dynamodb'
s3Bucket = "egm-bucket/TEXT_TO_IMAGE_DATA/data"

### Get Data from Resource Db: 
photosynthesis whole

In [6]:
# Get Definitions for photosynthesis from dynamodb

## Connect to dynamodb
dynamodbClient = boto3.resource("dynamodb")
# client = boto3.client('dynamodb')
# display(client.describe_table(TableName=tblName))

## Connect to table with resources
resourceTbl = dynamodbClient.Table(tblName)
# display(resourceTbl.global_secondary_indexes)
display("NUmber of Items in ResourceDb: {}".format(resourceTbl.item_count))

'NUmber of Items in ResourceDb: 14'

### Load text into pandas 
- For data munging
    - stats
    - Duplicates

In [7]:
response = resourceTbl.query(
    IndexName='NODE_IDENTIFIER-index',
    KeyConditionExpression=Key('NODE_IDENTIFIER').eq(nodeIdentifierName)
)

# Pass through pandas for some data munging
rsrc_df = pd.DataFrame(response["Items"])
print("Db Response Shape: {}".format(rsrc_df.shape))

rsrc_df.drop_duplicates(['RESOURCE'], keep='last', inplace=True)
rsrc_df.reset_index(drop=True)

print("Db Response Shape: {}".format(rsrc_df.shape))
print(rsrc_df.columns)

Db Response Shape: (14, 13)
Db Response Shape: (7, 13)
Index(['IMAGES', 'NODE_IDENTIFIER', 'POS', 'RESOURCE', 'RESOURCE_ATTRIBUTION',
       'RESOURCE_DATATYPE', 'RESOURCE_SOURCE', 'RESOURCE_TYPE', 'RESOURCE_URL',
       'TERM', 'TIME_DOWNLOADED', 'TOPIC', 'UNIQUE_IDENTIFIER'],
      dtype='object')


In [8]:
rsrc_df["RESOURCE_SOURCE"]

4        century
5        wordnet
9     wiktionary
10     wikipedia
11         gcide
12     wikipedia
13    ahd-legacy
Name: RESOURCE_SOURCE, dtype: object

### Create Labels for text to image model

In [9]:
def handle_missing_directories(directory_flpth):
    # Handle missing Directory
    if not os.path.exists(directory_flpth):
        
        os.makedirs(directory_flpth)
        print("Made new directory: {}".format(directory_flpth))
        # print(os.path.join(dirname, flpth))
    else:
        pass
    
    return

# Create text file for each doc - Each Doc maps to an image

## TODO: incoroporate number of labals per line
def labels_to_imageTxt_files(rsrc_df, trainingData_term, trainigData_flpth='../data'):
    
    # Handle if a data directory for a term exists e.g. data/photosynthesis
    dirname = os.path.abspath('')
    termData_flpth = os.path.join(dirname, trainigData_flpth)
    handle_missing_directories(termData_flpth)
    
    
    ### Move resource df to textacy
   
    # Load into textacy to delimit sentences
    img_labels = rsrc_df.to_dict(orient="records")
    text_stream, metadata_stream = textacy.io.split_records(img_labels, 'RESOURCE')

    # Load english model
    en = en_core_web_sm.load()
    labels_corpus = textacy.Corpus(lang=en, texts=text_stream, metadatas=metadata_stream)
    
    caption_filename_path = os.path.join(trainigData_flpth, "captions.pickle")
    
    # Loop through corpus and write document to flpth (s3)
    ''' Each doc in a corpus equals and image'''
    for ix, doc in enumerate(labels_corpus):
        print("Number of Sentenses: {}".format(doc.n_sents))
        
        # Paths to directories (Where to write the text files)
        filename = "{}_{}.txt".format(trainingData_term, ix)
        path_to_file = "{}/{}".format(trainigData_flpth, filename)
        
        # Write captions for google images
        f =  open(path_to_file, 'w')
        
        # Parse Document into sentences
        for sent in doc.sents:
            caption = textacy.preprocess.preprocess_text(sent.text,
                                               lowercase=True,
                                               no_punct=True
                                              )
            # f.write(label+"\n" )
            f.write(caption+" " )
            
        f.close()
        
    return ix + 1 # Count using 1 as start

# process labels for images


trainingData_term = 'photosynthesis'
txt_trainingData_flpth='{}/text'.format(txtToImage_data_dir, 'text')

numText_files = labels_to_imageTxt_files(rsrc_df, trainingData_term, txt_trainingData_flpth)
numText_files

Number of Sentenses: 1
Number of Sentenses: 1
Number of Sentenses: 1
Number of Sentenses: 16
Number of Sentenses: 9
Number of Sentenses: 5
Number of Sentenses: 2


7

### Write a text file for all of the caption filenames

In [13]:
# Make a text file with a list of the caption filenames

text_flpth = os.path.join(txtToImage_data_dir, 'text')
print("Text Data Directory: {}\n".format(text_flpth))

# Name of file with all the caption file names 

write_filename = "{}.txt".format('filenames')
filenames_flpth = os.path.join(txtToImage_data_dir, write_filename)


# # Make a new text 
# handle_missing_directories(text_flpth)

f =  open(filenames_flpth, 'w')

# Find all the files with captions in the text directory and write there names to a file
for path, subdirs, files in os.walk(text_flpth):
    for name in files:
        
        caption_flpth = os.path.join(path, name)
        
        f.write("{}\n".format(caption_flpth))
        print (caption_flpth)
    
f.close()



Text Data Directory: data/photosynthesis/text

data/photosynthesis/text/photosynthesis_4.txt
data/photosynthesis/text/photosynthesis_0.txt
data/photosynthesis/text/photosynthesis_5.txt
data/photosynthesis/text/photosynthesis_6.txt
data/photosynthesis/text/photosynthesis_3.txt
data/photosynthesis/text/photosynthesis_1.txt
data/photosynthesis/text/photosynthesis_2.txt
data/photosynthesis/text/sun/sun_01.txt


In [26]:
def filenames_to_pickle(objTopickle, flpth):
    
    
    
    return
    
def splitData(trainSplit, testSplit, filename_lst):
    
    # Calculate total number of filenames
    num_filenames = len(filename_lst)

    numTrain_files = np.ceil(trainSplit * num_filenames).astype(int)
    numTest_files = np.floor(testSplit * num_filenames).astype(int)
    
    print("Number of Train files: {}".format(numTrain_files))
    print("Number of Test files: {}".format(numTest_files))
    
    trainFile_lst = filename_lst[:numTrain_files]
    testFile_lst = filename_lst[-numTrain_files:]
    
    return {
        "train": trainFile_lst,
        "test": testFile_lst
    }



In [27]:

# Text data directories
text_flpth = os.path.join(txtToImage_data_dir, 'text')
print("Text Data Directory: {}\n".format(text_flpth))

# Get all the file names
captions_filename_lst = glob.glob(text_flpth+"/**/*.txt", recursive=True)


display(captions_filename_lst)
print("Number of files: {}".format(num_filenames))

# Split the data into training and test
## Will need to accomodate term weightings and try different cossvalidation methods
trainSplit = 0.7
testSplit = 1

split_dict = splitData(trainSplit, testSplit, captions_filename_lst)

# write files names to text and pickle


Text Data Directory: data/photosynthesis/text



['data/photosynthesis/text/photosynthesis_4.txt',
 'data/photosynthesis/text/photosynthesis_0.txt',
 'data/photosynthesis/text/photosynthesis_5.txt',
 'data/photosynthesis/text/photosynthesis_6.txt',
 'data/photosynthesis/text/photosynthesis_3.txt',
 'data/photosynthesis/text/photosynthesis_1.txt',
 'data/photosynthesis/text/photosynthesis_2.txt',
 'data/photosynthesis/text/sun/sun_01.txt']

Number of files: 8
Number of Train files: 6
Number of Test files: 8


{'train': ['data/photosynthesis/text/photosynthesis_4.txt',
  'data/photosynthesis/text/photosynthesis_0.txt',
  'data/photosynthesis/text/photosynthesis_5.txt',
  'data/photosynthesis/text/photosynthesis_6.txt',
  'data/photosynthesis/text/photosynthesis_3.txt',
  'data/photosynthesis/text/photosynthesis_1.txt'],
 'test': ['data/photosynthesis/text/photosynthesis_5.txt',
  'data/photosynthesis/text/photosynthesis_6.txt',
  'data/photosynthesis/text/photosynthesis_3.txt',
  'data/photosynthesis/text/photosynthesis_1.txt',
  'data/photosynthesis/text/photosynthesis_2.txt',
  'data/photosynthesis/text/sun/sun_01.txt']}

In [23]:



with open('outfile', 'wb') as fp:
    pickle.dump(itemlist, fp)

['data/photosynthesis/text/photosynthesis_1.txt',
 'data/photosynthesis/text/photosynthesis_2.txt',
 'data/photosynthesis/text/sun/sun_01.txt']

In [48]:
glob.glob1(text_flpth,"*.txt")

['photosynthesis_4.txt',
 'photosynthesis_0.txt',
 'photosynthesis_5.txt',
 'photosynthesis_6.txt',
 'photosynthesis_3.txt',
 'photosynthesis_1.txt',
 'photosynthesis_2.txt']

#### Split the data into training and test and 
Notes:
    - I think in the original AttnGAN code `test` means `cross-validation`

### Download Images from google 

In [None]:

def download_images(term ,img_args):
    
    # Download Images 
    response = google_images_download.googleimagesdownload()
    img_paths = response.download(img_args)
    
    # Post Process google image results
    for idx, f in enumerate(img_paths[term]):
        
        # Open Google image resulst and conver to jpeg
        img = PIL.Image.open(f)
        img_filetype = img.format.lower()  # 'JPEG'
        
        rgb_img = img.convert('RGB')
        img.close()
        
        # Make new filenme to allign with text file name
        filename = "{}_{}.{}".format(trainingData_term, idx, 'jpg')
        newfilepath_f = os.path.join(os.path.dirname(f), filename)
        
        # Save and image
        rgb_img.save(newfilepath_f)
        os.remove(f)
    
    
    return response 

img_args = {"keywords":"photosynthesis",
             "format": "png",
              "limit": numText_files,
             "output_directory": 'data',
            "metadata": True,
            "image_directory": "photosynthesis/images",
            "no_download": False,
            "extract_metadata":True
            # "size":"icon"
           }

response = download_images(trainingData_term, img_args)

In [7]:
im = PIL.Image.open("../AttnGAN/data/birds/CUB_200_2011/images/001.Black_footed_Albatross/Black_Footed_Albatross_0002_55.jpg")
width, height = im.size
print(width)
print(height)

500
347


In [None]:
### Prepare 

In [17]:

data_dir = txtToImage_data_dir 

filepath = os.path.join(data_dir, 'captions.pickle')
        
# train_names = self.load_filenames(data_dir, 'train')
# logging.debug("train_names: {}".format(train_names))


# test_names = self.load_filenames(data_dir, 'test')
# logging.debug("test_names: {}".format(test_names))

# logging.debug("Check if captions.pickle exists")
if not os.path.isfile(filepath):
    print("captions filepath: {}".format(filepath))
else:
    print("Cant find captions picke")

captions filepath: data/photosynthesis/captions.pickle


In [16]:
os.path.isfile(filepath)

False

In [13]:
filepath

'data/photosynthesis/captions.pickle'

In [None]:
from __future__ import print_function

import miscc.utils.mkdir_p as mkdir_p
from miscc.utils import build_super_images
from miscc.losses import sent_loss, words_loss
from miscc.config import cfg, cfg_from_file

from datasets import TextDataset
from datasets import prepare_data

from model import RNN_ENCODER, CNN_ENCODER

In [None]:
import os
import sys
import time
import random
import pprint
import datetime
import dateutil.tz
import argparse
import numpy as np
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.backends.cudnn as cudnn
import torchvision.transforms as transforms

In [None]:
dir_path = (os.path.abspath(os.path.join(os.path.realpath(__file__), './.')))
sys.path.append(dir_path)


UPDATE_INTERVAL = 200
def parse_args():
    parser = argparse.ArgumentParser(description='Train a DAMSM network')
    parser.add_argument('--cfg', dest='cfg_file',
                        help='optional config file',
                        default='cfg/DAMSM/bird.yml', type=str)
    parser.add_argument('--gpu', dest='gpu_id', type=int, default=0)
    parser.add_argument('--data_dir', dest='data_dir', type=str, default='')
    parser.add_argument('--manualSeed', type=int, help='manual seed')
    args = parser.parse_args()
    return args


def train(dataloader, cnn_model, rnn_model, batch_size,
          labels, optimizer, epoch, ixtoword, image_dir):
    cnn_model.train()
    rnn_model.train()
    s_total_loss0 = 0
    s_total_loss1 = 0
    w_total_loss0 = 0
    w_total_loss1 = 0
    count = (epoch + 1) * len(dataloader)
    start_time = time.time()
    for step, data in enumerate(dataloader, 0):
        # print('step', step)
        rnn_model.zero_grad()
        cnn_model.zero_grad()

        imgs, captions, cap_lens, \
            class_ids, keys = prepare_data(data)


        # words_features: batch_size x nef x 17 x 17
        # sent_code: batch_size x nef
        words_features, sent_code = cnn_model(imgs[-1])
        # --> batch_size x nef x 17*17
        nef, att_sze = words_features.size(1), words_features.size(2)
        # words_features = words_features.view(batch_size, nef, -1)

        hidden = rnn_model.init_hidden(batch_size)
        # words_emb: batch_size x nef x seq_len
        # sent_emb: batch_size x nef
        words_emb, sent_emb = rnn_model(captions, cap_lens, hidden)

        w_loss0, w_loss1, attn_maps = words_loss(words_features, words_emb, labels,
                                                 cap_lens, class_ids, batch_size)
        w_total_loss0 += w_loss0.data
        w_total_loss1 += w_loss1.data
        loss = w_loss0 + w_loss1

        s_loss0, s_loss1 = \
            sent_loss(sent_code, sent_emb, labels, class_ids, batch_size)
        loss += s_loss0 + s_loss1
        s_total_loss0 += s_loss0.data
        s_total_loss1 += s_loss1.data
        #
        loss.backward()
        #
        # `clip_grad_norm` helps prevent
        # the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(rnn_model.parameters(),
                                      cfg.TRAIN.RNN_GRAD_CLIP)
        optimizer.step()

        if step % UPDATE_INTERVAL == 0:
            count = epoch * len(dataloader) + step

            s_cur_loss0 = s_total_loss0[0] / UPDATE_INTERVAL
            s_cur_loss1 = s_total_loss1[0] / UPDATE_INTERVAL

            w_cur_loss0 = w_total_loss0[0] / UPDATE_INTERVAL
            w_cur_loss1 = w_total_loss1[0] / UPDATE_INTERVAL

            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | '
                  's_loss {:5.2f} {:5.2f} | '
                  'w_loss {:5.2f} {:5.2f}'
                  .format(epoch, step, len(dataloader),
                          elapsed * 1000. / UPDATE_INTERVAL,
                          s_cur_loss0, s_cur_loss1,
                          w_cur_loss0, w_cur_loss1))
            s_total_loss0 = 0
            s_total_loss1 = 0
            w_total_loss0 = 0
            w_total_loss1 = 0
            start_time = time.time()
            # attention Maps
            img_set, _ = \
                build_super_images(imgs[-1].cpu(), captions,
                                   ixtoword, attn_maps, att_sze)
            if img_set is not None:
                im = Image.fromarray(img_set)
                fullpath = '%s/attention_maps%d.png' % (image_dir, step)
                im.save(fullpath)
    return count


def evaluate(dataloader, cnn_model, rnn_model, batch_size):
    cnn_model.eval()
    rnn_model.eval()
    s_total_loss = 0
    w_total_loss = 0
    for step, data in enumerate(dataloader, 0):
        real_imgs, captions, cap_lens, \
                class_ids, keys = prepare_data(data)

        words_features, sent_code = cnn_model(real_imgs[-1])
        # nef = words_features.size(1)
        # words_features = words_features.view(batch_size, nef, -1)

        hidden = rnn_model.init_hidden(batch_size)
        words_emb, sent_emb = rnn_model(captions, cap_lens, hidden)

        w_loss0, w_loss1, attn = words_loss(words_features, words_emb, labels,
                                            cap_lens, class_ids, batch_size)
        w_total_loss += (w_loss0 + w_loss1).data

        s_loss0, s_loss1 = \
            sent_loss(sent_code, sent_emb, labels, class_ids, batch_size)
        s_total_loss += (s_loss0 + s_loss1).data

        if step == 50:
            break

    s_cur_loss = s_total_loss[0] / step
    w_cur_loss = w_total_loss[0] / step

    return s_cur_loss, w_cur_loss


def build_models():
    # build model ############################################################
    text_encoder = RNN_ENCODER(dataset.n_words, nhidden=cfg.TEXT.EMBEDDING_DIM)
    image_encoder = CNN_ENCODER(cfg.TEXT.EMBEDDING_DIM)
    labels = Variable(torch.LongTensor(range(batch_size)))
    start_epoch = 0
    if cfg.TRAIN.NET_E != '':
        state_dict = torch.load(cfg.TRAIN.NET_E)
        text_encoder.load_state_dict(state_dict)
        print('Load ', cfg.TRAIN.NET_E)
        #
        name = cfg.TRAIN.NET_E.replace('text_encoder', 'image_encoder')
        state_dict = torch.load(name)
        image_encoder.load_state_dict(state_dict)
        print('Load ', name)

        istart = cfg.TRAIN.NET_E.rfind('_') + 8
        iend = cfg.TRAIN.NET_E.rfind('.')
        start_epoch = cfg.TRAIN.NET_E[istart:iend]
        start_epoch = int(start_epoch) + 1
        print('start_epoch', start_epoch)
    if cfg.CUDA:
        text_encoder = text_encoder.cuda()
        image_encoder = image_encoder.cuda()
        labels = labels.cuda()

    return text_encoder, image_encoder, labels, start_epoch


if __name__ == "__main__":
    args = parse_args()
    if args.cfg_file is not None:
        cfg_from_file(args.cfg_file)

    if args.gpu_id == -1:
        cfg.CUDA = False
    else:
        cfg.GPU_ID = args.gpu_id

    if args.data_dir != '':
        cfg.DATA_DIR = args.data_dir
    print('Using config:')
    pprint.pprint(cfg)

    if not cfg.TRAIN.FLAG:
        args.manualSeed = 100
    elif args.manualSeed is None:
        args.manualSeed = random.randint(1, 10000)
    random.seed(args.manualSeed)
    np.random.seed(args.manualSeed)
    torch.manual_seed(args.manualSeed)
    if cfg.CUDA:
        torch.cuda.manual_seed_all(args.manualSeed)

    ##########################################################################
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')
    output_dir = '../output/%s_%s_%s' % \
        (cfg.DATASET_NAME, cfg.CONFIG_NAME, timestamp)

    model_dir = os.path.join(output_dir, 'Model')
    image_dir = os.path.join(output_dir, 'Image')
    mkdir_p(model_dir)
    mkdir_p(image_dir)

    torch.cuda.set_device(cfg.GPU_ID)
    cudnn.benchmark = True

    # Get data loader ##################################################
    imsize = cfg.TREE.BASE_SIZE * (2 ** (cfg.TREE.BRANCH_NUM-1))
    batch_size = cfg.TRAIN.BATCH_SIZE
    image_transform = transforms.Compose([
        transforms.Resize(int(imsize * 76 / 64)),
        transforms.RandomCrop(imsize),
        transforms.RandomHorizontalFlip()])
    dataset = TextDataset(cfg.DATA_DIR, 'train',
                          base_size=cfg.TREE.BASE_SIZE,
                          transform=image_transform)

    print(dataset.n_words, dataset.embeddings_num)
    assert dataset
    dataloader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, drop_last=True,
        shuffle=True, num_workers=int(cfg.WORKERS))

    # # validation data #
    dataset_val = TextDataset(cfg.DATA_DIR, 'test',
                              base_size=cfg.TREE.BASE_SIZE,
                              transform=image_transform)
    dataloader_val = torch.utils.data.DataLoader(
        dataset_val, batch_size=batch_size, drop_last=True,
        shuffle=True, num_workers=int(cfg.WORKERS))

    # Train ##############################################################
    text_encoder, image_encoder, labels, start_epoch = build_models()
    para = list(text_encoder.parameters())
    for v in image_encoder.parameters():
        if v.requires_grad:
            para.append(v)
    # optimizer = optim.Adam(para, lr=cfg.TRAIN.ENCODER_LR, betas=(0.5, 0.999))
    # At any point you can hit Ctrl + C to break out of training early.
    try:
        lr = cfg.TRAIN.ENCODER_LR
        for epoch in range(start_epoch, cfg.TRAIN.MAX_EPOCH):
            optimizer = optim.Adam(para, lr=lr, betas=(0.5, 0.999))
            epoch_start_time = time.time()
            count = train(dataloader, image_encoder, text_encoder,
                          batch_size, labels, optimizer, epoch,
                          dataset.ixtoword, image_dir)
            print('-' * 89)
            if len(dataloader_val) > 0:
                s_loss, w_loss = evaluate(dataloader_val, image_encoder,
                                          text_encoder, batch_size)
                print('| end epoch {:3d} | valid loss '
                      '{:5.2f} {:5.2f} | lr {:.5f}|'
                      .format(epoch, s_loss, w_loss, lr))
            print('-' * 89)
            if lr > cfg.TRAIN.ENCODER_LR/10.:
                lr *= 0.98

            if (epoch % cfg.TRAIN.SNAPSHOT_INTERVAL == 0 or
                epoch == cfg.TRAIN.MAX_EPOCH):
                torch.save(image_encoder.state_dict(),
                           '%s/image_encoder%d.pth' % (model_dir, epoch))
                torch.save(text_encoder.state_dict(),
                           '%s/text_encoder%d.pth' % (model_dir, epoch))
                print('Save G/Ds models.')
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

In [None]:
os.path.dirname("/home/ec2-user/environment/AttnGAN/data/photosynthesis/images/5. photosynthesis.png")

In [None]:
 
s3Client = boto3.client("s3")     
s3Client.Object('my-bucket-name', 'newfile.txt').put(Body=content)

In [None]:
list(labels_corpus.docs[0].sents)

In [None]:
import requests
r = requests.get("https://ssec.si.edu/stemvisions-blog/what-photosynthesis")

In [None]:
r.content