In [39]:
%matplotlib inline
import pandas as pd
import io
import boto3
from boto3.dynamodb.conditions import Key
import os
from google_images_download import google_images_download
import glob
import PIL
import json
import numpy as np
import pathlib
import pickle

import textacy
import en_core_web_sm

from IPython.display import SVG

from IPython.display import Image
 

# Create Training data for text to image model

## Inputs

In [2]:
tblName = "ResourceDocuments"
nodeIdentifierName = "photosynthesis-photosynthesis-photosynthesis-biology"

termTxtToImage_flpth = 'data/photosynthesis'
imageLog_fir='logs'

resourceDbName = 'dynamodb'
s3Bucket = "egm-bucket/TEXT_TO_IMAGE_DATA/data"

### Get Data from Resource Db: 
photosynthesis whole

In [3]:
# Get Definitions for photosynthesis from dynamodb

## Connect to dynamodb
dynamodbClient = boto3.resource("dynamodb")
# client = boto3.client('dynamodb')
# display(client.describe_table(TableName=tblName))

## Connect to table with resources
resourceTbl = dynamodbClient.Table(tblName)
# display(resourceTbl.global_secondary_indexes)
display("NUmber of Items in ResourceDb: {}".format(resourceTbl.item_count))

'NUmber of Items in ResourceDb: 14'

### Load text into pandas 
- For data munging
    - stats
    - Duplicates

In [4]:
response = resourceTbl.query(
    IndexName='NODE_IDENTIFIER-index',
    KeyConditionExpression=Key('NODE_IDENTIFIER').eq(nodeIdentifierName)
)

# Pass through pandas for some data munging
rsrc_df = pd.DataFrame(response["Items"])
print("Db Response Shape: {}".format(rsrc_df.shape))

rsrc_df.drop_duplicates(['RESOURCE'], keep='last', inplace=True)
rsrc_df.reset_index(drop=True)

print("Db Response Shape: {}".format(rsrc_df.shape))
print(rsrc_df.columns)

Db Response Shape: (14, 13)
Db Response Shape: (7, 13)
Index(['IMAGES', 'NODE_IDENTIFIER', 'POS', 'RESOURCE', 'RESOURCE_ATTRIBUTION',
       'RESOURCE_DATATYPE', 'RESOURCE_SOURCE', 'RESOURCE_TYPE', 'RESOURCE_URL',
       'TERM', 'TIME_DOWNLOADED', 'TOPIC', 'UNIQUE_IDENTIFIER'],
      dtype='object')


In [5]:
rsrc_df["RESOURCE_SOURCE"]

4        century
5        wordnet
9     wiktionary
10     wikipedia
11         gcide
12     wikipedia
13    ahd-legacy
Name: RESOURCE_SOURCE, dtype: object

### Create Labels for text to image model

In [8]:
def handle_missing_directories(directory_flpth):
    # Handle missing Directory
    if not os.path.exists(directory_flpth):
        
        os.makedirs(directory_flpth)
        print("Made new directory: {}".format(directory_flpth))
        # print(os.path.join(dirname, flpth))
    else:
        pass
    
    return

# Create text file for each doc - Each Doc maps to an image

## TODO: incoroporate number of labals per line
def labels_to_imageTxt_files(rsrc_df, trainingData_term, trainigData_flpth='../data'):
    
    # Handle if a data directory for a term exists e.g. data/photosynthesis
    dirname = os.path.abspath('')
    termData_flpth = os.path.join(dirname, trainigData_flpth)
    handle_missing_directories(termData_flpth)
    
    
    ### Move resource df to textacy
   
    # Load into textacy to delimit sentences
    img_labels = rsrc_df.to_dict(orient="records")
    text_stream, metadata_stream = textacy.io.split_records(img_labels, 'RESOURCE')

    # Load english model
    en = en_core_web_sm.load()
    labels_corpus = textacy.Corpus(lang=en, texts=text_stream, metadatas=metadata_stream)
    
    caption_filename_path = os.path.join(trainigData_flpth, "captions.pickle")
    
    # Loop through corpus and write document to flpth (s3)
    ''' Each doc in a corpus equals and image'''
    for ix, doc in enumerate(labels_corpus):
        print("Number of Sentenses: {}".format(doc.n_sents))
        
        # Paths to directories (Where to write the text files)
        filename = "{}_{}.txt".format(trainingData_term, ix)
        path_to_file = "{}/{}".format(trainigData_flpth, filename)
        
        # Write captions for google images
        f =  open(path_to_file, 'w')
        
        # Parse Document into sentences
        for sent in doc.sents:
            caption = textacy.preprocess.preprocess_text(sent.text,
                                               lowercase=True,
                                               no_punct=True
                                              )
            # f.write(label+"\n" )
            f.write(caption+" " )
            
        f.close()
        
    return ix + 1 # Count using 1 as start

# process labels for images


trainingData_term = 'photosynthesis'
txt_trainingData_flpth='{}/text'.format(termTxtToImage_flpth, 'text')

numText_files = labels_to_imageTxt_files(rsrc_df, trainingData_term, txt_trainingData_flpth)
numText_files

Number of Sentenses: 1
Number of Sentenses: 1
Number of Sentenses: 1
Number of Sentenses: 16
Number of Sentenses: 9
Number of Sentenses: 5
Number of Sentenses: 2


7

### Download Images from google 

In [9]:

def download_images(term ,img_args):
    
    # Download Images 
    response = google_images_download.googleimagesdownload()
    img_paths = response.download(img_args)
    
    # Post Process google image results
    for idx, f in enumerate(img_paths[term]):
        
        # Open Google image resulst and conver to jpeg
        img = PIL.Image.open(f)
        img_filetype = img.format.lower()  # 'JPEG'
        
        rgb_img = img.convert('RGB')
        img.close()
        
        # Make new filenme to allign with text file name
        filename = "{}_{}.{}".format(trainingData_term, idx, 'jpg')
        newfilepath_f = os.path.join(os.path.dirname(f), filename)
        
        # Save and image
        rgb_img.save(newfilepath_f)
        os.remove(f)
    
    
    return response 

img_args = {"keywords":"photosynthesis",
             "format": "png",
              "limit": numText_files,
             "output_directory": 'data',
            "metadata": True,
            "image_directory": "photosynthesis/images",
            "no_download": False,
            "extract_metadata":True
            # "size":"icon"
           }

response = download_images(trainingData_term, img_args)


Item no.: 1 --> Item name = photosynthesis
Evaluating...
Starting Download...

Image Metadata: {'image_format': 'png', 'image_height': 220, 'image_width': 220, 'image_link': 'https://upload.wikimedia.org/wikipedia/commons/thumb/5/55/Photosynthesis_en.svg/220px-Photosynthesis_en.svg.png', 'image_description': 'Photosynthesis - Wikipedia', 'image_host': 'en.wikipedia.org', 'image_source': 'https://en.wikipedia.org/wiki/Photosynthesis', 'image_thumbnail_url': 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcS5KkJVlZz81zgJD2edw3thRhA_3dLLlsBtIpyz3qStyfqmoyI6'}
Completed Image ====> 1. 220px-photosynthesis_en.svg.png

Image Metadata: {'image_format': 'png', 'image_height': 421, 'image_width': 575, 'image_link': 'https://ssec.si.edu/sites/default/files/Photosynthesis_0.png', 'image_description': 'What is Photosynthesis | Smithsonian Science Education Center', 'image_host': 'ssec.si.edu', 'image_source': 'https://ssec.si.edu/stemvisions-blog/what-photosynthesis', 'image_thumbnail_url':

In [17]:
def txtFilenamesTo_pickle(flpth, lst_to_write):
    
    handle_missing_directories(os.path.dirname(flpth))
    
    pickle_out  = open(flpth, "wb")
    pickle.dump(split_dict[splitType], pickle_out)
    pickle_out.close()

    return 
    
def splitData(trainSplit, testSplit, filename_lst):
    
    # Calculate total number of filenames
    num_filenames = len(filename_lst)

    numTrain_files = np.ceil(trainSplit * num_filenames).astype(int)
    numTest_files = np.floor(testSplit * num_filenames).astype(int)
    
    print("Number of Train files: {}".format(numTrain_files))
    print("Number of Test files: {}".format(numTest_files))
    
    trainFile_lst = filename_lst[:numTrain_files]
    testFile_lst = filename_lst[-numTrain_files:]
    
    return {
        "train": trainFile_lst,
        "test": testFile_lst
    }



In [80]:
def modified_txt_flpth(full_flpth):
    # Read in filepath and seperate
    p = pathlib.Path(full_flpth)
    txtRel_flpth = p.relative_to(termTxtToImage_flpth)
    txtCaption_flpth = str(txtRel_flpth.parent)
    
    txtFile_nm = txtRel_flpth.stem
    #print("Filename: {}".format(txtFile_nm))    
    
    
    caption_flpth = os.path.join(txtCaption_flpth, txtFile_nm)
    
    return caption_flpth
captionFilename = modified_txt_flpth('data/photosynthesis/text/photosynthesis_4.txt')
captionFilename

'text/photosynthesis_4'

### Split Data for model training

In [81]:
# Define Text data directories
text_flpth = os.path.join(termTxtToImage_flpth, 'text')
print("Text Data Directory: {}\n".format(text_flpth))

# Get all the file names from the text directory
captionsFilename_lst = glob.glob(text_flpth+"/**/*.txt", recursive=True)
modifiedCaptionsFilename_lst = [modified_txt_flpth(relCaption_flpth)  for relCaption_flpth in captionsFilename_lst]

display(captions_filename_lst)
print("Number of files: {}".format(len(modifiedCaptionsFilename_lst)))


# Split the data into training and test
## Will need to accomodate term weightings and try different cossvalidation methods
trainSplit = 0.7
testSplit = 0.3

split_dict = splitData(trainSplit, testSplit, modifiedCaptionsFilename_lst)

# write files names to text and pickle
for splitType in split_dict:
    txtFlpth_pickle = os.path.join(termTxtToImage_flpth , splitType, 'filenames.pickle')
    print('Pickling: {} '.format(txtFlpth_pickle))

    txtFilenamesTo_pickle(txtFlpth_pickle, split_dict[splitType])

Text Data Directory: data/photosynthesis/text



['data/photosynthesis/text/photosynthesis_4.txt',
 'data/photosynthesis/text/photosynthesis_0.txt',
 'data/photosynthesis/text/photosynthesis_5.txt',
 'data/photosynthesis/text/photosynthesis_6.txt',
 'data/photosynthesis/text/photosynthesis_3.txt',
 'data/photosynthesis/text/photosynthesis_1.txt',
 'data/photosynthesis/text/photosynthesis_2.txt']

Number of files: 7
Number of Train files: 5
Number of Test files: 2
Pickling: data/photosynthesis/train/filenames.pickle 
Pickling: data/photosynthesis/test/filenames.pickle 


#### Split the data into training and test and 
Notes:
    - I think in the original AttnGAN code `test` means `cross-validation`

### Write a text file for all of the caption filenames

In [24]:
# Get all the file names from the text directory
captions_filename_lst = glob.glob(text_flpth+"/**/*.txt", recursive=True)

In [25]:
captions_filename_lst

['data/photosynthesis/text/photosynthesis_4.txt',
 'data/photosynthesis/text/photosynthesis_0.txt',
 'data/photosynthesis/text/photosynthesis_5.txt',
 'data/photosynthesis/text/photosynthesis_6.txt',
 'data/photosynthesis/text/photosynthesis_3.txt',
 'data/photosynthesis/text/photosynthesis_1.txt',
 'data/photosynthesis/text/photosynthesis_2.txt']

In [78]:
# Make a text file with a list of the caption filenames

text_flpth = os.path.join(termTxtToImage_flpth, 'text')
print("Text Data Directory: {}\n".format(text_flpth))

# Find all the files with captions in the text directory and write there names to a file
txtFile_lst = glob.glob(text_flpth+"/**/*.txt", recursive=True)

writeFilename = "{}.txt".format('filenames')
writeFilenames_flpth = os.path.join(termTxtToImage_flpth, writeFilename)


# Write list to file in a form ATTN GAN accepts
f =  open(writeFilenames_flpth, 'w')

for item in txtFile_lst:
    
    
    caption_flpth = modified_txt_flpth(item) 
        
    f.write("{}\n".format(caption_flpth))
    print (caption_flpth)
    
f.close()



Text Data Directory: data/photosynthesis/text

text
text
text
text
text
text
text


In [57]:


pickle.load(termTxtToImage_flpth+"captions")


TypeError: file must have 'read' and 'readline' attributes

In [85]:
p.stem

'photosynthesis_2'

In [51]:

txtRel_flpth = p.relative_to(termTxtToImage_flpth)
txtRel_flpth

PosixPath('text/photosynthesis_2.txt')

In [54]:
p.stem

'photosynthesis_2'

In [43]:
fileParts = txtRel_flpth.parts
fileParts.

In [None]:
os.path.dirname("/home/ec2-user/environment/AttnGAN/data/photosynthesis/images/5. photosynthesis.png")

In [None]:
 
s3Client = boto3.client("s3")     
s3Client.Object('my-bucket-name', 'newfile.txt').put(Body=content)

In [None]:
list(labels_corpus.docs[0].sents)

In [None]:
import requests
r = requests.get("https://ssec.si.edu/stemvisions-blog/what-photosynthesis")

In [None]:
r.content