In [1]:
%matplotlib inline
import pandas as pd
import io
import boto3
from boto3.dynamodb.conditions import Key
import os
from google_images_download import google_images_download
import glob
import PIL
import json


import textacy
import en_core_web_sm

from IPython.display import SVG

from IPython.display import Image
 

# Create Training data for text to image model

## Inputs

In [2]:
tblName = "ResourceDocuments"
nodeIdentifierName = "photosynthesis-photosynthesis-photosynthesis-biology"

txtToImage_data_dir = 'data/photosynthesis'
imageLog_fir='logs'

resourceDbName = 'dynamodb'
s3Bucket = "egm-bucket/TEXT_TO_IMAGE_DATA/data"

### Get Data from Resource Db: 
photosynthesis whole

In [3]:
# Get Definitions for photosynthesis from dynamodb

## Connect to dynamodb
dynamodbClient = boto3.resource("dynamodb")
# client = boto3.client('dynamodb')
# display(client.describe_table(TableName=tblName))

## Connect to table with resources
resourceTbl = dynamodbClient.Table(tblName)
# display(resourceTbl.global_secondary_indexes)
display("NUmber of Items in ResourceDb: {}".format(resourceTbl.item_count))

'NUmber of Items in ResourceDb: 14'

### Load text into pandas 
- For data munging
    - stats
    - Duplicates

In [4]:
response = resourceTbl.query(
    IndexName='NODE_IDENTIFIER-index',
    KeyConditionExpression=Key('NODE_IDENTIFIER').eq(nodeIdentifierName)
)

# Pass through pandas for some data munging
rsrc_df = pd.DataFrame(response["Items"])
print("Db Response Shape: {}".format(rsrc_df.shape))

rsrc_df.drop_duplicates(['RESOURCE'], keep='last', inplace=True)
rsrc_df.reset_index(drop=True)

print("Db Response Shape: {}".format(rsrc_df.shape))
print(rsrc_df.columns)

Db Response Shape: (14, 13)
Db Response Shape: (7, 13)
Index(['IMAGES', 'NODE_IDENTIFIER', 'POS', 'RESOURCE', 'RESOURCE_ATTRIBUTION',
       'RESOURCE_DATATYPE', 'RESOURCE_SOURCE', 'RESOURCE_TYPE', 'RESOURCE_URL',
       'TERM', 'TIME_DOWNLOADED', 'TOPIC', 'UNIQUE_IDENTIFIER'],
      dtype='object')


In [5]:
rsrc_df["RESOURCE_SOURCE"]

4        century
5        wordnet
9     wiktionary
10     wikipedia
11         gcide
12     wikipedia
13    ahd-legacy
Name: RESOURCE_SOURCE, dtype: object

### Create Labels for text to image model

In [9]:
# Create text file for each doc - Each Doc maps to an image
def labels_to_imageTxt_files(rsrc_df, trainingData_term, flpth='../data'):
    
    ### Move resource df to textacy
   
    # Load into textacy to delimit sentences
    img_labels = rsrc_df.to_dict(orient="records")
    text_stream, metadata_stream = textacy.io.split_records(img_labels, 'RESOURCE')

    # Load english model
    en = en_core_web_sm.load()
    labels_corpus = textacy.Corpus(lang=en, texts=text_stream, metadatas=metadata_stream)
    
    
    
    # Loop through corpus and write document to flpth (s3)
    ''' Each doc in a corpus equals and image'''
    for ix, doc in enumerate(labels_corpus):
        print(doc.n_sents)
        
        # Paths to directories (Where to write the text files)
        filename = "{}_{}.txt".format(trainingData_term, ix)
        path_to_file = "{}/{}".format(flpth, filename)
        
        # Handle missing Directory
        if not os.path.exists(flpth):
            dirname = os.path.abspath('')
            os.makedirs( os.path.join(dirname, flpth))
            # print(os.path.join(dirname, flpth))
        
        
        f =  open(path_to_file, 'w')
        
        # Parse Document sentences
        for sent in doc.sents:
            label = textacy.preprocess.preprocess_text(sent.text,
                                               lowercase=True,
                                               no_punct=True
                                              )
            f.write(label+"\n" )
            
        f.close()
    return ix + 1 # Count using 1 as start

# process labels for images


trainingData_term = 'photosynthesis'
trainingData_flpth='{}/text'.format(txtToImage_data_dir, 'text')

numText_files = labels_to_imageTxt_files(rsrc_df, trainingData_term, trainingData_flpth)
numText_files

1
1
1
16
9
5
2


7

In [26]:
im = PIL.Image.open("../AttnGAN/data/birds/CUB_200_2011/images/001.Black_footed_Albatross/Black_Footed_Albatross_0002_55.jpg")
width, height = im.size
print(width)
print(height)

500
347


### Download Images from google 

In [21]:

def download_images(term ,img_args):
    
    # Download Images 
    response = google_images_download.googleimagesdownload()
    img_paths = response.download(img_args)
    
    # Post Process google image results
    for idx, f in enumerate(img_paths[term]):
        
        
        img = PIL.Image.open(f)
        img_filetype = img.format.lower()  # 'JPEG'
        img.convert('RGB')
        
                # # Make new filenme to allign with text file name
        filename = "{}_{}.{}".format(trainingData_term, idx, 'jpg')
        
        
        
        rgb_im.save(filename)
        
        
        
        
        
        
        
        
        
        
        
        
        img.close()
        
        
        

#         newfilepath_f = os.path.join(os.path.dirname(f), filename)
#         print(newfilepath_f)
#         os.rename(f, newfilepath_f)
        
        

    
    
    
    return response 

img_args = {"keywords":"photosynthesis",
             "format": "png",
              "limit": numText_files,
             "output_directory": 'data',
            "metadata": True,
            "image_directory": "photosynthesis/images",
            "no_download": False,
            "extract_metadata":True
            # "size":"icon"
           }

response = download_images(trainingData_term, img_args)


Item no.: 1 --> Item name = photosynthesis
Evaluating...
Starting Download...

Image Metadata: {'image_format': 'png', 'image_height': 220, 'image_width': 220, 'image_link': 'https://upload.wikimedia.org/wikipedia/commons/thumb/5/55/Photosynthesis_en.svg/220px-Photosynthesis_en.svg.png', 'image_description': 'Photosynthesis - Wikipedia', 'image_host': 'en.wikipedia.org', 'image_source': 'https://en.wikipedia.org/wiki/Photosynthesis', 'image_thumbnail_url': 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcS5KkJVlZz81zgJD2edw3thRhA_3dLLlsBtIpyz3qStyfqmoyI6'}
Completed Image ====> 1. 220px-photosynthesis_en.svg.png

Image Metadata: {'image_format': 'png', 'image_height': 421, 'image_width': 575, 'image_link': 'https://ssec.si.edu/sites/default/files/Photosynthesis_0.png', 'image_description': 'What is Photosynthesis | Smithsonian Science Education Center', 'image_host': 'ssec.si.edu', 'image_source': 'https://ssec.si.edu/stemvisions-blog/what-photosynthesis', 'image_thumbnail_url':

In [16]:
img = PIL.Image.open("/home/ec2-user/environment/AttnGAN/data/photosynthesis/images/5. photosynthesis.png")
print(img.format)  # 'JPEG'

PNG


In [18]:
os.path.dirname("/home/ec2-user/environment/AttnGAN/data/photosynthesis/images/5. photosynthesis.png")

'/home/ec2-user/environment/AttnGAN/data/photosynthesis/images'

In [None]:
 
s3Client = boto3.client("s3")     
s3Client.Object('my-bucket-name', 'newfile.txt').put(Body=content)

In [None]:
list(labels_corpus.docs[0].sents)

In [None]:
import requests
r = requests.get("https://ssec.si.edu/stemvisions-blog/what-photosynthesis")

In [None]:
r.content