In [1]:
%matplotlib inline
import pandas as pd
import io
import boto3
from boto3.dynamodb.conditions import Key
import os
from google_images_download import google_images_download


import textacy
import en_core_web_sm

from IPython.display import SVG

from IPython.display import Image
 

# Create Training data for text to image model

## Inputs

In [2]:
tblName = "ResourceDocuments"
nodeIdentifierName = "photosynthesis-photosynthesis-photosynthesis-biology"

txtToImage_data_dir = 'data/photosynthesis'

resourceDbName = 'dynamodb'
s3Bucket = "egm-bucket/TEXT_TO_IMAGE_DATA/data"

### Get Data from Resource Db: 
photosynthesis whole

In [3]:
# Get Definitions for photosynthesis from dynamodb

## Connect to dynamodb
dynamodbClient = boto3.resource("dynamodb")
# client = boto3.client('dynamodb')
# display(client.describe_table(TableName=tblName))

## Connect to table with resources
resourceTbl = dynamodbClient.Table(tblName)
# display(resourceTbl.global_secondary_indexes)
display(resourceTbl.item_count)

14

### Load into pandas 
- For data munging
    - stats
    - Duplicates

In [4]:
response = resourceTbl.query(
    IndexName='NODE_IDENTIFIER-index',
    KeyConditionExpression=Key('NODE_IDENTIFIER').eq(nodeIdentifierName)
)

# Pass through pandas for some data munging
rsrc_df = pd.DataFrame(response["Items"])
print("Db Response Shape: {}".format(rsrc_df.shape))

rsrc_df.drop_duplicates(['RESOURCE'], keep='last', inplace=True)
rsrc_df.reset_index(drop=True)

print("Db Response Shape: {}".format(rsrc_df.shape))
print(rsrc_df.columns)

Db Response Shape: (14, 13)
Db Response Shape: (7, 13)
Index(['IMAGES', 'NODE_IDENTIFIER', 'POS', 'RESOURCE', 'RESOURCE_ATTRIBUTION',
       'RESOURCE_DATATYPE', 'RESOURCE_SOURCE', 'RESOURCE_TYPE', 'RESOURCE_URL',
       'TERM', 'TIME_DOWNLOADED', 'TOPIC', 'UNIQUE_IDENTIFIER'],
      dtype='object')


In [5]:
rsrc_df["RESOURCE_SOURCE"]

4        century
5        wordnet
9     wiktionary
10     wikipedia
11         gcide
12     wikipedia
13    ahd-legacy
Name: RESOURCE_SOURCE, dtype: object

### Move resource df to textacy
- for text processing
- building labels for images

In [6]:
# Make labels for images
img_labels = rsrc_df.to_dict(orient="records")
text_stream, metadata_stream = textacy.io.split_records(img_labels, 'RESOURCE')

# Load english model
en = en_core_web_sm.load()
labels_corpus = textacy.Corpus(lang=en, texts=text_stream, metadatas=metadata_stream)
labels_corpus

Corpus(7 docs; 963 tokens)

In [7]:
list(labels_corpus.get(lambda x: x.metadata['RESOURCE_SOURCE'] == 'wikipedia'))

[Doc(512 tokens; "Photosynthesis is a process used by plants and ..."),
 Doc(140 tokens; "Photosynthesis is a process used by plants and ...")]

In [8]:
# Create text file for each doc - Each Doc maps to an image
def labels_to_imageTxt_files(corpus, term, flpth='../data'):
    
    # Loop through corpus and write document to flpth (s3)
    ''' Each doc in a corpus equals and image'''
    for ix, doc in enumerate(corpus):
        print(doc.n_sents)
        
        # Write to file stuff (paths)
        filename = "{}_{}.txt".format(term, ix)
        path_to_file = "{}/{}".format(flpth, filename)
        
        # Handle missing Directory
        if not os.path.exists(flpth):
            dirname = os.path.abspath('')
            os.makedirs( os.path.join(dirname, flpth))
            print(os.path.join(dirname, flpth))
        
        
        f =  open(path_to_file, 'w')
        
        # Prepeare labals for an image
        for sent in doc.sents:
            label = textacy.preprocess.preprocess_text(sent.text,
                                               lowercase=True,
                                               no_punct=True
                                              )
            f.write(label+"\n" )
            
        f.close()
    return ix + 1 # Count using 1 as start

# process labels for images

term = 'photosynthesis'
flpth='{}/text'.format(txtToImage_data_dir, 'text')
numText_files = labels_to_imageTxt_files(labels_corpus, term, flpth)
numText_files

1
1
1
16
9
5
2


7

### Download Images from google 

In [9]:
'{}/{}'.format(txtToImage_data_dir, 'images')

'data/photosynthesis/images'

In [10]:
img_args = {"keywords":"photosynthesis",
             "format": "png",
             "limit": numText_files,
             "output_directory": 'data',
            "image_directory": "photosynthesis/images"
           }

response = google_images_download.googleimagesdownload()
img_paths = response.download(img_args)


Item no.: 1 --> Item name = photosynthesis
Evaluating...
Starting Download...
Completed Image ====> 1. 220px-photosynthesis_en.svg.png
Completed Image ====> 2. photosynthesis_0.png
Completed Image ====> 3. b5696ba86426f4fcc8be09e1a910f0033d241d24.png
Completed Image ====> 4. diagram-of-photosynthesis.png
Completed Image ====> 5. photosynthesis.png
Completed Image ====> 6. f-d%3a283a5747964474bdb6059d1e0
Completed Image ====> 7. 450px-thylakoid_membrane_3.svg.png

Errors: 0



In [None]:
absolute_image_paths.items()

In [None]:
 
s3Client = boto3.client("s3")     
s3Client.Object('my-bucket-name', 'newfile.txt').put(Body=content)

In [None]:
list(labels_corpus.docs[0].sents)