In [1]:
%matplotlib inline
import pandas as pd
import io
import boto3
from boto3.dynamodb.conditions import Key
import os

import textacy
import en_core_web_sm

from IPython.display import SVG

from IPython.display import Image
 

# Create Training data for text to image model

## Inputs

In [2]:
tblName = "ResourceDocuments"
nodeIdentifierName = "photosynthesis-photosynthesis-photosynthesis-biology"

resourceDbName = 'dynamodb'
s3Bucket = "egm-bucket/TEXT_TO_IMAGE_DATA"

### Get Data from Resource Db: 
photosynthesis whole

In [3]:
# Get Definitions for photosynthesis from dynamodb

## Connect to dynamodb
dynamodbClient = boto3.resource("dynamodb")
# client = boto3.client('dynamodb')
# display(client.describe_table(TableName=tblName))

## Connect to table with resources
resourceTbl = dynamodbClient.Table(tblName)
# display(resourceTbl.global_secondary_indexes)
display(resourceTbl.item_count)

14

### Load into pandas 
- For data munging
    - stats
    - Duplicates

In [4]:
response = resourceTbl.query(
    IndexName='NODE_IDENTIFIER-index',
    KeyConditionExpression=Key('NODE_IDENTIFIER').eq(nodeIdentifierName)
)

# Pass through pandas for some data munging
rsrc_df = pd.DataFrame(response["Items"])
print("Db Response Shape: {}".format(rsrc_df.shape))

rsrc_df.drop_duplicates(['RESOURCE'], keep='last', inplace=True)
rsrc_df.reset_index(drop=True)

print("Db Response Shape: {}".format(rsrc_df.shape))
print(rsrc_df.columns)

Db Response Shape: (14, 13)
Db Response Shape: (7, 13)
Index(['IMAGES', 'NODE_IDENTIFIER', 'POS', 'RESOURCE', 'RESOURCE_ATTRIBUTION',
       'RESOURCE_DATATYPE', 'RESOURCE_SOURCE', 'RESOURCE_TYPE', 'RESOURCE_URL',
       'TERM', 'TIME_DOWNLOADED', 'TOPIC', 'UNIQUE_IDENTIFIER'],
      dtype='object')


In [5]:
rsrc_df["RESOURCE_SOURCE"]

4        century
5        wordnet
9     wiktionary
10     wikipedia
11         gcide
12     wikipedia
13    ahd-legacy
Name: RESOURCE_SOURCE, dtype: object

### Move resource df to textacy
- for text processing
- building labels for images

In [6]:
# Make labels for images
img_labels = rsrc_df.to_dict(orient="records")
text_stream, metadata_stream = textacy.io.split_records(img_labels, 'RESOURCE')

# Load english model
en = en_core_web_sm.load()
labels_corpus = textacy.Corpus(lang=en, texts=text_stream, metadatas=metadata_stream)
labels_corpus

Corpus(7 docs; 963 tokens)

In [7]:
labels_corpus.n_docs

7

In [8]:
list(labels_corpus.get(lambda x: x.metadata['RESOURCE_SOURCE'] == 'wikipedia'))

[Doc(512 tokens; "Photosynthesis is a process used by plants and ..."),
 Doc(140 tokens; "Photosynthesis is a process used by plants and ...")]

In [67]:
# Create text file for each doc - Each Doc maps to an image
def labels_to_imageTxt_files(corpus, term, flpth='../data'):
    
    # Loop through corpus and write document to flpth (s3)
    ''' Each doc in a corpus equals and image'''
    for ix, doc in enumerate(corpus):
        print(doc.n_sents)
        
        # Write to file stuff (paths)
        filename = "{}_{}.txt".format(term, ix)
        path_to_file = "{}/{}".format(flpth, filename)
        
        # Handle missing Directory
        if not os.path.exists(flpth):
            dirname = os.path.abspath('')
            os.makedirs( os.path.join(dirname, flpth))
            print(os.path.join(dirname, flpth))
        
        
        f =  open(path_to_file, 'w')
        
        # Prepeare labals for an image
        for sent in doc.sents:
            label = textacy.preprocess.preprocess_text(sent.text,
                                               lowercase=True,
                                               no_punct=True
                                              )
            f.write(label+"\n" )
            
        f.close()
    return ix


In [68]:
os.path.abspath('')

'/home/ec2-user/environment/AttnGAN'

In [69]:
# Process resourceDb for image to text model

term = 'photosynthesis'
flpth='data/photosynthesis/text'
sio = labels_to_imageTxt_files(labels_corpus, term, flpth)

1
/home/ec2-user/environment/AttnGAN/data/photosynthesis/text
1
1
16
9
5
2


In [38]:
sio.seek(0)

0

In [40]:
sio.read()

'the process in green plants and certain other organisms by which carbohydrates are synthesized from carbon dioxide and water using light as an energy source\nmost forms of photosynthesis release oxygen as a byproduct\n'

In [21]:
f.read()


''

In [11]:
doc = list(labels_corpus.get(lambda x: x.metadata['RESOURCE_SOURCE'] == 'wikipedia'))[0]
list(doc.sents)

[Photosynthesis is a process used by plants and other organisms to convert light energy into chemical energy that can later be released to fuel the organisms' activities.,
 This chemical energy is stored in carbohydrate molecules, such as sugars, which are synthesized from carbon dioxide and water – hence the name photosynthesis, from the Greek φῶς, phōs, "light", and σύνθεσις, synthesis, "putting together".,
 In most cases, oxygen is also released as a waste product.,
 Most plants, most algae, and cyanobacteria perform photosynthesis; such organisms are called photoautotrophs.,
 Photosynthesis is largely responsible for producing and maintaining the oxygen content of the Earth's atmosphere, and supplies all of the organic compounds and most of the energy necessary for life on Earth.,
 Although photosynthesis is performed differently by different species, the process always begins when energy from light is absorbed by proteins called reaction centres that contain green chlorophyll pigm

In [30]:
for sent in doc.sents:
    sent

In [35]:
s

512

In [None]:
 s3.Object('my-bucket-name', 'newfile.txt').put(Body=content)

In [None]:
 
s3Client = boto3.client("s3")     
s3Client.Object('my-bucket-name', 'newfile.txt').put(Body=content)

In [None]:
list(labels_corpus.docs[0].sents)