In [34]:
%matplotlib inline
%load_ext autoreload
%autoreload 2




import pandas as pd
import io
import boto3
from boto3.dynamodb.conditions import Key
import os
from google_images_download import google_images_download
import glob
import PIL
import json
import numpy as np
import pathlib
import pickle

import textacy
import en_core_web_sm

# from custo import greedy_algorithm
import input_data_preprocessing.corpus_stats as c_stats

textacy.spacier.doc_extensions.set_doc_extensions()
#import code
textacy.__version__

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'0.7.0'

# Get Text Data for inputTerm

## Inputs

In [5]:
inputTerm = 'photosynthesis'
topic ='biology'


tblName = "ResourceDocuments"
nodeIdentifierName = "photosynthesis-photosynthesis-photosynthesis-biology"

termTxtToImage_flpth = 'data/photosynthesis'
text_flpth = os.path.join(termTxtToImage_flpth, 'text')
img_flpth =  os.path.join(termTxtToImage_flpth, 'images')
imageLog_fir='logs'

resourceDbName = 'dynamodb'
#s3Bucket = "egm-bucket/TEXT_TO_IMAGE_DATA/data"

### Get Data from Resource Db: 
photosynthesis whole

In [6]:
# Get Definitions for photosynthesis from dynamodb

## Connect to dynamodb
dynamodbClient = boto3.resource("dynamodb")
# client = boto3.client('dynamodb')
# display(client.describe_table(TableName=tblName))

## Connect to table with resources
resourceTbl = dynamodbClient.Table(tblName)
# display(resourceTbl.global_secondary_indexes)
display("NUmber of Items in ResourceDb: {}".format(resourceTbl.item_count))

'NUmber of Items in ResourceDb: 14'

### Load text into pandas 
- For data munging
    - stats
    - Duplicates

In [7]:
response = resourceTbl.query(
    IndexName='NODE_IDENTIFIER-index',
    KeyConditionExpression=Key('NODE_IDENTIFIER').eq(nodeIdentifierName)
)

# Pass through pandas for some data munging
rsrc_df = pd.DataFrame(response["Items"])
print("Db Response Shape: {}".format(rsrc_df.shape))

rsrc_df.drop_duplicates(['RESOURCE'], keep='last', inplace=True)
rsrc_df.reset_index(drop=True)

print("Db Response Shape: {}".format(rsrc_df.shape))
print(rsrc_df.columns)

Db Response Shape: (14, 13)
Db Response Shape: (7, 13)
Index(['IMAGES', 'NODE_IDENTIFIER', 'POS', 'RESOURCE', 'RESOURCE_ATTRIBUTION',
       'RESOURCE_DATATYPE', 'RESOURCE_SOURCE', 'RESOURCE_TYPE', 'RESOURCE_URL',
       'TERM', 'TIME_DOWNLOADED', 'TOPIC', 'UNIQUE_IDENTIFIER'],
      dtype='object')


In [8]:
rsrc_df["RESOURCE_SOURCE"]

4        century
5        wordnet
9     wiktionary
10     wikipedia
11         gcide
12     wikipedia
13    ahd-legacy
Name: RESOURCE_SOURCE, dtype: object

### Ingest corpus data from df

In [9]:
def df_to_corpus(df):
    # Load into textacy to delimit sentences
    img_labels = rsrc_df.to_dict(orient="records")
    records = textacy.io.split_records(img_labels, 'RESOURCE',itemwise=True)

    # Load english model
    en = en_core_web_sm.load()
    corpus = textacy.Corpus(lang=en, data=records)
    
    return corpus

In [10]:
captionsCorpus = df_to_corpus(rsrc_df)
captionsCorpus

Corpus(7 docs, 963 tokens)

In [11]:
for doc in captionsCorpus:
    print(doc._.preview)

Doc(37 tokens: "A chemical combination brought about by the act...")
Doc(14 tokens: "synthesis of compounds with the aid of radiant ...")
Doc(24 tokens: "The process by which plants and other photoauto...")
Doc(512 tokens: "Photosynthesis is a process used by plants and ...")
Doc(200 tokens: "The process of constructive metabolism by which...")
Doc(140 tokens: "Photosynthesis is a process used by plants and ...")
Doc(36 tokens: "The process in green plants and certain other o...")


### Split Corpus image text into multiple captions

In [47]:
import input_data_preprocessing.corpus_stats as c_stats
import input_data_preprocessing.utils as idp_utils
import input_data_preprocessing.captions as idp_captions

# Input - A Spacy corpus of texts corresponding to a set of images
print(f"Input Corpus: {captionsCorpus}")

# 1  Calculate corpus stats
corpusStats = c_stats.CorpusStats(captionsCorpus)

# 2  Extract the shortest doc from a corpus
shortestDoc = idp_utils.find_shortest_doc(captionsCorpus, corpusStats.min_tokens)

# 3  Maximize the shortest doc captions and return a list of captions greater than entered
shortestDocCaptions = idp_captions.MaximizeDocCaptions(shortestDoc )

# 4  Segment all the texts in the corpus to equal the number of captions in the shortest doc



DEBUG:root:Finished calculating CorpusStats
INFO:root:Shortest Doc - Number of tokens 14
DEBUG:root:Finished - Maximizimizing number of captions per image
INFO:root:MaximizeDocCaptions - Number of captions 5


Input Corpus: Corpus(7 docs, 963 tokens)


In [48]:
shortestDocCaptions.caption_lst

['synthesis of compounds',
 'aid of radiant',
 'especially in plants',
 'compounds with the aid',
 'aid of radiant energy']

In [82]:

# # 2 Check if we can expand the captions for text less than 2 sents long
# if corpusStats.min_sents <= 2: # TODO add check for all the captions.
     
max_captions = shortestDocCaptions.num_of_captions




corpus_captions_lst = list()
to_shortDocs_lst = list()
# d. Loop over corpus and re-size labels to the max captions i.e ideally split by sentences
for docidx, doc in enumerate(captionsCorpus):
    print(f"\n Resizing Image Caption {docidx}")
    image_captions_lst = list()
    # Check if the doc the labaels need to be minimized or expanded
    if doc._.n_sents==max_captions:
        
        # Split sents into captions and noramlize text i.e lowercase everything.
        image_captions_lst = [idp_captions.normalize_caption_text(sent.text) for sent in doc.sents]
    
    elif doc._.n_sents<max_captions:
        
        # Maximize the captions per image
        maxedCaptions = idp_captions.MaximizeDocCaptions(doc)
#         captions_df  =  idp_utils.captionsLst_to_df(maxedCaptions.caption_lst)
        
#         # Minimize if maxed captions 
#         docCaptions = idp_captions.MinimizeDocCaptions(captions_df, max_captions)
        image_captions_lst = docCaptions.captions_lst
    elif doc._.n_sents>max_captions:
        
        # Split sentences into list  before munging
        captions_lst = [sent.text for sent in doc.sents]        
        
        
        docCaptions = idp_captions.MinimizeDocCaptions(sents_df, max_captions)
        image_captions_lst = docCaptions.captions_lst
            
    corpus_captions_lst.append(image_captions_lst)    

DEBUG:root:Finished - Maximizimizing number of captions per image
INFO:root:MaximizeDocCaptions - Number of captions 14
DEBUG:root:Finished - Maximizimizing number of captions per image
INFO:root:MaximizeDocCaptions - Number of captions 5
DEBUG:root:Finished - Maximizimizing number of captions per image
INFO:root:MaximizeDocCaptions - Number of captions 12
DEBUG:root:Ideal caption length: 483
DEBUG:root:Ideal caption length: 193
DEBUG:root:Finished - Maximizimizing number of captions per image
INFO:root:MaximizeDocCaptions - Number of captions 23



 Resizing Image Caption 0

 Resizing Image Caption 1

 Resizing Image Caption 2

 Resizing Image Caption 3
Number of characters per captions: 554
Number of characters per captions: 412
Number of characters per captions: 302
Number of characters per captions: 320
Number of characters per captions: 412

 Resizing Image Caption 4
Number of characters per captions: 202
Number of characters per captions: 142
Number of characters per captions: 208
Number of characters per captions: 137
Number of characters per captions: 163

 Resizing Image Caption 5

 Resizing Image Caption 6


In [83]:
docCaptions.captions_lst

['the process of constructive metabolism by which carbohydrates are formed from water vapor and the carbon dioxide of the air in the chlorophyll containing tissues of plants exposed to the action of light',
 'it was formerly called assimilation but this is now commonly used as in animal physiology the details of the process are not yet clearly known',
 'baeyer s theory is that the carbon dioxide is reduced to carbon monoxide which uniting with the hydrogen of the water in the cell produces formaldehyde the latter forming various sugars through polymerization',
 'vines suggests that the carbohydrates are secretion products of the chloroplasts derived from decomposition of previously formed proteids',
 'the food substances are usually quickly translocated those that accumulate being changed to starch which appears in the cells almost simultaneously with the sugars']

In [70]:
to_shortDocs_lst
        

[A chemical combination brought about by the action of light, as in the formation of carbohydrates in living plants from the carbon di-oxid and water of the air under the influence of sunlight.,
 synthesis of compounds with the aid of radiant energy (especially in plants),
 The process by which plants and other photoautotrophs generate carbohydrates and oxygen from carbon dioxide, water, and light energy in chloroplasts.,
 The process in green plants and certain other organisms by which carbohydrates are synthesized from carbon dioxide and water using light as an energy source. Most forms of photosynthesis release oxygen as a byproduct.]

In [None]:
def minimize_captions(doc,  min_objective='shape'):
    
    if min_objective=='shape':
        pass
    elif min_objective=='similarity':
        pass

In [None]:
for sent in doc.sents:
    print(type(sent.text))
    x = textacy.preprocess.preprocess_text(str(sent.text), fix_unicode=True, lowercase=False, 
                                       no_urls=False, no_emails=False, 
                                       no_phone_numbers=False, no_numbers=False, 
                                       no_currency_symbols=False, no_punct=False, 
                                       no_contractions=False, 
                                       no_accents=False)

In [None]:
textacy.preprocess.preprocess_text(captions_lst[0], fix_unicode=True, lowercase=False, 
                                       no_urls=False, no_emails=False, 
                                       no_phone_numbers=False, no_numbers=False, 
                                       no_currency_symbols=False, no_punct=False, 
                                       no_contractions=False, 
                                       no_accents=False)

In [None]:
type(captions_lst[0])

In [None]:
list(doc._.n_sents for doc in captionsCorpus)

In [None]:
captions_lst

In [None]:


shortestDocs = captionsCorpus.get(lambda x: len(x)==minToken_num)
shortestDoc = list(shortestDocs)[0]

# 2  Get a list of captions parsed from the doc with the min amount of tokens
captions_lst = maximize_captions(shortestDoc)
max_captions = len(captions_lst)

# 3 Go through the rest of the documents in the corpus 

In [None]:
captions_lst

In [None]:
pattern = textacy.constants.POS_REGEX_PATTERNS["en"]["PP"]
pattern

In [None]:
list(textacy.extract.pos_regex_matches(shortestDoc, p1))

In [None]:
p1 = '<NOUN> <ADP> <NOUN> ' 

In [None]:
list(textacy.extract.pos_regex_matches(shortestDoc, p1))

In [None]:
shortestDoc._.to_tokenized_text

In [None]:
captionsCorpus[1]._.to_bag_of_terms(ngrams=(4), 
                              entities=True, 
                              weighting="binary",
                             as_strings=True)

In [None]:
list(shortestDoc._.to_terms_list(ngrams=(3,4), 
                                 normalize = 'lower',
                                 entities=True, 
                              weighting="binary",
                                 filter_punct = True,
                                 drop_determiners = True,
                             as_strings=True))

In [None]:
list(shortestDoc._.to_terms_list(ngrams=(4), 
                                 normalize = 'lower',
                                 entities=True, 
                              weighting="binary",
                                 filter_punct = True,
                                 drop_determiners = True,
                             as_strings=True))

In [None]:

textacy.spacier.doc_extensions.get_doc_extensions()


In [None]:
shortestDoc

In [None]:
list(textacy.extract.words(shortestDoc, 
                            filter_stops=False, 
                            filter_punct=True, 
                            filter_nums=False,
                            min_freq=1))

In [None]:
list(textacy.extract.noun_chunks(shortestDoc, drop_determiners=False, min_freq=0))

### 2. Make captions for images

In [None]:
shortestDoc._.meta

In [None]:
list(textacy.extract.noun_chunks(shortestDoc, drop_determiners=False, min_freq=1))

### Create Labels for text to image model

In [None]:
def handle_missing_directories(directory_flpth):
    # Handle missing Directory
    if not os.path.exists(directory_flpth):
        
        os.makedirs(directory_flpth)
        print("Made new directory: {}".format(directory_flpth))
        # print(os.path.join(dirname, flpth))
    else:
        pass
    
    return

# Create text file for each doc - Each Doc maps to an image

## TODO: incoroporate number of labals per line
def labels_to_imageTxt_files(rsrc_df, trainingData_term, trainigData_flpth='../data'):
    
    # Handle if a data directory for a term exists e.g. data/photosynthesis
    dirname = os.path.abspath('')
    termData_flpth = os.path.join(dirname, trainigData_flpth)
    handle_missing_directories(termData_flpth)
    
    
    ### Move resource df to textacy
   
    # Load into textacy to delimit sentences
    img_labels = rsrc_df.to_dict(orient="records")
    text_stream, metadata_stream = textacy.io.split_records(img_labels, 'RESOURCE')

    # Load english model
    en = en_core_web_sm.load()
    labels_corpus = textacy.Corpus(lang=en, texts=text_stream, metadatas=metadata_stream)
    
    caption_filename_path = os.path.join(trainigData_flpth, "captions.pickle")
    
    # Loop through corpus and write document to flpth (s3)
    ''' Each doc in a corpus equals and image'''
    for ix, doc in enumerate(labels_corpus):
        print("Number of Sentences: {}".format(doc.n_sents))
        
        # Paths to directories (Where to write the text files)
        filename = "{}_{}.txt".format(trainingData_term, ix)
        path_to_file = "{}/{}".format(trainigData_flpth, filename)
        
        # Write captions for google images
        f =  open(path_to_file, 'w')
        
        # Parse Document into sentences
        for sent in doc.sents:
            caption = textacy.preprocess.preprocess_text(sent.text,
                                               lowercase=True,
                                               no_punct=True
                                              )
            # f.write(label+"\n" )
            f.write(caption+" " )
            
        f.close()
        
    return ix + 1 # Count using 1 as start

# process labels for images


trainingData_term = 'photosynthesis'
txt_trainingData_flpth='{}/text'.format(termTxtToImage_flpth, 'text')

numText_files = labels_to_imageTxt_files(rsrc_df, trainingData_term, txt_trainingData_flpth)
numText_files

### Download Images from google 

In [None]:

def download_images(term ,img_args):
    
    # Download Images 
    response = google_images_download.googleimagesdownload()
    img_paths = response.download(img_args)
    
    # Post Process google image results
    for idx, f in enumerate(img_paths[term]):
        
        # Open Google image resulst and conver to jpeg
        img = PIL.Image.open(f)
        img_filetype = img.format.lower()  # 'JPEG'
        
        rgb_img = img.convert('RGB')
        img.close()
        
        # Make new filenme to allign with text file name
        filename = "{}_{}.{}".format(trainingData_term, idx, 'jpg')
        newfilepath_f = os.path.join(os.path.dirname(f), filename)
        
        # Save and image
        rgb_img.save(newfilepath_f)
        os.remove(f)
    
    
    return response 

img_args = {"keywords":"sun",
             "format": "png",
              "limit": 20,
             "output_directory": 'data',
            "metadata": True,
            "image_directory": "photosynthesis/images",
            "no_download": False,
            "extract_metadata":True
            # "size":"icon"
           }

response = download_images(trainingData_term, img_args)

### 1. Inputs

In [None]:
#import tigDataLoader.utiils as dataloader
import os
import json
import logging

In [None]:
predictTerm = "photosynthesis"
dataTerm = "photosynthesis"

In [None]:
# Base file io inputs
trainData_flpth = os.path.join('data', predictTerm)
text_flpth = os.path.join(trainData_flpth, 'text')
img_flpth =  os.path.join(trainData_flpth, 'images')

# Google imagae download metadata
imageLog_fir='logs'

In [None]:
def transformText_to_captions(txtDoc, numCaptions_per_image=5, txtSplit_method='svo'):
    # Take textacy doc and converts to a list of captions for an image
    
    if txtSplit_method=='noun_chunks':
        split_list = list(textacy.extract.noun_chunks(txtDoc, drop_determiners=False, min_freq=1))
    if txtSplit_method=='svo':
        split_list = list(textacy.extract.subject_verb_object_triples(txtDoc))
    
    return split_list

### 3. Download images
- from google

In [None]:
img_args = {"keywords": dataTerm,
             "format": "png",
              "limit": 100,
             "output_directory": 'data',
            "metadata": True,
            "image_directory": "{}/images/".format(predictTerm),
            "no_download": False,
            "extract_metadata":True,
            "type": "clipart"
           }
img_paths, response = dataloader.download_images(dataTerm, img_args)

In [None]:
# Download text from 

In [None]:
## load text into textacy
imgTxt_corpus = df_to_corpus()


In [None]:
list(labels_corpus.docs[0].sents)

In [None]:
import requests
r = requests.get("https://ssec.si.edu/stemvisions-blog/what-photosynthesis")

In [None]:
r.content

In [None]:
# Create Fake data
import shutil
import itertools

# Get a list of the original files f
txtDirFileNm_lst = glob.glob(text_flpth+"/**/*.txt", recursive=True)
imgDirFileNm_lst = glob.glob(img_flpth+"/**/*.jpg", recursive=True)

fileCombo_lst = list(itertools.product(txtDirFileNm_lst,imgDirFileNm_lst))

# for item in fileCombo_lst:
    
#     # Get text name
#     txtget_relfilename

# # # Create list paired tuple pairs
# # comboFlpth_lst = list(zip(txtDirFileNm_lst, imgDirFileNm_lst))
# # comboFlpth_lst

# # new_list = []
# # for k,v in comboFlpth_lst.iteritems():
# #     new_list.extend([x for x in combinations(v, 2)]) 
# # # for txtFile, imgFile in comboFlpth_dict:
# # #     # Make copies of files and rename
# # #     shutil.copyfile(txtFile, dst)
# # #     shutil.copyfile(imgFile, dst)

# # new_list



In [None]:
import seaborn as sns
df = pd.read_csv("RUNS.csv")
df.shape

In [None]:
sns.relplot(x="s_loss", y="embedding_dim", data=df);

# Notes

In [None]:
greedy_algorithm(captions_df['n_chars'].tolist(), ideal_caption_length)

In [None]:
greedy_algorithm(captions_df['n_chars'].tolist(), ideal_caption_length)

### Load text from s3 to df