In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

%load_ext autotime

import pandas as pd
import io
import boto3
from boto3.dynamodb.conditions import Key
import os
import PIL
import json
import numpy as np
import pathlib
import pickle
import dask

from sklearn.model_selection import train_test_split

import textacy
import en_core_web_sm

import seaborn as sns

# from custo import greedy_algorithm
#import input_data_preprocessing.corpus_stats as c_stats

# Register textacy's custom extension attributes on spaCy Docs; later cells read them as `doc._.<name>` (e.g. `doc._.n_sents`).
textacy.spacier.doc_extensions.set_doc_extensions()
# Load the small English spaCy pipeline once; it is passed as `lang=en` when corpora are built below.
en = en_core_web_sm.load()
#import code
# Record the textacy version in the cell output.
print(textacy.__version__)
# Last expression of the cell: shows the working directory that the relative 'data' path is resolved against.
os.getcwd()


### Load TIG Data

In [None]:
%time
import pytig as ptg

### Inputs

In [None]:

# Folder names: the local data root and the dataset's sub-folders.
data_dir_name = 'data'
metadata_folder_name = 'photosynthesis_raw'
txt_dir_flname, img_dir_flname = "text", 'images'

# Remote zip archive of the raw dataset, named after the metadata folder.
zip_url = f"https://github.com/gryBox/pytig-data/raw/master/{metadata_folder_name}.zip"

# Absolute locations: data root -> metadata folder -> text / image folders.
data_dir_flpth = os.path.abspath(data_dir_name)
metadata_flpth = os.path.join(data_dir_flpth, metadata_folder_name)
text_data_flpth, image_data_flpth = (
    os.path.join(metadata_flpth, subdir) for subdir in (txt_dir_flname, img_dir_flname)
)

# Echo the two resolved roots, one per line.
print(data_dir_flpth, metadata_flpth, sep="\n")

## Load Data

In [None]:
# Load Data
# 1. Load training data for algorithm (Needs to be done even if only using predict)
# Passes the archive URL and the *relative* data directory name to pytig;
# presumably this downloads and unpacks the zip there - confirm against pytig.write.
# NOTE(review): the result is bound to `zipfile`, which would shadow the stdlib module of the same name.
zipfile = ptg.write.zip_to_metadata_dir(zip_url, data_dir_name)

## Prepare Metadata Folder for AttnGAN
- Normalize Filenames
- Create basename file
- Pickle Files
- Sample caption files

### Prepare Filenames

In [None]:
# Prepare the metadata folder for the AttnGAN model


# 1. Prepare filenames
# a. Build the helper from the metadata, image and text folder paths
prpFilenames = ptg.filenames.PrepareFilenames(metadata_flpth, 
                                                   image_data_flpth, 
                                                   text_data_flpth, 
                                                   )
# b. Rename the files on disk to the new filenames and update `fileNames_df` accordingly
prpFilenames.rename_filenames()

# c. Write the basenames to 'filenames.txt' in the metadata folder
prpFilenames.basenames_to_txtfile(basename_flname='filenames.txt')

In [None]:
# Display the filenames table held by the PrepareFilenames helper (shows the whole table, not a preview).
prpFilenames.fileNames_df

### Split Data

In [None]:
# Build the pytig Metadata helper from the same three folder paths; it performs the train/test split in the next cell.
metaFolder = ptg.prepare_metadata_dir.Metadata(metadata_flpth, image_data_flpth, text_data_flpth)

In [None]:
# Split the basenames into train and test sets, holding out 30% for test.
# NOTE(review): no seed is passed here; whether split_data is reproducible cannot be told from this notebook.
train_filenames, test_filenames = metaFolder.split_data(prpFilenames.fileNames_df, 
                                                        test_size=0.3,  
                                                        filenames_clm=prpFilenames.basenameCol)

### Analyze Captions

#### photosynthesis

In [None]:
# 1. Load the photosynthesis caption files (*.txt) into a corpus
captCrps = ptg.write.txt_to_corpus(text_data_flpth, lang=en, txt_extention=".txt")
print(f"\nCorpus Info: {captCrps}")

# 2. Preview one document (index chosen arbitrarily; the corpus must hold at least 6 docs)
example_doc = 5
cap_doc = captCrps[example_doc]
print(f"\nNumber of sentences: {cap_doc._.n_sents}")
print(f"Text: {cap_doc.text}\n")

# Show the per-document statistics for the same document.
# NOTE(review): assumes `docstats_df` is a DataFrame aligned with corpus order - confirm in pytig.
display(captCrps.docstats_df.iloc[example_doc].T)

#### birds

In [None]:
root_dir_nm = 'AttnGAN'
data_dir_nm = 'data'
metadata_dir_nm ='birds'

# NOTE(review): hard-coded absolute path for one specific machine; this also rebinds `metadata_flpth`,
# which until now pointed at the photosynthesis metadata folder.
metadata_flpth = os.path.join("/home/ec2-user/environment/", root_dir_nm, data_dir_nm, metadata_dir_nm)
# Bare expression in the middle of a cell: it is evaluated but nothing is displayed.
metadata_flpth

# Text folder of the birds dataset, plus a quick check that it exists on this machine.
btxt_dir_flpth = os.path.join(metadata_flpth, 'text')
print(btxt_dir_flpth)
print(os.path.exists(btxt_dir_flpth))

In [None]:
# 1. Load the birds caption files (*.txt) into a corpus
birdsCrps = ptg.write.txt_to_corpus(btxt_dir_flpth, lang=en, txt_extention=".txt")
# Report the corpus that was just built (not the photosynthesis corpus `captCrps`).
print(f"\nCorpus Info: {birdsCrps}")

#### coco

In [None]:
root_dir_nm = 'AttnGAN'
data_dir_nm = 'data'
metadata_dir_nm ='coco'

# NOTE(review): hard-coded absolute path for one specific machine; this rebinds `metadata_flpth`.
metadata_flpth = os.path.join("/home/ec2-user/environment/", root_dir_nm, data_dir_nm, metadata_dir_nm)

# Text folder of the coco dataset, plus a quick check that it exists on this machine.
# (Print/check the coco path itself rather than the birds path `btxt_dir_flpth`.)
coco_txt_dir_flpth = os.path.join(metadata_flpth, 'text')
print(coco_txt_dir_flpth)
print(os.path.exists(coco_txt_dir_flpth))

In [None]:
# 1. Load text captions to corpus
cocoCrps = ptg.write.txt_to_corpus(coco_txt_dir_flpth, lang=en, txt_extention=".txt")
# Report the corpus that was just built (not the photosynthesis corpus `captCrps`).
print(f"\nCorpus Info: {cocoCrps}")

In [None]:
# Route bokeh output to the notebook. NOTE(review): `show` is imported but no cell visible here uses bokeh.
from bokeh.io import output_notebook, show
output_notebook()

## Future Work

### Bounding boxes

In [None]:
# Experiment: draw object-detection boxes on one dataset image using chainercv's SSD300.
# (matplotlib is imported here because no earlier cell imports `plt`.)
import matplotlib.pyplot as plt

from chainercv.datasets import voc_bbox_label_names
from chainercv.links import SSD300
from chainercv.utils import read_image
from chainercv.visualizations import vis_bbox

# Image to inspect, taken from the photosynthesis image folder.
img_name = "photosynthesis_0.jpg"
img_flpth = os.path.join(image_data_flpth, img_name)

# Read an RGB image and return it in CHW format.
img = read_image(img_flpth)
# Detector initialised from the 'voc0712' pretrained weights.
model = SSD300(pretrained_model='voc0712')
# predict() takes a list of images; index [0] below selects the results for our single image.
bboxes, labels, scores = model.predict([img])
vis_bbox(img, bboxes[0], labels[0], scores[0],
         label_names=voc_bbox_label_names)
plt.show()

In [None]:
# Scratch: path of the 'filenames.pickle' file inside the metadata folder's 'train' sub-folder.
os.path.join(metaFolder.metadata_flpth, 'train', "filenames.pickle")

In [None]:
# NOTE(review): `df` is only defined in the RUNS.csv cell near the end of the notebook, so a fresh
# top-to-bottom run raises NameError here.
# NOTE(review): no `random_state` is passed, so the 70/30 split is not reproducible between runs.
train, test = train_test_split(df["filename"], test_size=0.3)

In [None]:
# Scratch: dump the full training split as a Python list.
train.to_list()

In [None]:
 # Scratch: last path component of the metadata folder (the dataset name).
 os.path.basename(metaFolder.metadata_flpth)

In [None]:
def sents_to_caption_lst(doc):
    """
    Return the text of every sentence in ``doc`` as a list, in document order.
    """
    captions = []
    for sentence in doc.sents:
        captions.append(sentence.text)
    return captions


# 2. Modify training text data, i.e. lowercase, maximize captions

# 2.1 Read text data into a spaCy corpus using textacy
# NOTE(review): neither `pmd` nor `text_training_data_flpth` is defined in the visible notebook
# (the package is imported as `ptg` and the text folder is `text_data_flpth`); unlike the other
# txt_to_corpus calls, no `lang=` is passed here. Confirm the intended module and path.
crps = pmd.write.txt_to_corpus(text_training_data_flpth)

In [None]:
# NOTE(review): `doc` is not defined by any earlier cell; it only exists as a loop variable in later cells.
doc_captions_lst = sents_to_caption_lst(doc)
doc_captions_lst

In [None]:
# NOTE(review): `crps_file_tag` is only a parameter of write_doc_to_txt (default "file_name"); no global of that name is defined.
doc._.meta[crps_file_tag]

In [None]:
# NOTE(review): `tst_doc` is not defined in the visible notebook, and `text` is not used afterwards.
text = tst_doc.text

def write_doc_to_txt(doc, crps_file_tag="file_name"):
    """
    Write the sentences of ``doc`` to the text file named in its metadata.

    Parameters
    ----------
    doc : spaCy Doc with textacy extensions
        Must expose ``doc._.meta`` (a mapping), ``doc._.n_sents`` and ``doc.sents``.
    crps_file_tag : str
        Key in ``doc._.meta`` whose value is the output file path.

    The sentences are written in order, each followed by a single space.
    The text is written as-is (no lowercasing or punctuation stripping).
    Raises KeyError if ``crps_file_tag`` is missing from the metadata.
    """
    # `with` guarantees the file is closed even if a write fails.
    with open(doc._.meta[crps_file_tag], 'w') as f:
        print(f"Number of Sentences in doc: {doc._.n_sents}")

        # Parse the document into sentences; one caption per sentence.
        for sent in doc.sents:
            caption = sent.text
            f.write(caption+" " )
# write_doc_to_txt requires the document to write; `tst_doc` is the document this cell works with.
write_doc_to_txt(tst_doc)

In [None]:
for doc in crps:
    # Write each document's sentences to the file named in its metadata.
    write_doc_to_txt(doc)

In [None]:
# Scratch: echo the configured data directory name.
data_dir_name

In [None]:
# AttnGAN data folder under the current user's home directory; print whether it exists, then show the path.
data_path = os.path.join(os.path.expanduser("~"), 'imageGen/AttnGAN/data/')
print(os.path.exists(data_path))
data_path

In [None]:

# b. Maximize the number of captions per image
# NOTE(review): neither `idp` nor `lbls_corpus` is defined in the visible notebook
# (the only matching import, `input_data_preprocessing`, is commented out in the first cell).
reshapedCaptions = idp.captions.ReshapeImageLabels(lbls_corpus)

In [None]:
# Scratch: inspect the reshaped captions mapping.
reshapedCaptions.captions_dict

In [None]:
def df_to_corpus(df):
    """
    Build a textacy Corpus from a DataFrame of image labels.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per document. The 'RESOURCE' column holds the text; the
        remaining columns become that document's metadata.

    Returns
    -------
    textacy.Corpus
        Corpus built with the small English spaCy model.
    """
    # Use the frame that was passed in, not a notebook global.
    img_labels = df.to_dict(orient="records")
    # itemwise=True yields one (text, metadata) pair per row.
    records = textacy.io.split_records(img_labels, 'RESOURCE',itemwise=True)

    # Load english model
    en = en_core_web_sm.load()
    corpus = textacy.Corpus(lang=en, data=records)

    return corpus

In [None]:
# Build the captions corpus from the resource table.
# NOTE(review): `rsrc_df` is not created by any cell visible in this notebook.
captionsCorpus = df_to_corpus(rsrc_df)
captionsCorpus

In [None]:
# Print textacy's short preview for every document in the corpus.
for doc in captionsCorpus:
    print(doc._.preview)

### 

In [None]:




# Write new image text 

### Create Labels for text to image model

In [None]:
def handle_missing_directories(directory_flpth):
    """
    Create ``directory_flpth`` (including missing parents) unless the path already exists.

    Prints a one-line notice when a directory is created; returns None either way.
    """
    if os.path.exists(directory_flpth):
        # Nothing to do: the path is already there.
        return

    os.makedirs(directory_flpth)
    print("Made new directory: {}".format(directory_flpth))

# Create text file for each doc - Each Doc maps to an image

## TODO: incorporate number of labels per line
def labels_to_imageTxt_files(rsrc_df, trainingData_term, trainigData_flpth='../data'):
    """
    Write one caption text file per row of ``rsrc_df``.

    Each row's 'RESOURCE' text is split into sentences; every sentence is
    lowercased, stripped of punctuation and written, followed by a space, to
    ``<trainigData_flpth>/<trainingData_term>_<row index>.txt``.

    Parameters
    ----------
    rsrc_df : pandas.DataFrame
        One row per image; must contain a 'RESOURCE' text column.
    trainingData_term : str
        Prefix of the generated file names, e.g. 'photosynthesis'.
    trainigData_flpth : str
        Output directory, resolved against the current working directory.
        It is created if missing.

    Returns
    -------
    int
        Number of text files written (0 for an empty frame).
    """
    # Handle if a data directory for a term exists e.g. data/photosynthesis
    dirname = os.path.abspath('')
    termData_flpth = os.path.join(dirname, trainigData_flpth)
    handle_missing_directories(termData_flpth)

    # Move the resource df into textacy so sentences can be delimited.
    # itemwise=True yields (text, metadata) records, the same form df_to_corpus feeds to Corpus.
    img_labels = rsrc_df.to_dict(orient="records")
    records = textacy.io.split_records(img_labels, 'RESOURCE', itemwise=True)

    # Load english model
    en = en_core_web_sm.load()
    labels_corpus = textacy.Corpus(lang=en, data=records)

    # Each doc in the corpus corresponds to one image.
    n_files = 0
    for ix, doc in enumerate(labels_corpus):
        # Sentence count comes from the textacy Doc extension, as elsewhere in this notebook.
        print("Number of Sentences: {}".format(doc._.n_sents))

        # Where to write this document's captions
        filename = "{}_{}.txt".format(trainingData_term, ix)
        path_to_file = "{}/{}".format(trainigData_flpth, filename)

        # `with` closes the file even if preprocessing or a write fails.
        with open(path_to_file, 'w') as f:
            for sent in doc.sents:
                caption = textacy.preprocess.preprocess_text(sent.text,
                                                   lowercase=True,
                                                   no_punct=True
                                                  )
                f.write(caption+" " )

        n_files = ix + 1  # Count using 1 as start

    return n_files

# process labels for images


trainingData_term = 'photosynthesis'
# Caption files go in the `text` sub-folder of the term's directory.
# (The format string has a single placeholder, so only one argument is passed.)
# NOTE(review): `termTxtToImage_flpth` and `rsrc_df` are not defined in any cell visible here.
txt_trainingData_flpth='{}/text'.format(termTxtToImage_flpth)

numText_files = labels_to_imageTxt_files(rsrc_df, trainingData_term, txt_trainingData_flpth)
numText_files

### Download Images from Google

In [None]:

def download_images(term ,img_args):
    """
    Download images with google_images_download and normalise them to JPEG.

    Every downloaded file listed under ``term`` is converted to RGB, saved next
    to the original as ``<term>_<index>.jpg`` and the original file is removed.

    Parameters
    ----------
    term : str
        Key used to look up the downloaded paths, and prefix of the new file
        names. It must match the search keyword in ``img_args`` (the paths
        mapping is keyed by keyword), otherwise a KeyError is raised.
    img_args : dict
        Arguments passed straight to ``googleimagesdownload().download``.

    Returns
    -------
    The ``googleimagesdownload`` object that was used.

    NOTE(review): `google_images_download` is not imported in any cell visible
    in this notebook; it must be in scope before this is called.
    """
    # Download Images
    response = google_images_download.googleimagesdownload()
    result = response.download(img_args)
    # Some releases return `(paths, errors)` instead of the paths mapping alone; accept both.
    img_paths = result[0] if isinstance(result, tuple) else result

    # Post-process the downloaded files
    for idx, f in enumerate(img_paths[term]):

        # Open the downloaded image and convert to RGB; the context manager closes the source file.
        with PIL.Image.open(f) as img:
            rgb_img = img.convert('RGB')

        # New filename aligned with the caption text files (<term>_<idx>)
        filename = "{}_{}.{}".format(term, idx, 'jpg')
        newfilepath_f = os.path.join(os.path.dirname(f), filename)

        # Save the converted image
        rgb_img.save(newfilepath_f)
        # Never delete the file just written when the download already had the target name.
        if os.path.abspath(f) != os.path.abspath(newfilepath_f):
            os.remove(f)

    return response

# The search keyword must equal the term passed to download_images, because the
# downloaded paths are looked up by `term`; a different keyword (previously "sun") raises KeyError.
img_args = {"keywords": trainingData_term,
             "format": "png",
              "limit": 20,
             "output_directory": 'data',
            "metadata": True,
            "image_directory": "photosynthesis/images",
            "no_download": False,
            "extract_metadata":True
            # "size":"icon"
           }

response = download_images(trainingData_term, img_args)

### 1. Inputs

In [None]:
#import tigDataLoader.utiils as dataloader
import os
import json
import logging

In [None]:
# predictTerm names the data sub-folder used below; dataTerm is the image search keyword.
predictTerm = "photosynthesis"
dataTerm = "photosynthesis"

In [None]:
# Base file I/O inputs (relative to the working directory)
trainData_flpth = os.path.join('data', predictTerm)
text_flpth = os.path.join(trainData_flpth, 'text')
# NOTE(review): rebinds `img_flpth`, which the bounding-box cell used for a single image file path.
img_flpth =  os.path.join(trainData_flpth, 'images')

# Google image download metadata
# NOTE(review): not referenced by any other cell visible here.
imageLog_fir='logs'

In [None]:
def transformText_to_captions(txtDoc, numCaptions_per_image=5, txtSplit_method='svo'):
    """
    Split a textacy/spaCy doc into a list of candidate captions for an image.

    Parameters
    ----------
    txtDoc : spaCy Doc
        Document to split.
    numCaptions_per_image : int
        Currently unused; kept so existing callers keep working.
    txtSplit_method : {'svo', 'noun_chunks'}
        'svo' extracts subject-verb-object triples; 'noun_chunks' extracts
        noun chunks (determiners kept, minimum frequency 1).

    Returns
    -------
    list
        The extracted items, in the order textacy yields them.

    Raises
    ------
    ValueError
        If ``txtSplit_method`` is not one of the supported values.
    """
    if txtSplit_method == 'noun_chunks':
        return list(textacy.extract.noun_chunks(txtDoc, drop_determiners=False, min_freq=1))
    if txtSplit_method == 'svo':
        return list(textacy.extract.subject_verb_object_triples(txtDoc))

    # Fail with a clear message instead of an UnboundLocalError on the return line.
    raise ValueError(
        "txtSplit_method must be 'svo' or 'noun_chunks', got {!r}".format(txtSplit_method)
    )

### 3. Download images
- from google

In [None]:
# Search arguments: up to 100 clip-art PNG results for `dataTerm`, stored under data/<predictTerm>/images/.
img_args = {"keywords": dataTerm,
             "format": "png",
              "limit": 100,
             "output_directory": 'data',
            "metadata": True,
            "image_directory": "{}/images/".format(predictTerm),
            "no_download": False,
            "extract_metadata":True,
            "type": "clipart"
           }
# NOTE(review): `dataloader` is never imported (its import is commented out in the "1. Inputs" cell).
# This call also expects two return values, unlike the notebook-level download_images, which returns one.
img_paths, response = dataloader.download_images(dataTerm, img_args)

In [None]:
# Download text from 

In [None]:
## load text into textacy
# df_to_corpus requires the source DataFrame; pass the resource table, as the earlier captionsCorpus cell does.
imgTxt_corpus = df_to_corpus(rsrc_df)


In [None]:
# NOTE(review): `labels_corpus` only exists as a local inside labels_to_imageTxt_files; no cell defines it globally.
list(labels_corpus.docs[0].sents)

In [None]:
import requests
# Bound the wait (seconds) so a stalled server cannot hang the notebook, and fail loudly on an
# HTTP error instead of silently keeping an error page as the article text.
r = requests.get("https://ssec.si.edu/stemvisions-blog/what-photosynthesis", timeout=30)
r.raise_for_status()

In [None]:
# Scratch: dumps the entire raw response body (bytes) into the cell output.
r.content

In [None]:
# Create Fake data
import glob  # used below but not imported by any earlier cell
import shutil
import itertools

# Get a list of the original text and image files (searched recursively)
txtDirFileNm_lst = glob.glob(text_flpth+"/**/*.txt", recursive=True)
imgDirFileNm_lst = glob.glob(img_flpth+"/**/*.jpg", recursive=True)

# Every (text file, image file) combination - the Cartesian product, not a one-to-one pairing.
fileCombo_lst = list(itertools.product(txtDirFileNm_lst,imgDirFileNm_lst))

# for item in fileCombo_lst:
    
#     # Get text name
#     txtget_relfilename

# # # Create list paired tuple pairs
# # comboFlpth_lst = list(zip(txtDirFileNm_lst, imgDirFileNm_lst))
# # comboFlpth_lst

# # new_list = []
# # for k,v in comboFlpth_lst.iteritems():
# #     new_list.extend([x for x in combinations(v, 2)]) 
# # # for txtFile, imgFile in comboFlpth_dict:
# # #     # Make copies of files and rename
# # #     shutil.copyfile(txtFile, dst)
# # #     shutil.copyfile(imgFile, dst)

# # new_list



In [None]:
# seaborn is already imported in the first cell; this import is redundant.
import seaborn as sns
# Load the runs table from the working directory and show its (rows, columns).
# NOTE(review): this rebinds `df`, which the earlier train_test_split cell also reads.
df = pd.read_csv("RUNS.csv")
df.shape

In [None]:
# Scatter of embedding_dim (y) against s_loss (x); the trailing `;` suppresses the object repr.
sns.relplot(x="s_loss", y="embedding_dim", data=df);

# Notes

In [None]:
# NOTE(review): `greedy_algorithm` is never imported (its import is commented out in the first cell),
# and `captions_df` / `ideal_caption_length` are not defined in the visible notebook.
greedy_algorithm(captions_df['n_chars'].tolist(), ideal_caption_length)

In [None]:
# NOTE(review): repeats the call in the previous cell verbatim; the names it uses are undefined in the visible notebook.
greedy_algorithm(captions_df['n_chars'].tolist(), ideal_caption_length)

### Load text from s3 to df