In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import os
import boto3
import timeit

In [2]:
def s3_model_data(meta_data, countries=("bangladesh",), max_images=1000,
                  random_state=None, bucket="w210-poverty-mapper"):
    """
    Build leave-one-country-out train/val/test image sets on S3.

    For each requested country, the rows of ``meta_data`` whose parsed
    country equals it become the test set; every other row is split 80/20
    into train and validation sets. For each of the three sets, at most
    ``max_images`` images are copied inside ``bucket`` to
    ``modeling/images/<country>_<set>/`` and a tab-separated, header-less
    ``.lst`` file (columns: id, bin, name) describing the same rows is
    written to ``modeling/metadata/lsts/<country>_<set>_lst/``.

    Parameters
    ----------
    meta_data : pandas.DataFrame
        Needs the columns ``countries``, ``filename`` and ``bin``.
        ``filename`` is used as the S3 key of the image once ".png" is
        appended. ``countries`` is presumably a stringified one-element
        list such as "['Bangladesh']" (TODO confirm against the data).
        The caller's frame is not modified; derived columns are added to
        an internal copy.
    countries : iterable of str, str or None, default ("bangladesh",)
        Countries to hold out as test sets (compared case-insensitively).
        ``None`` processes every distinct country found in ``meta_data``.
    max_images : int or None, default 1000
        Cap on the rows copied / listed per set. ``None`` removes the cap.
    random_state : int or None, default None
        Seed forwarded to ``shuffle`` and ``train_test_split``. Leave as
        ``None`` for a different split on every run; pass an int to make
        the split reproducible.
    bucket : str, default "w210-poverty-mapper"
        Bucket that holds the source images and receives all outputs.

    Returns
    -------
    None
        Progress and the elapsed time (in minutes) are printed.
    """

    start = timeit.default_timer()

    # Set boto3 resource
    s3 = boto3.resource("s3")

    # Derive helper columns on a copy so the caller's frame stays untouched.
    png_path = meta_data["filename"] + ".png"
    data = meta_data.assign(
        # Strip the list punctuation around the country name and lower-case it
        countries_parsed=meta_data["countries"].str.lstrip("['").str.rstrip("']").str.lower(),
        # Full S3 key of the image
        path=png_path,
        # File name only (last path component)
        name=png_path.str.rsplit("/", n=1).str[-1],
        # Unique running ID used as the first .lst column
        id=range(len(meta_data)),
    )

    # Resolve which countries to process
    if countries is None:
        country_list = list(data["countries_parsed"].dropna().unique())
    elif isinstance(countries, str):
        country_list = [countries.lower()]
    else:
        country_list = [c.lower() for c in countries]

    lst_root = "s3://" + bucket + "/modeling/metadata/lsts/"

    for country in country_list:
        print(country)

        is_test = data["countries_parsed"] == country

        # Held-out country, shuffled
        test = shuffle(data[is_test], random_state=random_state)

        # Shuffled 80/20 train/validation split of every other country
        train, val = train_test_split(data[~is_test], test_size=0.2,
                                      random_state=random_state)

        for key, value in (("test", test), ("train", train), ("val", val)):
            # The same capped selection drives both the copy and the .lst
            # file, so the two can never disagree.
            selected = value if max_images is None else value.head(max_images)

            image_prefix = "modeling/images/" + country + "_" + key + "/"
            for source_key, file_name in zip(selected["path"], selected["name"]):
                copy_source = {"Bucket": bucket, "Key": source_key}
                s3.meta.client.copy(copy_source, bucket, image_prefix + file_name)

            # Set output path
            path = lst_root + country + "_" + key + "_lst/"

            # Write .lst file (id, bin, name; tab separated, no header/index)
            selected[["id", "bin", "name"]].to_csv(
                path + country + "_" + key + ".lst",
                index=False, sep="\t", header=False)
            print("wrote {} to s3".format(path))

    stop = timeit.default_timer()
    # Elapsed wall-clock time in minutes
    print('Time: ', round((stop - start)/60, 2))
            

In [3]:
# Read the final metadata CSV directly from its S3 location.
meta_data = pd.read_csv(
    filepath_or_buffer="s3://w210-poverty-mapper/modeling/metadata/meta_data_final.csv"
)

In [4]:
# Copy the images and write the .lst files to S3 for the loaded metadata.
s3_model_data(meta_data=meta_data)

bangladesh
wrote s3://w210-poverty-mapper/modeling/metadata/lsts/bangladesh_test_lst/ to s3
wrote s3://w210-poverty-mapper/modeling/metadata/lsts/bangladesh_train_lst/ to s3
wrote s3://w210-poverty-mapper/modeling/metadata/lsts/bangladesh_val_lst/ to s3
Time:  9.24
