# Sandbox for FilesDataConnector

In [5]:
from great_expectations.execution_environment import *
from great_expectations.execution_environment.execution_environment import *

from great_expectations.execution_environment.data_connector import *


In [6]:
import yaml

In [7]:
# Loading parameters.
# Execution engine settings handed to ExecutionEnvironment in a later cell.
execution_engine = {
    "class_name": "PandasExecutionEngine",
    "module_name": "great_expectations.execution_engine.pandas_execution_engine",
}

# The YAML documents below are alternative configs for the same connector key
# ("my_files_connector"). They are plain strings rather than f-strings: nothing
# is interpolated, and an f-string would try to evaluate any `{...}` that a
# regex quantifier or YAML flow mapping might introduce later.

# Nested layout: the directory part of each path is captured as "subdir_name".
yaml_str_nested = """
my_files_connector:
    class_name: FilesDataConnector
    base_directory: /Users/work/Development/GE_Data/NestedDirectory/
    asset_param:
        test_assets:
            partition_regex: (.*)/file.*
            partition_param: ["subdir_name"]
            partition_delimiter: "-"
            reader_method: read_csv
"""

# World-cup data: partition settings are commented out inside the YAML, so only
# reader_method is configured for the asset.
yaml_world_cup = """
my_files_connector:
    class_name: FilesDataConnector
    base_directory: /Users/work/Development/GE_Data/world-cup-predictions/
    asset_param:
        test_assets:
            #partition_regex: (.*)/file.*
            #partition_param: ["subdir_name"]
            #partition_delimiter: "-"
            reader_method: read_csv
"""

# March-madness data: the directory part of each bracket path is captured as
# "gender".
yaml_march_madness = """
my_files_connector:
    class_name: FilesDataConnector
    base_directory: /Users/work/Development/GE_Data/march-madness-predictions-2015/
    asset_param:
        test_assets:
            partition_regex: (.*)/bracket.*
            partition_param: ["gender"]
            partition_delimiter: "-"
            reader_method: read_csv
"""





# Select which of the YAML configs defined above is parsed into the
# data_connectors mapping; swap the variable name to try another layout.
data_connectors = yaml.safe_load(yaml_march_madness)

In [8]:
# Build the environment under test from the engine settings and the parsed
# connector config defined in the earlier cells.
execution_environment = ExecutionEnvironment(
    name="foo",
    execution_engine=execution_engine,
    data_connectors=data_connectors,
)

In [9]:
def test_data_connector_config(my_connector):
    data_asset_names = my_connector.get_available_data_asset_names()
    print("=== Data asset names ===")
    for key in data_asset_names.keys():
        print(key)
        asset_list = data_asset_names[key]
        for asset in asset_list:
            print("\t" + asset)
    
    print("\n=== Partitions ===")
    for data_asset_name in data_asset_names:
        partitions = my_connector.get_available_partitions(data_asset_name=data_asset_name)
        for partition in partitions:
            print(str(data_asset_name) + " : " + str(partition))

In [10]:
# Fetch the connector configured under the "my_files_connector" key, then
# print its data asset names and partitions.
my_connector = execution_environment.get_data_connector("my_files_connector")
#result_we_get = my_connector.get_available_partitions(data_asset_name="test_assets")
test_data_connector_config(my_connector)


=== Data asset names ===
test_assets
	/Users/work/Development/GE_Data/march-madness-predictions-2015/mens/bracket-00-no-moore.tsv
	/Users/work/Development/GE_Data/march-madness-predictions-2015/mens/bracket-00.tsv
	/Users/work/Development/GE_Data/march-madness-predictions-2015/mens/bracket-01.tsv
	/Users/work/Development/GE_Data/march-madness-predictions-2015/mens/bracket-02.tsv
	/Users/work/Development/GE_Data/march-madness-predictions-2015/mens/bracket-03.tsv
	/Users/work/Development/GE_Data/march-madness-predictions-2015/mens/bracket-04.tsv
	/Users/work/Development/GE_Data/march-madness-predictions-2015/mens/bracket-05.tsv
	/Users/work/Development/GE_Data/march-madness-predictions-2015/mens/bracket-06.tsv
	/Users/work/Development/GE_Data/march-madness-predictions-2015/mens/bracket-07.tsv
	/Users/work/Development/GE_Data/march-madness-predictions-2015/mens/bracket-08.tsv
	/Users/work/Development/GE_Data/march-madness-predictions-2015/mens/bracket-09.tsv
	/Users/work/Development/GE_Da

In [24]:
# NOTE(review): the recorded output ('100', '101', '102') matches the
# NestedDirectory layout rather than the march-madness config loaded earlier,
# and the execution count jumps from 10 to 24 — presumably this cell was last
# run with yaml_str_nested; confirm with Restart Kernel -> Run All.
result_we_get = my_connector.get_available_partition_ids(data_asset_name="test_assets")
result_we_get

['100', '100', '100', '101', '101', '101', '102', '102', '102']

In [25]:
# NOTE(review): the recorded output is keyed by 'subdir_name', which is the
# partition_param of yaml_str_nested; yaml_march_madness uses "gender".
# Presumably stale output from an earlier config — confirm on a fresh kernel.
result_we_get = my_connector.get_available_partition_definitions(data_asset_name="test_assets")
result_we_get

[{'subdir_name': '100'},
 {'subdir_name': '100'},
 {'subdir_name': '100'},
 {'subdir_name': '101'},
 {'subdir_name': '101'},
 {'subdir_name': '101'},
 {'subdir_name': '102'},
 {'subdir_name': '102'},
 {'subdir_name': '102'}]

# Experiments

In [49]:
import glob
import os
import re

In [50]:
# "*.csv" only matches entries directly inside NestedDirectory; recursive=True
# has no effect without a "**" component, hence the empty recorded result (the
# CSV files live one level down, in subdirectories).
glob.glob("/Users/work/Development/GE_Data/NestedDirectory" + "/*.csv", recursive=True)

[]

In [51]:
# With recursive=True, "**" matches the base directory itself plus every nested
# directory and file, so the result mixes directories and files.
glob.glob("/Users/work/Development/GE_Data/NestedDirectory" + "/**", recursive=True)

['/Users/work/Development/GE_Data/NestedDirectory/',
 '/Users/work/Development/GE_Data/NestedDirectory/102',
 '/Users/work/Development/GE_Data/NestedDirectory/102/file_2020_1.csv',
 '/Users/work/Development/GE_Data/NestedDirectory/102/file_2020_3.csv',
 '/Users/work/Development/GE_Data/NestedDirectory/102/file_2020_2.csv',
 '/Users/work/Development/GE_Data/NestedDirectory/100',
 '/Users/work/Development/GE_Data/NestedDirectory/100/file_2020_1.csv',
 '/Users/work/Development/GE_Data/NestedDirectory/100/file_2020_3.csv',
 '/Users/work/Development/GE_Data/NestedDirectory/100/file_2020_2.csv',
 '/Users/work/Development/GE_Data/NestedDirectory/101',
 '/Users/work/Development/GE_Data/NestedDirectory/101/file_2020_1.csv',
 '/Users/work/Development/GE_Data/NestedDirectory/101/file_2020_3.csv',
 '/Users/work/Development/GE_Data/NestedDirectory/101/file_2020_2.csv']

In [52]:
# Without recursive=True, "**" behaves like a single "*": only the immediate
# children of NestedDirectory are returned.
glob.glob("/Users/work/Development/GE_Data/NestedDirectory" + "/**")

['/Users/work/Development/GE_Data/NestedDirectory/102',
 '/Users/work/Development/GE_Data/NestedDirectory/100',
 '/Users/work/Development/GE_Data/NestedDirectory/101']

In [53]:
# glob patterns are shell-style wildcards, not regular expressions: "(", "." and
# ")" are matched literally (only "*" is a wildcard), so no path matches and the
# recorded result is empty. Regex filtering has to be done separately with `re`.
glob.glob("/Users/work/Development/GE_Data/NestedDirectory/(.*)/file_(.*)_(.*).csv", recursive=True)

[]

In [54]:
# Recursively list everything under NestedDirectory, order the paths, then keep
# only regular files (directory entries are dropped by os.path.isfile).
globs = glob.glob("/Users/work/Development/GE_Data/NestedDirectory" + "/**", recursive=True)
globs.sort()
files = list(filter(os.path.isfile, globs))
print(files)

['/Users/work/Development/GE_Data/NestedDirectory/100/file_2020_1.csv', '/Users/work/Development/GE_Data/NestedDirectory/100/file_2020_2.csv', '/Users/work/Development/GE_Data/NestedDirectory/100/file_2020_3.csv', '/Users/work/Development/GE_Data/NestedDirectory/101/file_2020_1.csv', '/Users/work/Development/GE_Data/NestedDirectory/101/file_2020_2.csv', '/Users/work/Development/GE_Data/NestedDirectory/101/file_2020_3.csv', '/Users/work/Development/GE_Data/NestedDirectory/102/file_2020_1.csv', '/Users/work/Development/GE_Data/NestedDirectory/102/file_2020_2.csv', '/Users/work/Development/GE_Data/NestedDirectory/102/file_2020_3.csv']


In [55]:
# NOTE(review): this cell repeats the previous cell verbatim; it rebuilds the
# same sorted listing and file-only subset.
all_entries_pattern = "/Users/work/Development/GE_Data/NestedDirectory" + "/**"
globs = sorted(glob.glob(all_entries_pattern, recursive=True))
files = list(path for path in globs if os.path.isfile(path))
print(files)

['/Users/work/Development/GE_Data/NestedDirectory/100/file_2020_1.csv', '/Users/work/Development/GE_Data/NestedDirectory/100/file_2020_2.csv', '/Users/work/Development/GE_Data/NestedDirectory/100/file_2020_3.csv', '/Users/work/Development/GE_Data/NestedDirectory/101/file_2020_1.csv', '/Users/work/Development/GE_Data/NestedDirectory/101/file_2020_2.csv', '/Users/work/Development/GE_Data/NestedDirectory/101/file_2020_3.csv', '/Users/work/Development/GE_Data/NestedDirectory/102/file_2020_1.csv', '/Users/work/Development/GE_Data/NestedDirectory/102/file_2020_2.csv', '/Users/work/Development/GE_Data/NestedDirectory/102/file_2020_3.csv']


In [60]:
# Regex counterpart of the glob experiments: keep only the paths shaped like
# <dir>/file_<a>_<b>.csv.
# The dot before "csv" is escaped so it matches a literal "." only; unescaped,
# it matched any character (e.g. "file_2020_1xcsv").
glob_config = {"partition_regex": r"(.*)/file_(.*)_(.*)\.csv"}

pattern = re.compile(glob_config["partition_regex"])

# fullmatch() anchors both ends of the path, so trailing text such as
# ".csv.bak" is rejected; match() only anchors the start.
files_to_return = [file for file in files if pattern.fullmatch(file)]

# Python 3.8+ alternative that keeps the match object via the walrus operator:
# pattern = re.compile('.*search.*')
# items = ['hello', 'searched', 'world', 'still', 'searching']
# if any((match := pattern.match(x)) for x in items):
#     print(match.group(0))
# lines_to_log = [line for line in output if r.match(line)]

files_to_return






['/Users/work/Development/GE_Data/NestedDirectory/100/file_2020_1.csv',
 '/Users/work/Development/GE_Data/NestedDirectory/100/file_2020_2.csv',
 '/Users/work/Development/GE_Data/NestedDirectory/100/file_2020_3.csv',
 '/Users/work/Development/GE_Data/NestedDirectory/101/file_2020_1.csv',
 '/Users/work/Development/GE_Data/NestedDirectory/101/file_2020_2.csv',
 '/Users/work/Development/GE_Data/NestedDirectory/101/file_2020_3.csv',
 '/Users/work/Development/GE_Data/NestedDirectory/102/file_2020_1.csv',
 '/Users/work/Development/GE_Data/NestedDirectory/102/file_2020_2.csv',
 '/Users/work/Development/GE_Data/NestedDirectory/102/file_2020_3.csv']