### First, make sure your Atlas Data Lake configuration is set. Mine is configured to use both S3 and an Atlas cluster as data stores, each exposed as its own database:

```json
{
  "databases": [
    {
      "name": "esteininger-personal-datalake-s3",
      "collections": [
        {
          "name": "*",
          "dataSources": [
            {
              "path": "{collectionName()}",
              "storeName": "esteininger-personal-datalake"
            }
          ]
        }
      ],
      "views": []
    },
    {
      "name": "esteininger-personal-datalake-atlas",
      "collections": [
        {
          "name": "*",
          "dataSources": [
            {
              "database": "analytics",
              "storeName": "FINRA-DXT"
            }
          ]
        }
      ],
      "views": []
    }
  ],
  "stores": [
    {
      "provider": "s3",
      "bucket": "esteininger-personal-datalake",
      "delimiter": "/",
      "includeTags": false,
      "name": "esteininger-personal-datalake",
      "region": "us-east-2"
    },
    {
      "provider": "atlas",
      "clusterName": "FINRA-DXT",
      "name": "FINRA-DXT",
      "projectId": "5e382c949ccf640b0d48ec82"
    }
  ]
}

```


In [79]:
from datetime import datetime
import ssl
from config import datalake_url
from pymongo import MongoClient
from pprint import pprint

# Create connection objects

In [None]:
# Open a single client for the Data Lake endpoint and derive both database
# handles from it, so only one connection pool is created for the same URL.
# NOTE(review): ssl.CERT_NONE turns off TLS certificate verification; fine for
# a personal demo, but confirm before pointing this at anything sensitive.
# NOTE(review): `ssl_cert_reqs` was removed in PyMongo 4.0 in favour of
# `tlsAllowInvalidCertificates` - confirm the installed driver version.
dl_client = MongoClient(datalake_url, ssl_cert_reqs=ssl.CERT_NONE)

# Database backed by the Atlas cluster store.
dl_atlas_conn = dl_client['esteininger-personal-datalake-atlas']
# Database backed by the S3 bucket store.
dl_s3_conn = dl_client['esteininger-personal-datalake-s3']

# Write the query results out to S3

In [None]:
# Stage 1 keeps documents whose `inc` value is below 25 (the author's way of
# splitting the collection in half); stage 2 writes the matches to the S3
# bucket in JSON format.
pipeline = [
    {'$match': {'inc': {'$lt': 25}}},
    {
        '$out': {
            's3': dict(
                bucket='esteininger-personal-datalake',
                filename='analytics',
                region="us-east-2",
                format={'name': 'json'},
            ),
        },
    },
]

# Run the aggregation on the `clickstream` collection of the Atlas-backed
# database, then print whatever the cursor yields followed by a status message.
c = dl_atlas_conn.clickstream.aggregate(pipeline)
pprint(list(c))
pprint('Archive created!')


# Create the schema

In [None]:
# Ask each database to generate and store its SQL schema. Per the original
# author's note, some key/value pairs can optionally be excluded and the
# sample size defaults to 1000 documents.
# A fresh command document is built for every call; the S3-backed database is
# processed first, then the Atlas-backed one.
s3_schema, atlas_schema = (
    conn.command({'sqlGenerateSchema': 1, 'setSchemas': True})
    for conn in (dl_s3_conn, dl_atlas_conn)
)

print("s3 sql schema", s3_schema)
print("atlas sql schema", atlas_schema)

# Show that the schema was created

In [None]:
# Fetch the stored SQL schema of the `clickstream` collection from the
# Atlas-backed database.
atlas_clickstream = dl_atlas_conn.command({'sqlGetSchema': "clickstream"})

# Fetch the schema of the exported data from the S3-backed database.
# Previously this repeated the Atlas `clickstream` lookup, so both variables
# held the same schema. `analytics.1` is the collection name this notebook
# queries over SQL on the S3 database - verify it matches the object that
# `$out` actually wrote to the bucket.
s3_analytics = dl_s3_conn.command({'sqlGetSchema': "analytics.1"})

print(atlas_clickstream)
print(s3_analytics)

# Query MongoDB cluster

In [None]:
# SQL statement to run against the Atlas-backed database (first two rows of
# the `clickstream` collection).
sql = "select * from `clickstream` limit 2"

# A single `$sql` stage carrying the statement, requested in "jdbc" format
# with the "mysql" dialect.
pipeline = [{'$sql': dict(statement=sql, format="jdbc", dialect="mysql")}]

# Database-level aggregate: the statement itself names the collection.
a_sql_q = dl_atlas_conn.aggregate(pipeline)
pprint(list(a_sql_q))

# Query S3 via Atlas Data Lake

In [None]:
# SQL statement to run against the S3-backed database (first two rows of the
# `analytics.1` collection).
sql = "select * from `analytics.1` limit 2"

# A single `$sql` stage carrying the statement, requested in "jdbc" format
# with the "mysql" dialect.
pipeline = [{'$sql': dict(statement=sql, format="jdbc", dialect="mysql")}]

# Database-level aggregate: the statement itself names the collection.
r = dl_s3_conn.aggregate(pipeline)
pprint(list(r))