### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [None]:
# Upgrade Oracle ADS to pick up latest features and maintain compatibility with Oracle Cloud Infrastructure.
# !pip install -U oracle-ads

## Preparando Instância Data Flow Studio

In [None]:
# Import the ADS library and select resource-principal authentication.
import ads

ads.set_auth("resource_principal")

In [None]:
# import os

# compartment_id = os.environ.get("NB_SESSION_COMPARTMENT_OCID")
# logs_bucket_uri = "oci://bucket-logs@id3kyspkytmr"
# archive_uri = "oci://bucket-library@id3kyspkytmr/archive3.zip"


In [None]:
import json

def prepare_command(command: dict) -> str:
    """Serialize ``command`` to JSON and wrap the result in single quotes.

    The quoted string is what this notebook hands to the Data Flow magics
    through ``-c $command``.
    """
    payload = json.dumps(command)
    return "'" + payload + "'"

In [None]:
%load_ext dataflow.magics

In [None]:
%help

In [None]:
# command = prepare_command(
#     {
#         "compartmentId": compartment_id,
#         "displayName": "App_Demo_DataFlowStudio",
#         "language": "PYTHON",
#         "sparkVersion": "3.2.1",
#         "numExecutors": 4,
#         "archiveUri": archive_uri,
#         "driverShape": "VM.Standard.E4.Flex",
#         "executorShape": "VM.Standard.E4.Flex",
#         "driverShapeConfig": {"ocpus": 1, "memoryInGBs": 8},
#         "executorShapeConfig": {"ocpus": 1, "memoryInGBs": 8},
#         "logsBucketUri": logs_bucket_uri,
#         "type": "SESSION",
#         "logsBucketUri": logs_bucket_uri,
#         }
# ) 
# %create_session -l python -c $command

In [None]:
%use_session -s ocid1.dataflowapplication.oc1.sa-saopaulo-1.antxeljrofnirbya4kxzg7llt5xpbjbndrwqz4dtx32yhhdjxm3l7phgzspq

In [None]:
# %status

In [None]:
# %stop_session

In [None]:
import os
# Build the single-quoted JSON payload that the %activate_session magic below
# receives through `-c $command`.
command = prepare_command(
    {
        # .get() has no default here, so an unset variable serializes as JSON null.
        "compartmentId": os.environ.get("NB_SESSION_COMPARTMENT_OCID"),
        "displayName": "App_Demo_DataFlowStudio",
        # Same application OCID that is passed to %use_session earlier in the notebook.
        "applicationId": "ocid1.dataflowapplication.oc1.sa-saopaulo-1.antxeljrofnirbya4kxzg7llt5xpbjbndrwqz4dtx32yhhdjxm3l7phgzspq",
    }
)

%activate_session -l python -c $command

In [None]:
%configure_session -f -i \
'{"configuration": {\
    "spark.archives": "oci://conda-envs@grea08wzjvwv/conda_environments/cpu/PySpark 3.2 and Data Flow/2.0/pyspark32_p38_cpu_v2#conda" \
} \
}'

## Delta Lake Script

In [None]:
%%spark
import os
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
from pyspark.sql.functions import col
from pyspark.sql.functions import lit
from delta import *
from datetime import datetime

In [None]:
%%spark
# Session builder named "AppInLabDelta" with the Delta SQL extension and the
# Delta catalog registered as `spark_catalog`.
builder = (
    pyspark.sql.SparkSession.builder.appName("AppInLabDelta")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# configure_spark_with_delta_pip is presumably provided by the earlier
# `from delta import *`; it wraps the builder before the session is created.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
%%spark
# Load every Parquet file under the 2019/ prefix of the bucket into a DataFrame.
# Parquet files carry their own schema, so the CSV-style reader options
# `header` / `inferSchema` have no meaning here and are not passed.
df_nyc_tlc = spark.read.parquet("oci://bucket_tlc@grea08wzjvwv/2019/*.parquet")

In [None]:
%%spark
df_nyc_tlc.printSchema()

In [None]:
%%spark
df_nyc_tlc.show(2)

In [None]:
%%spark
# Persist the pickup/dropoff timestamp and passenger_count columns in Delta
# format at the given oci:// path.
# NOTE(review): no save mode is set, so re-running this cell presumably fails
# once the table already exists — confirm.
df_nyc_tlc.select("tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count").write.format("delta").save("oci://raw-data@grea08wzjvwv/deltatable")

In [None]:
%%spark
# Load the Delta table at this path into a DataFrame and print its schema.
read_delta = spark.read.format("delta").load("oci://raw-data@grea08wzjvwv/deltatable")
read_delta.printSchema()

# Delta Lake Features

## Schema Management - Schema Enforcement

In [None]:
%%spark
read_delta.printSchema()

In [None]:
%%spark
# Reuse the schema of the Delta table DataFrame loaded earlier in the notebook.
nschema = read_delta.schema

# Build a one-row DataFrame with that schema, then re-cast passenger_count to Double.
new_schema_deltaTable = spark.createDataFrame([(datetime.strptime('2023-02-01 00:47:37', '%Y-%m-%d %H:%M:%S'), datetime.strptime('2023-02-01 01:22:26', '%Y-%m-%d %H:%M:%S'), 1.0)], nschema).withColumn("passenger_count",expr("cast(passenger_count as Double)"))

# Append the row to the existing Delta table (no mergeSchema option is set).
# NOTE(review): this sits under the "Schema Enforcement" heading; whether the
# append is expected to be rejected depends on the table's passenger_count
# type — confirm.
new_schema_deltaTable.write.format("delta").mode("append").save("oci://raw-data@grea08wzjvwv/deltatable")

In [None]:
%%spark
# Print the schema of the one-row DataFrame. printSchema has to be called:
# a bare attribute reference only yields the bound method and prints no schema.
new_schema_deltaTable.printSchema()

In [None]:
%%spark
spark.read.format("delta").load("oci://raw-data@grea08wzjvwv/deltatable").printSchema()

## Schema Management - Schema Evolution

In [None]:
%%spark
# Start from the schema of the Delta table DataFrame loaded earlier in the notebook.
nschema = read_delta.schema

# One-row DataFrame with that schema plus a new pickup_location_id column
# holding the string literal "45".
se_deltaTable = spark.createDataFrame([(datetime.strptime('2023-02-01 00:47:37', '%Y-%m-%d %H:%M:%S'), datetime.strptime('2023-02-01 01:22:26', '%Y-%m-%d %H:%M:%S'), 1.0)], nschema).withColumn("pickup_location_id", lit("45"))

# Append with the mergeSchema option set to "true"; given the "Schema Evolution"
# heading, this is presumably what lets the extra column be added to the table.
se_deltaTable.write.format("delta").option("mergeSchema", "true").mode("append").save("oci://raw-data@grea08wzjvwv/deltatable")

In [None]:
%%spark
spark.read.format("delta").load("oci://raw-data@grea08wzjvwv/deltatable").show()

In [None]:
%%spark
spark.read.format("delta").load("oci://raw-data@grea08wzjvwv/deltatable").where("pickup_location_id = 45").show()