
---

## Do an analysis of the data BEFORE loading it into CosmosDB

### - Read a raw data CSV file from Azure Blob Storage
### - Explore it with Synapse
### - Identify good potential CosmosDB Partition Keys (high cardinality, well distributed)
### - Identify poor potential CosmosDB Partition Keys (low cardinality, skewed distribution)

.

---

.

.


In [1]:
%%pyspark
blob_account_name = "cjoakimstorage"
blob_container_name = "demo22"
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

sc = SparkSession.builder.getOrCreate()
token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary
blob_sas_token = token_library.getConnectionString("cjoakimstorageAzureBlobStorage")

spark.conf.set(
    'fs.azure.sas.%s.%s.blob.core.windows.net' % (blob_container_name, blob_account_name),
    blob_sas_token)

blob_url = 'wasbs://demo22@cjoakimstorage.blob.core.windows.net/air_travel_departures.csv'

df = spark.read.load(blob_url, format='csv', header=True, sep='|')

# "push down predicate" example
#df = spark.read.load(blob_url, format='csv', header=True, sep='|').filter(col("from_iata") == "CLT")

display(df.limit(8))


StatementMeta(, , , SessionError, )

AVAILABLE_COMPUTE_CAPACITY_EXCEEDED: Livy session has failed. Session state: Error. Error code: AVAILABLE_COMPUTE_CAPACITY_EXCEEDED. Your job requested 12 vcores. However, the pool only has 0 vcores available out of quota of 12 vcores. Try ending the running job(s) in the pool, reducing the numbers of vcores requested, increasing the pool maximum size or using another pool. Source: User.

## Display the observed structure of the data

In [None]:
# Print the column names and data types of the DataFrame loaded from the CSV
df.printSchema()

StatementMeta(, , , Cancelled, )

## Display the Row and Column Counts

In [None]:
# Report the dataset dimensions as a (row count, column count) tuple
dimensions = (df.count(), len(df.columns))
print(dimensions)

StatementMeta(, , , Cancelled, )


## Explore the **airlineid** attribute as a potential CosmosDB Partition Key

In [None]:
# Candidate partition key under examination; the next cell reads attr_name too
attr_name = 'airlineid'

# Cardinality: how many different values this attribute takes across all rows
(
    df.select(attr_name)
      .distinct()
      .count()
)

StatementMeta(, , , Cancelled, )

In [None]:
from pyspark.sql.functions import desc

# Distribution: rows per distinct value of attr_name (set in the previous cell),
# largest groups first so any skew is visible at the top of the table
display(
    df.groupBy(attr_name)
      .count()
      .sort(desc("count"))
)

StatementMeta(, , , Cancelled, )

## Explore the **from_iata** attribute as a potential CosmosDB Partition Key

In [None]:
# Switch the attribute under examination; the following cell groups by attr_name
attr_name = 'from_iata'

# Count the distinct values present in this column
(
    df.select(attr_name)
      .distinct()
      .count()
)

StatementMeta(, , , Cancelled, )

In [None]:
from pyspark.sql.functions import desc

# Rows per distinct value of the current attr_name, ordered from most to least frequent
display(
    df.groupBy(attr_name)
      .count()
      .sort(desc("count"))
)

StatementMeta(, , , Cancelled, )

## Explore the **to_iata** attribute as a potential CosmosDB Partition Key


In [None]:
# Attribute to evaluate next; also consumed by the grouping cell that follows
attr_name = 'to_iata'

# Number of distinct values in the selected column
(
    df.select(attr_name)
      .distinct()
      .count()
)

StatementMeta(, , , Cancelled, )

In [None]:
from pyspark.sql.functions import desc

# Per-value row counts for the current attr_name, highest count first
display(
    df.groupBy(attr_name)
      .count()
      .sort(desc("count"))
)

StatementMeta(, , , Cancelled, )

## Explore the **to_airport_country** attribute as a potential CosmosDB Partition Key 

In [None]:
# Attribute under examination for this section; reused by the next cell
attr_name = 'to_airport_country'

# How many different values the column contains
(
    df.select(attr_name)
      .distinct()
      .count()
)

StatementMeta(, , , Cancelled, )

In [None]:
from pyspark.sql.functions import desc

# Row count for each value of the current attr_name, sorted descending by count
display(
    df.groupBy(attr_name)
      .count()
      .sort(desc("count"))
)

StatementMeta(, , , Cancelled, )

## Explore the **route** attribute as a potential CosmosDB Partition Key

In [None]:
# Final candidate attribute; the grouping cell below reads attr_name as well
attr_name = 'route'

# Distinct-value count for the selected column
(
    df.select(attr_name)
      .distinct()
      .count()
)

StatementMeta(, , , Cancelled, )

In [None]:
from pyspark.sql.functions import desc

# Rows per distinct value of the current attr_name, most frequent values first
display(
    df.groupBy(attr_name)
      .count()
      .sort(desc("count"))
)

StatementMeta(, , , Cancelled, )