
---

## Do an analysis of the data BEFORE loading it into CosmosDB

- Read a raw, pipe-delimited CSV file from Azure Blob Storage
- Explore it with PySpark in Synapse
- Identify good potential CosmosDB Partition Keys (high cardinality, evenly distributed)
- Identify poor potential CosmosDB Partition Keys (low cardinality, skewed distribution)

.

---

.

.


In [None]:
%%pyspark
# Load the raw departures file from Azure Blob Storage into a Spark DataFrame
# and preview its first rows.
#
# The storage credential is fetched at run time through the Synapse
# TokenLibrary, using the linked service named below, rather than being
# written into the notebook.
from pyspark.sql import SparkSession

blob_account_name = "cjoakimstorage22"
blob_container_name = "raw"
blob_file_name = "air_travel_departures.csv"
linked_service_name = "AzureBlobStorage_cjoakimstorage22"

# Host name shared by the SAS configuration key and the wasbs:// URL. Building
# it once keeps the two in agreement when the account name is changed.
blob_host = '%s.blob.core.windows.net' % blob_account_name

sc = SparkSession.builder.getOrCreate()
token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary
blob_sas_token = token_library.getConnectionString(linked_service_name)

# NOTE(review): `spark` is not defined in this cell; it is assumed to be the
# session object supplied by the Synapse notebook runtime.
spark.conf.set(
    'fs.azure.sas.%s.%s' % (blob_container_name, blob_host),
    blob_sas_token)

blob_url = 'wasbs://%s@%s/%s' % (blob_container_name, blob_host, blob_file_name)

# The file has a header row and uses '|' as the field separator. inferSchema is
# not set, so column types are left at the CSV reader's default.
df = spark.read.load(blob_url, format='csv', header=True, sep='|')
display(df.limit(8))


## Display the observed structure of the data

In [None]:
# Print the DataFrame's column names and types.
# NOTE(review): the load cell does not set inferSchema, so every column is
# presumably reported as string — confirm against the output.
df.printSchema()

## Display the Row and Column Counts

In [None]:
# Shape of the dataset as a (rows, columns) tuple. df.count() runs a Spark job
# over the data; the column count comes from the DataFrame's column list.
row_count = df.count()
column_count = len(df.columns)
print((row_count, column_count))


## Explore the **airlineid** attribute as a potential CosmosDB Partition Key

In [None]:
# Candidate partition key under evaluation; the following cell reads this
# name as well.
attr_name = 'airlineid'

# Number of distinct values in the column (its cardinality).
distinct_values = df.select(attr_name).distinct()
distinct_values.count()

In [None]:
from pyspark.sql.functions import desc

# Row count per distinct value of the current attribute, largest groups first,
# so that a skewed distribution shows up at the top of the table.
rows_per_value = df.groupBy(attr_name).count()
display(rows_per_value.sort(desc("count")))

## Explore the **from_iata** attribute as a potential CosmosDB Partition Key

In [None]:
# Candidate partition key under evaluation; the following cell reads this
# name as well.
attr_name = 'from_iata'

# Number of distinct values in the column (its cardinality).
distinct_values = df.select(attr_name).distinct()
distinct_values.count()

In [None]:
from pyspark.sql.functions import desc

# Row count per distinct value of the current attribute, largest groups first,
# so that a skewed distribution shows up at the top of the table.
rows_per_value = df.groupBy(attr_name).count()
display(rows_per_value.sort(desc("count")))

## Explore the **to_iata** attribute as a potential CosmosDB Partition Key


In [None]:
# Candidate partition key under evaluation; the following cell reads this
# name as well.
attr_name = 'to_iata'

# Number of distinct values in the column (its cardinality).
distinct_values = df.select(attr_name).distinct()
distinct_values.count()

In [None]:
from pyspark.sql.functions import desc

# Row count per distinct value of the current attribute, largest groups first,
# so that a skewed distribution shows up at the top of the table.
rows_per_value = df.groupBy(attr_name).count()
display(rows_per_value.sort(desc("count")))

## Explore the **to_airport_country** attribute as a potential CosmosDB Partition Key 

In [None]:
# Candidate partition key under evaluation; the following cell reads this
# name as well.
attr_name = 'to_airport_country'

# Number of distinct values in the column (its cardinality).
distinct_values = df.select(attr_name).distinct()
distinct_values.count()

In [None]:
from pyspark.sql.functions import desc

# Row count per distinct value of the current attribute, largest groups first,
# so that a skewed distribution shows up at the top of the table.
rows_per_value = df.groupBy(attr_name).count()
display(rows_per_value.sort(desc("count")))

## Explore the **route** attribute as a potential CosmosDB Partition Key

In [None]:
# Candidate partition key under evaluation; the following cell reads this
# name as well.
attr_name = 'route'

# Number of distinct values in the column (its cardinality).
distinct_values = df.select(attr_name).distinct()
distinct_values.count()

In [None]:
from pyspark.sql.functions import desc

# Row count per distinct value of the current attribute, largest groups first,
# so that a skewed distribution shows up at the top of the table.
rows_per_value = df.groupBy(attr_name).count()
display(rows_per_value.sort(desc("count")))