In [None]:
# Azure Databricks config
# Service-principal placeholders: substitute real values before running
# (never commit real secrets to the notebook).
tenant = "TENANT"
app_id = "API_ID"
client_secret = "CLIENT_SECRET"

# OAuth client-credentials settings, keyed by Hadoop ABFS option name.
# NOTE(review): presumably passed to a mount call that is not shown in this notebook.
configs = dict((
    ("fs.azure.account.auth.type", "OAuth"),
    ("fs.azure.account.oauth.provider.type",
     "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"),
    ("fs.azure.account.oauth2.client.id", app_id),
    ("fs.azure.account.oauth2.client.secret", client_secret),
    ("fs.azure.account.oauth2.client.endpoint",
     f"https://login.microsoftonline.com/{tenant}/oauth2/token"),
    ("fs.azure.createRemoteFileSystemDuringInitialization", "true"),
))

In [None]:
# test that the mounting works, this is the storage container root
# Lists the entries under /dbfs/mnt/data; as the last expression of the cell,
# the resulting list is what the notebook displays. os.listdir raises
# FileNotFoundError if the path does not exist.
import os
os.listdir('/dbfs/mnt/data')

In [None]:
########################################################################################
# process kaggle dataset - select columns we are interested in and order the columns nicely
########################################################################################

In [None]:
# Read the raw Kaggle CSV, treating its first line as the header row,
# then print the schema and display the rows without truncating cell values.
kaggleDF = spark.read.csv('/mnt/data/dataset.csv', header=True)
kaggleDF.printSchema()
kaggleDF.show(truncate=False)

In [None]:
# Clean the Kaggle frame:
#   1. rename the `cid` column to `id`,
#   2. keep only rows whose `path` starts with './dataset/image/',
#   3. project to the columns (id, name, path, price) in that order.
# The chain starts from `kaggleDF` (loaded from dataset.csv above); the name
# `bigDF` is not defined anywhere in this notebook and raised NameError on a
# fresh kernel.
kaggleDF = (
    kaggleDF
    .withColumnRenamed("cid", "id")
    .filter("path LIKE './dataset/image/%'")
    .select("id", "name", "path", "price")
)
kaggleDF.show()

In [None]:
# Persist the cleaned frame as a single comma-separated CSV with a header row
# and no index column. toPandas() materialises the whole DataFrame as a pandas
# frame first, which is the potential bottleneck for large data.
kaggleDF.toPandas().to_csv(
    "/dbfs/mnt/data/dataset_clean.csv",
    index=False,
    header=True,
    sep=',',
)

In [None]:
########################################################################################
# retrieve images for a collection
########################################################################################

In [None]:
# Load the mirlclub collection metadata (JSON) and inspect its schema and rows.
collectionDF = spark.read.format('json').load('/mnt/data/collections/mirlclub.json')
collectionDF.printSchema()
collectionDF.show()

In [None]:
import os
from urllib.request import urlretrieve


def download_image(collection, name, url, dest_dir="/dbfs/mnt/data/scraped_images"):
    """Download one image to ``<dest_dir>/<collection>-<name>.jpg``.

    Args:
        collection: Collection identifier used as the file-name prefix.
        name: Item name used as the second part of the file name.
        url: Location of the image to fetch.
        dest_dir: Directory the file is written to. Defaults to the folder
            the notebook has always used; it is created if missing so the
            first download does not fail on a fresh storage container.

    Returns:
        The path of the file that was written.

    Raises:
        Any exception raised by ``urlretrieve`` (e.g. an unreachable URL) is
        propagated to the caller unchanged.
    """
    os.makedirs(dest_dir, exist_ok=True)
    destination = f"{dest_dir}/{collection}-{name}.jpg"
    urlretrieve(url, destination)
    return destination

# Fetch the image of every row in the collection; each row supplies the item
# name (used in the file name) and the image URL.
collectionDF.foreach(
    lambda record: download_image('mirlclub', record['name'], record['image'])
)

In [None]:
# Import only the helpers used here: a wildcard import of
# pyspark.sql.functions shadows Python builtins (sum, min, max, round, ...).
from pyspark.sql.functions import col, concat, lit

# Bring the collection into the same (id, name, path, price) layout as the
# cleaned Kaggle dataset:
#   - id   -> prefixed with 'mirlclub-'
#   - path -> './scraped_images/mirlclub-<name>.jpg', matching the file name
#             used when the images were downloaded
# NOTE: collectionDF is reassigned, so re-running this cell without reloading
# the JSON would prefix `id` a second time.
collectionDF = (
    collectionDF
    .withColumn("id", concat(lit('mirlclub-'), col('id')))
    .withColumn("path", concat(lit('./scraped_images/mirlclub-'), col('name'), lit('.jpg')))
    .select('id', 'name', 'path', 'price')
)
collectionDF.show(truncate=False)

In [None]:
# Write the reshaped collection as a comma-separated CSV with a header row
# and no index column.
collectionDF.toPandas().to_csv(
    "/dbfs/mnt/data/dataset_mirlclub.csv",
    index=False,
    header=True,
    sep=',',
)

In [None]:
# Collection slugs to scrape, in processing order.
# Each slug appears exactly once: a repeated 'bapetaverse-official' entry was
# removed so that collection is not scraped twice.
collections = [
    'the-sandbox-shibuya-land-sale',
    'metroverse-genesis',
    'hapeprime',
    'collectvoxthewalkingdead',
    'decentral-games-ice',
    'jpeg-cards',
    'alphasharksofficial',
    'nyokies',
    'genesis-creepz',
    'quackyducksofficial',
    'capsulehouse',
    'coolmans-universe',
    'cyberkongz',
    'kaiju-kingz',
    'cryptoadz-by-gremplin',
    'tubby-cats',
    'deadfellaz',
    'metakages-official-collection',
    'psychedelics-anonymous-genesis',
    'town-star',
    'veecon-tickets',
    '10ktf-stockroom',
    'lvcidiaavatars',
    'neotokyo-citizens',
    'fidenza-by-tyler-hobbs',
    'g00p',
    'degentoonz-collection',
    'bossbeauties',
    'justcubesnft',
    'omni-mosquitoes-eth',
    'rtfkt-podx',
    'wolf-game',
    'thewalkingdeadofficialdarylmotorcycles',
    'bapetaverse-official',
    'mee6-avatars-pre-reveal',
    'rubberduckz',
    'nouns',
    'mv3-access-passes',
    'fluf-world',
    'chumchumsnft',
    'where-my-vans-go',
    'lazy-lions',
    'psychonautapedivision',
    'spaceridersnft',
    'the-squishiverse',
    'mirandus',
    'etherthings',
    'officialkenkyo',
    'felinefiendznft',
    'women-unite-10k-assemble',
    'chromie-squiggle-by-snowfro',
    'lilium',
    'pixel-interfaces',
    'mypethooligan',
    'shatteredeon-colonist',
    'frankfrank',
    'beari-collection',
    'crypto-unicorns-market',
    'max-pain-and-frens-by-xcopy',
    'pixel-vault-mintpass',
    'guttercatgang',
    'pjppfl',
    'fools-nft',
    'notbanksyrain',
    'metahero-generative',
    'nifty-tailor-genesis-mintpass',
    'llamaboost',
    'headtripz',
    'layer-zero-punks-eth',
    'pixels-farm',
    'dourdarcels',
    'cyberkongz-vx',
    'alphakongsclub',
    'mad-meerkat-burrow',
    'treeverse',
    'moonbirdpunks',
    'grayboys',
    'theshiboshis',
    'smiliesgenesis',
    'fvck-avatar-essence',
    'space-doodles-official',
    'pet-rock',
    'chain-runners-nft',
    'smallbros',
    'raidparty',
    'mirlclub',
    'ballies',
    'mad-hare-society-2',
    'mad-hare-society-1',
    'frenlypandas',
]


In [None]:
# Slugs processed by the scraping loop below. Assigning here replaces any
# earlier value of `collections`; the entries run from "lilium" through
# "frenlypandas".
# NOTE(review): presumably a resume point after a partial run - confirm.
collections = [
    "lilium",
    "pixel-interfaces",
    "mypethooligan",
    "shatteredeon-colonist",
    "frankfrank",
    "beari-collection",
    "crypto-unicorns-market",
    "max-pain-and-frens-by-xcopy",
    "pixel-vault-mintpass",
    "guttercatgang",
    "pjppfl",
    "fools-nft",
    "notbanksyrain",
    "metahero-generative",
    "nifty-tailor-genesis-mintpass",
    "llamaboost",
    "headtripz",
    "layer-zero-punks-eth",
    "pixels-farm",
    "dourdarcels",
    "cyberkongz-vx",
    "alphakongsclub",
    "mad-meerkat-burrow",
    "treeverse",
    "moonbirdpunks",
    "grayboys",
    "theshiboshis",
    "smiliesgenesis",
    "fvck-avatar-essence",
    "space-doodles-official",
    "pet-rock",
    "chain-runners-nft",
    "smallbros",
    "raidparty",
    "mirlclub",
    "ballies",
    "mad-hare-society-2",
    "mad-hare-society-1",
    "frenlypandas",
]

import os
from urllib.request import urlretrieve
from pyspark.sql.functions import *

def download_image(slug, name, url, dest_root="/dbfs/mnt/data/scraped_images"):
    """Download one image to ``<dest_root>/<slug>/<slug>-#<name>.png`` (best effort).

    Args:
        slug: Collection slug; used as the sub-folder and file-name prefix.
        name: Item identifier placed after ``#`` in the file name.
        url: Location of the image to fetch.
        dest_root: Root folder for scraped images. Defaults to the folder the
            notebook has always used.

    Returns:
        True when the file was written, False when the download failed.
        Failures are reported with ``print`` and never raised, so a single
        bad URL does not abort the surrounding job.
    """
    target_dir = f"{dest_root}/{slug}"
    target = f"{target_dir}/{slug}-#{name}.png"
    try:
        # Make sure the per-collection folder exists before writing into it.
        os.makedirs(target_dir, exist_ok=True)
        urlretrieve(url, target)
    except Exception as exc:
        # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit still
        # propagate; the cause is included so failures can be diagnosed.
        print('----> Retrieve {}-#{} failed: {!r}'.format(slug, name, exc))
        return False
    return True

# Scrape every collection in `collections`: read its JSON metadata with Spark
# and download the image of each row into a per-collection folder.
# The loop variable is named `slug` (not `col`) so it does not shadow
# pyspark.sql.functions.col, which the wildcard import above brings into scope.
for slug in collections:
    print('--> Running for {}'.format(slug))
    filePath = '/mnt/data/collections/{}.json'.format(slug)
    collection_folder = "/dbfs/mnt/data/scraped_images/{}".format(slug)
    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(collection_folder, exist_ok=True)
    try:
        colDf = spark.read.json(filePath)
        colDf.foreach(lambda row: download_image(slug, row['id'], row['image']))
    except Exception as exc:
        # Catch `Exception` only, so the loop can still be interrupted from the
        # notebook; include the cause instead of guessing at it.
        print('--> {} failed because either {} not exist or download failed: {!r}'.format(slug, filePath, exc))
# TODO: build the per-collection (id, name, path, price) frame as done for
# mirlclub earlier in this notebook. The path must follow the file name
# written by download_image: './scraped_images/<slug>/<slug>-#<id>.png'.