In [1]:
from datetime import datetime
import os
import configparser
import pandas as pd
from pyspark import SparkConf
from pyspark.sql import SparkSession

In [2]:
def create_spark_session(aws_access_key=None, aws_secret_key=None, aws_session_token=None):
    """Build (or reuse) a SparkSession configured for S3A access.

    Args:
        aws_access_key: AWS access key id, or None.
        aws_secret_key: AWS secret access key, or None.
        aws_session_token: Optional session token that accompanies
            temporary (STS) credentials. Only used when both keys are given.

    Returns:
        The SparkSession returned by ``SparkSession.builder.getOrCreate()``.

    Notes:
        * Credentials are written to the configuration only when BOTH
          ``aws_access_key`` and ``aws_secret_key`` are provided; a lone key
          (or a lone token) is ignored.
        * A key pair without a token selects ``SimpleAWSCredentialsProvider``.
          ``TemporaryAWSCredentialsProvider`` additionally needs
          ``fs.s3a.session.token``, so it is chosen only when a token is
          supplied.
        * When no key pair is passed, the provider setting is left as
          ``TemporaryAWSCredentialsProvider`` so the no-argument call keeps
          producing the same configuration as before.
    """
    conf = SparkConf()
    # Ask Spark to fetch the hadoop-aws package listed here.
    conf.set('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.2.0')

    provider = 'org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider'
    # `and` (not bitwise `&`) is the idiomatic way to combine boolean tests.
    if aws_access_key is not None and aws_secret_key is not None:
        conf.set('spark.hadoop.fs.s3a.access.key', aws_access_key)
        conf.set('spark.hadoop.fs.s3a.secret.key', aws_secret_key)
        if aws_session_token is not None:
            conf.set('spark.hadoop.fs.s3a.session.token', aws_session_token)
        else:
            # Long-lived key pair: the temporary provider would reject it
            # because no session token is available.
            provider = 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider'
    conf.set('spark.hadoop.fs.s3a.aws.credentials.provider', provider)

    return SparkSession.builder.config(conf=conf).getOrCreate()

### Create Spark Session

In [3]:
# Start (or reuse) the notebook's Spark session. No AWS keys are passed here,
# so no S3A access/secret key settings are added to the configuration.
spark = create_spark_session()

### Raw Data Frames

In [4]:
# Input locations: one metadata CSV file plus two directories of CSV files.
gtrends_path = './input_data/google_trends_data/'
prices_path = './input_data/coin_price_data/'
meta_path = './input_data/coin_meta_data.csv'

# The metadata file is parsed with pandas first; missing cells are swapped
# for None before the frame is handed to Spark.
pd_raw_meta_df = pd.read_csv(meta_path)
raw_meta_df = spark.createDataFrame(
    pd_raw_meta_df.where(pd_raw_meta_df.notnull(), None)
)

# Prices and Google Trends are read by Spark directly. The first line of each
# file is used as the header; no schema inference is requested.
raw_prices_df = spark.read.csv(prices_path, header=True)
raw_gtrends_df = spark.read.csv(gtrends_path, header=True)

# (row count, column count) for each raw frame, evaluated in the order
# meta -> prices -> google trends.
raw_meta_shape, raw_prices_shape, raw_gtrends_shape = (
    (frame.count(), len(frame.columns))
    for frame in (raw_meta_df, raw_prices_df, raw_gtrends_df)
)

# Report each frame's shape followed by its schema.
print(f'Raw Meta Data Schema - {raw_meta_shape}')
raw_meta_df.printSchema()

print(f'Raw Prices Schema - {raw_prices_shape}')
raw_prices_df.printSchema()

print(f'Raw Google Trends Schema - {raw_gtrends_shape}')
raw_gtrends_df.printSchema()

Raw Meta Data Schema - (52, 10)
root
 |-- id: string (nullable = true)
 |-- symbol: string (nullable = true)
 |-- name: string (nullable = true)
 |-- block_time_in_minutes: long (nullable = true)
 |-- hashing_algorithm: string (nullable = true)
 |-- genesis_date: string (nullable = true)
 |-- twitter_screen_name: string (nullable = true)
 |-- subreddit_url: string (nullable = true)
 |-- description: string (nullable = true)
 |-- github_url: string (nullable = true)

Raw Prices Schema - (786669, 5)
root
 |-- date: string (nullable = true)
 |-- price_usd: string (nullable = true)
 |-- mcap_usd: string (nullable = true)
 |-- volume_usd: string (nullable = true)
 |-- coin_id: string (nullable = true)

Raw Google Trends Schema - (141595, 5)
root
 |-- _c0: string (nullable = true)
 |-- date: string (nullable = true)
 |-- keyword_interest: string (nullable = true)
 |-- keyword: string (nullable = true)
 |-- coin_id: string (nullable = true)



## Final Data Model

In [7]:
# Locations of the Parquet tables that make up the final data model.
coin_metrics_path = './output_data/coin_metrics'
coins_path = './output_data/coins'
google_trends_path = './output_data/google_trends'
time_path = './output_data/time'

# Parquet files embed their own schema, so no `header` option is passed:
# it is a CSV-reader option and has no meaning for Parquet.
coin_metrics_df = spark.read.parquet(coin_metrics_path)
# NOTE(review): `coins_path_df` holds the coins table (not a path); the name
# is kept unchanged in case other cells refer to it.
coins_path_df = spark.read.parquet(coins_path)
google_trends_df = spark.read.parquet(google_trends_path)
time_df = spark.read.parquet(time_path)

# (row count, column count) for each table.
coin_metrics_shape = (coin_metrics_df.count(), len(coin_metrics_df.columns))
coins_shape = (coins_path_df.count(), len(coins_path_df.columns))
google_trends_shape = (google_trends_df.count(), len(google_trends_df.columns))
time_shape = (time_df.count(), len(time_df.columns))

# Report each table's shape followed by its schema.
print(f'Coin Metrics Schema - {coin_metrics_shape}')
coin_metrics_df.printSchema()
print(f'Google Trends Schema - {google_trends_shape}')
google_trends_df.printSchema()
print(f'Coins Schema - {coins_shape}')
coins_path_df.printSchema()
print(f'Time Schema - {time_shape}')
time_df.printSchema()

Coin Metrics Schema - (786669, 6)
root
 |-- coin_id: string (nullable = true)
 |-- recorded_at: timestamp (nullable = true)
 |-- currency: string (nullable = true)
 |-- price: double (nullable = true)
 |-- market_cap: double (nullable = true)
 |-- volume: double (nullable = true)

Google Trends Schema - (141190, 4)
root
 |-- coin_id: string (nullable = true)
 |-- recorded_at: timestamp (nullable = true)
 |-- keyword: string (nullable = true)
 |-- trend_value: integer (nullable = true)

Coins Schema - (52, 7)
root
 |-- coin_id: string (nullable = true)
 |-- ticker: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- twitter_account: string (nullable = true)
 |-- subreddit_url: string (nullable = true)
 |-- github_url: string (nullable = true)

Time Schema - (791014, 7)
root
 |-- recorded_at: timestamp (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-