# Data Ingestion
In this notebook, I will request the 2021 to 2023 daily data for the following stocks and index in the Telecommunications Sector, plus two economic indicators (real GDP per capita and inflation).
* AT&T: T
* Verizon: VZ
* Comcast Corporation: CMCSA
* T-Mobile US: TMUS
* Charter Communications: CHTR
* Dow Jones U.S. Select Telecommunications Index: DJSTEL

In [0]:
# Imports
import requests
import time

# Importing needed Pyspark functions
from pyspark.sql import functions as F
from pyspark.sql import Row

# Importing Struct Types
from pyspark.sql.types import StructField, StructType, DoubleType, StringType, IntegerType

In [0]:
%run ./Config 

In [0]:
def get_data(ticker, size='compact', API_KEY = API_KEY):
    '''
    Function to get the stock daily historic data for a ticker from the Alpha Vantage API
    Inputs:
    * ticker = Stock code: str
    * size = 'full' for 20 years of historic data or 'compact' for the last 100 data points: str
    * API_KEY = Alpha Vantage API key: str (defaults to the API_KEY in scope when the function is defined)

    Returns:
    Spark data frame with one row per entry of the daily time series and the columns
    date (string), open, high, low, close (double) and volume (integer)

    Raises:
    ValueError if the HTTP request is not successful or the response carries no
    'Time Series (Daily)' entry
    '''

    # Get Data from Alpha Vantage Open API.
    # Passing the query as params lets requests URL-encode the values, and the
    # timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(
        'https://www.alphavantage.co/query',
        params={
            'function': 'TIME_SERIES_DAILY',
            'symbol': ticker,
            'outputsize': size,
            'apikey': API_KEY,
        },
        timeout=30,
    )
    if not r.ok:
        # Deliberately not r.raise_for_status(): its message embeds the full URL, API key included
        raise ValueError(f'Alpha Vantage request for {ticker} failed with HTTP status {r.status_code}.')

    # Get only the json file output
    data = r.json()
    # Get only the time series, discarding the meta attribute
    series = data.get('Time Series (Daily)') if isinstance(data, dict) else None
    if series is None:
        # Only the top-level keys are reported; the payload itself is not echoed
        # so that nothing sensitive ends up in the notebook output
        found = sorted(data) if isinstance(data, dict) else type(data).__name__
        raise ValueError(
            f"Alpha Vantage response for {ticker} has no 'Time Series (Daily)' entry "
            f'(response contained: {found}).'
        )

    # Transform JSON data into plain tuples in schema order.
    # Each field is looked up by name so the result does not depend on the key
    # order of the JSON payload.
    rows = [
        (
            day,
            float(bar['1. open']),
            float(bar['2. high']),
            float(bar['3. low']),
            float(bar['4. close']),
            int(bar['5. volume']),
        )
        for day, bar in series.items()
    ]

    # Define the schema for the DataFrame
    schema = StructType([
        StructField("date", StringType(), True),
        StructField("open", DoubleType(), True),
        StructField("high", DoubleType(), True),
        StructField("low", DoubleType(), True),
        StructField("close", DoubleType(), True),
        StructField("volume", IntegerType(), True)
    ])

    # Create a DataFrame
    df = spark.createDataFrame(rows, schema)

    return df


In [0]:
def get_economic_ind(indicator, API_KEY = API_KEY):
    '''
    Get Economic indicator data from Alpha Vantage API
    Inputs:
    * indicator = API function name, e.g. 'REAL_GDP_PER_CAPITA', 'INFLATION': str
    * API_KEY = Alpha Vantage API key: str (defaults to the API_KEY in scope when the function is defined)

    Returns:
    Spark data frame built from the 'data' list of the response (schema inferred by Spark)

    Raises:
    ValueError if the HTTP request is not successful or the response carries no 'data' entry
    '''
    # Get Data from Alpha Vantage Open API.
    # Passing the query as params lets requests URL-encode the values, and the
    # timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(
        'https://www.alphavantage.co/query',
        params={'function': indicator, 'apikey': API_KEY},
        timeout=30,
    )
    if not r.ok:
        # Deliberately not r.raise_for_status(): its message embeds the full URL, API key included
        raise ValueError(f'Alpha Vantage request for {indicator} failed with HTTP status {r.status_code}.')
    data = r.json()

    if not isinstance(data, dict) or 'data' not in data:
        # Only the top-level keys are reported; the payload itself is not echoed
        # so that nothing sensitive ends up in the notebook output
        found = sorted(data) if isinstance(data, dict) else type(data).__name__
        raise ValueError(
            f"Alpha Vantage response for {indicator} has no 'data' entry "
            f'(response contained: {found}).'
        )

    # Transform to dataframe
    df = spark.createDataFrame(data['data'])
    print(f'{indicator} data fetched')
    # Return
    return df

In [0]:
def save_data(df, ticker):
    '''
    Function to save a dataframe to Databricks File System as parquet under
    /FileStore/tables/raw/<ticker>, overwriting whatever is already at that path.

    Inputs:
    * df = Spark data frame to be saved
    * ticker = name of the target folder (stock code or indicator name): str

    Returns:
    Status message: str. A failure while writing is reported through the
    message instead of being raised, so callers should display or check it.
    '''
    # Create the path to save the file
    path = f'/FileStore/tables/raw/{ticker}'

    # Error Handling: only Exception is caught, so KeyboardInterrupt / SystemExit
    # (e.g. cancelling the cell) are not swallowed
    try:
        # Write file to DBFS; coalesce(1) reduces the frame to a single partition before writing
        df.coalesce(1).write.format('parquet').mode('overwrite').save(path)
    except Exception as exc:
        # This step saves rather than fetches; keep the cause so the failure can be diagnosed
        return f'Failed saving {ticker} data to {path}: {type(exc).__name__}: {exc}'

    # Return success message
    return f'{ticker} stock successfully saved to DBFS.'


In [0]:
# Tickers to be fetched from the Alpha Vantage API
tickers = ['TMUS', 'T', 'VZ', 'CMCSA', 'CHTR']

# Extracting the Data from the API
for ticker in tickers:
    # Extract the full available daily history for this ticker
    df_extracted = get_data(ticker=ticker, size='full')
    # Save, and show the status message: save_data reports a failed write
    # through its return value, so discarding it would hide the failure
    print(save_data(df_extracted, ticker=ticker))
    # Pause between API calls (presumably to respect the provider's rate limit)
    time.sleep(2)
    

In [0]:
# Economic Indicators to be fetched from the Alpha Vantage API
indicators = ['REAL_GDP_PER_CAPITA', 'INFLATION']

# Extracting the Data from the API
for indicator in indicators:
    # Extract
    df_extracted = get_economic_ind(indicator=indicator)
    # Save under the indicator's name, and show the status message: save_data
    # reports a failed write through its return value, so discarding it would
    # hide the failure
    print(save_data(df_extracted, ticker=indicator))
    # Pause between API calls (presumably to respect the provider's rate limit)
    time.sleep(4)
    

REAL_GDP_PER_CAPITA data fetched
INFLATION data fetched


In [0]:
# Index symbol to be fetched from the Twelve Data API
# NOTE(review): the notebook introduction lists this index as DJSTEL - confirm which symbol is intended
telco_index = 'DJUSTL'
# Get Data from the Twelve Data API (this cell does not use Alpha Vantage):
# daily interval, JSON format, outputsize of 5000
url = f'https://api.twelvedata.com/time_series?apikey={API_TWELVE}&interval=1day&symbol={telco_index}&format=JSON&outputsize=5000'
# The timeout keeps a stalled connection from hanging the notebook forever
r = requests.get(url, timeout=30)
if not r.ok:
    # Deliberately not r.raise_for_status(): its message embeds the full URL, API key included
    raise ValueError(f'Twelve Data request for {telco_index} failed with HTTP status {r.status_code}.')
data = r.json()
if not isinstance(data, dict) or 'values' not in data:
    # Fail with a readable message instead of a bare KeyError; `data` stays
    # available in the notebook for inspection
    found = sorted(data) if isinstance(data, dict) else type(data).__name__
    raise ValueError(
        f"Twelve Data response for {telco_index} has no 'values' entry "
        f'(response contained: {found}). Inspect `data` for details.'
    )

# Create dataframe
df = spark.createDataFrame(data['values'])

#Save to the Raw Folder
path = f'/FileStore/tables/raw/{telco_index}'
# Write file to DBFS
df.coalesce(1).write.format('parquet').mode('overwrite').save(path)
   