In [1]:
import os
from pyspark.sql import SparkSession
import requests
import pandas as pd
import numpy as np
from cassandra.cluster import Cluster


# Tell PySpark which interpreter to launch for its Python processes
os.environ["PYSPARK_PYTHON"] = "python"

# Spark session configured with the DataStax Cassandra connector and pointed
# at the Cassandra node on localhost:9042
spark = (
    SparkSession.builder
    .appName('SparkCassandraApp')
    .config('spark.jars.packages', 'com.datastax.spark:spark-cassandra-connector_2.12:3.4.1')
    .config('spark.cassandra.connection.host', 'localhost')
    .config('spark.sql.extensions', 'com.datastax.spark.connector.CassandraSparkExtensions')
    .config('spark.sql.catalog.mycatalog', 'com.datastax.spark.connector.datasource.CassandraCatalog')
    .config('spark.cassandra.connection.port', '9042')
    .getOrCreate()
)


# Direct driver connection to the same node, bound to the 'compulsory' keyspace
cluster = Cluster(['localhost'], port=9042)
session = cluster.connect()
session.set_keyspace('compulsory')

In [2]:
# Set up request
def get_access_token():
    """Function to get access token from Barentswatch API

    The client secret is read from a local file and posted to the
    Barentswatch token endpoint using the ``client_credentials`` grant.

    Returns:
        str: Access token

    Raises:
        OSError: If the secret file cannot be opened or read.
        requests.HTTPError: If the token endpoint answers with an error status.
        requests.Timeout: If the token endpoint does not answer in time.
        KeyError: If the response body carries no ``access_token`` field.
    """
    url = "https://id.barentswatch.no/connect/token"
    # Read secret key from file. The context manager closes the handle, and
    # strip() removes a trailing newline that would otherwise be sent as part
    # of the secret.
    with open(r'..\..\..\IND320\No_sync\fish_api', 'r') as secret_file:
        secret_key = secret_file.read().strip()

    # Set up request to get access token
    headers = {
        "Content-Type": "application/x-www-form-urlencoded"
    }
    data = {
        "client_id": "erlend.risvik@gmail.com:fishclient",
        "scope": "api",
        "client_secret": secret_key,
        "grant_type": "client_credentials"
    }

    # A timeout keeps the notebook from hanging forever on a dead connection
    response = requests.post(url, headers=headers, data=data, timeout=30)
    # Fail here with the HTTP status instead of a KeyError on the error payload
    response.raise_for_status()
    return response.json()['access_token']

def convert_to_spark(df):
    """Turn a pandas dataframe into a spark dataframe.

    Parameters:
    -----------
    df : pandas dataframe
        Dataframe to convert

    Returns:
    --------
    spark dataframe
        Built by the notebook-level ``spark`` session from ``df``
    """
    spark_df = spark.createDataFrame(df)
    return spark_df

def check_exist(table_name, year):
    """Check whether the database already holds rows for a given year.

    As a side effect the Cassandra table is registered as a Spark temp view
    named ``table_name`` (replacing any existing view of that name).

    Parameters:
    -----------
    table_name : str
        Name of table to check
    year : int
        Year of data

    Returns:
    --------
    bool: True if data exists, False if not
    """
    # Load the table from the "compulsory" keyspace and expose it to Spark SQL
    cassandra_table = (
        spark.read
        .format("org.apache.spark.sql.cassandra")
        .options(table=table_name, keyspace="compulsory")
        .load()
    )
    cassandra_table.createOrReplaceTempView(table_name)

    # Count the rows stored for the requested year
    query = f"SELECT count(*) FROM {table_name} WHERE year = {year}"
    row_count = spark.sql(query).collect()[0][0]
    return row_count >= 1


def get_one_week_fish_data(year, week, access_token):
    """Function to get fish data from Barentswatch API.

    Parameters:
    -----------
    year : int
        Year of data
    week : int
        Week of data
    access_token : str
        Access token from Barentswatch API
    Returns:
    --------
    json: decoded JSON body of the response

    Raises:
    -------
    requests.HTTPError
        If the API answers with an error status (e.g. an expired token)
    requests.Timeout
        If the API does not answer in time
    """

    # Set url to correct API address
    url = f"https://www.barentswatch.no/bwapi/v1/geodata/fishhealth/locality/{year}/{week}"

    headers = {"Authorization": "Bearer " + access_token}

    # A timeout keeps one stalled request from blocking a whole-year download
    response = requests.get(url, headers=headers, timeout=60)
    # Surface HTTP errors here rather than as a missing-key error in the caller
    response.raise_for_status()
    return response.json()

def get_one_year_fish_data(year, access_token):
    """Fetch one year of fish data from the Barentswatch API and store it in Cassandra.

    Nothing is downloaded or written when the "fish_data_full" table already
    holds rows for ``year``.

    Parameters:
    -----------
    year : int
        Year of data
    access_token : str
        Access token from Barentswatch API
    Returns:
    --------
    None: the data is written to the "fish_data_full" table, not returned
    """
    if check_exist("fish_data_full", year):
        return None

    # Weeks 1-52.
    # NOTE(review): some years (e.g. 2015) have 53 ISO weeks - confirm whether
    # the API serves a week 53 that should be fetched as well.
    weekly_frames = []
    for week in range(1, 53):
        payload = get_one_week_fish_data(year=year, week=week, access_token=access_token)
        data = pd.DataFrame(payload["localities"])
        data["year"] = year
        data["week"] = week
        weekly_frames.append(data)

    # Concatenate once at the end; growing a frame inside the loop copies all
    # previously collected rows on every iteration
    df = pd.concat(weekly_frames, ignore_index=True)
    # Lowercase column names
    df.columns = df.columns.str.lower()
    write_to_cassandra(df=df, table_name="fish_data_full")

def write_to_cassandra(df, table_name):
    """Append a pandas dataframe to a table in the cassandra database.

    Parameters:
    -----------
    df : pandas dataframe
        Dataframe to write
    table_name : str
        Name of table to write to
    """
    # Convert first, then configure the Cassandra writer step by step
    writer = convert_to_spark(df).write.format("org.apache.spark.sql.cassandra")
    writer = writer.options(table=table_name, keyspace="compulsory")
    # "append" adds to the table's existing contents
    writer.mode("append").save()
    
def clean_table(table_name):
    """Empty a table in the cassandra database by truncating it.

    Parameters:
    -----------
    table_name : str
        Name of table to clean
    """
    statement = f"TRUNCATE {table_name}"
    session.execute(statement)

In [3]:
# Authenticate once; the token is handed on to the download below
access_token = get_access_token()

# Download 2015 and store it in Cassandra (skipped when the year is already stored)
get_one_year_fish_data(2015, access_token)

In [4]:
# Register the Cassandra table as a temp view so it can be queried with Spark SQL
spark.read.format("org.apache.spark.sql.cassandra").options(
    table="fish_data_full", keyspace="compulsory"
).load().createOrReplaceTempView("fish_data_full")

# Pull the whole table into pandas and show it
fish_data = spark.sql("select * from fish_data_full").toPandas()
fish_data

Unnamed: 0,localityweekid,avgadultfemalelice,hascleanerfishdeployed,hasila,hasmechanicalremoval,haspd,hasreportedlice,hassalmonoids,hassubstancetreatments,infilteredselection,...,isonland,isslaughterholdingcage,lat,localityno,lon,municipality,municipalityno,name,week,year
0,764221,,False,False,False,False,False,False,False,True,...,False,False,60.471451,18496,6.764933,Ullensvang,1231,Kaland,10,2015
1,73225,,False,False,False,False,False,True,False,True,...,False,False,63.661701,13888,9.136383,Hitra,1617,Osholmen,20,2016
2,92259,0.19,False,False,False,False,True,True,False,True,...,False,False,61.099983,23375,5.729967,Høyanger,1416,Sørevik,18,2016
3,270948,,False,False,False,True,False,True,False,True,...,False,False,59.608398,10300,5.311917,Sveio,1216,Hillersvik,46,2016
4,772948,,False,False,False,False,False,False,False,True,...,False,False,69.023087,23215,16.486900,Harstad,1903,Bornøy,13,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175805,749903,,False,False,False,False,False,False,False,True,...,False,False,63.578682,12809,9.962167,Rissa,1624,Straumen I Botnen,18,2015
175806,251226,,False,False,False,True,False,True,False,True,...,False,False,64.418533,26795,10.696033,Flatanger,1749,Austvika,32,2016
175807,776300,,False,False,False,False,False,False,False,True,...,False,False,63.190468,31057,8.578633,Aure,1576,Russholmen,25,2015
175808,734442,,False,False,False,False,False,False,False,True,...,False,False,60.516300,11741,4.917900,Øygarden,1259,Ternholmen Nø,14,2015


In [93]:
# Destructive: clean_table issues TRUNCATE, removing every row from fish_data_full.
clean_table("fish_data_full")

In [94]:
# Re-query through the temp view registered in an earlier cell to inspect the
# table after truncation.
fish_data = spark.sql("select * from fish_data_full").toPandas()
# Bare last expression so the notebook renders the resulting frame
fish_data

Unnamed: 0,localityweekid,avgadultfemalelice,hascleanerfishdeployed,hasila,hasmechanicalremoval,haspd,hasreportedlice,hassalmonoids,hassubstancetreatments,infilteredselection,...,isonland,isslaughterholdingcage,lat,localityno,lon,municipality,municipalityno,name,week,year
