# Utilities notebook

## Setting up functions

In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import requests
from cassandra.cluster import Cluster
from pyspark.sql import SparkSession


# Tell PySpark which Python executable to launch
os.environ["PYSPARK_PYTHON"] = "python"

# Spark session configured with the DataStax Cassandra connector against the
# node on localhost:9042
spark = (
    SparkSession.builder.appName('SparkCassandraApp')
    .config('spark.jars.packages', 'com.datastax.spark:spark-cassandra-connector_2.12:3.4.1')
    .config('spark.cassandra.connection.host', 'localhost')
    .config('spark.sql.extensions', 'com.datastax.spark.connector.CassandraSparkExtensions')
    .config('spark.sql.catalog.mycatalog', 'com.datastax.spark.connector.datasource.CassandraCatalog')
    .config('spark.cassandra.connection.port', '9042')
    .getOrCreate()
)

# Plain driver session on the same node, bound to the 'compulsory' keyspace;
# used for statements issued outside Spark (see clean_table)
cluster = Cluster(['localhost'], port=9042)
session = cluster.connect()
session.set_keyspace('compulsory')

In [59]:
# Set up request
def get_access_token():
    """Function to get access token from Barentswatch API

    Reads the client secret from a local file and performs a
    client-credentials token request against the Barentswatch identity server.

    Returns:
        str: Access token

    Raises:
        OSError: If the secret file cannot be opened or read.
        requests.exceptions.RequestException: If the request fails, times out
            or the server answers with an error status.
        KeyError: If the response body has no 'access_token' field.
    """
    url = "https://id.barentswatch.no/connect/token"
    # Read secret key from file. The context manager closes the handle, and
    # strip() removes a trailing newline that would otherwise be sent as part
    # of the secret.
    with open(r'..\..\..\IND320\No_sync\fish_api', 'r') as secret_file:
        secret_key = secret_file.read().strip()

    # Set up request to get access token
    headers = {
        "Content-Type": "application/x-www-form-urlencoded"
    }
    data = {
        "client_id": "erlend.risvik@gmail.com:fishclient",
        "scope": "api",
        "client_secret": secret_key,
        "grant_type": "client_credentials"
    }

    # A timeout keeps the notebook from hanging forever on a dead connection
    response = requests.post(url, headers=headers, data=data, timeout=30)
    # Fail loudly on rejected credentials instead of with a KeyError below
    response.raise_for_status()
    return response.json()['access_token']

def convert_to_spark(df):
    """Build a spark dataframe from a pandas dataframe via the notebook's spark session

    Parameters:
    -----------
    df : pandas dataframe
        Dataframe to hand over to spark

    Returns:
    --------
    spark dataframe
    """
    spark_frame = spark.createDataFrame(df)
    return spark_frame

def write_to_cassandra(df, table_name):
    """Append the rows of a pandas dataframe to a table in the 'compulsory' keyspace

    Parameters:
    -----------
    df : pandas dataframe
        Dataframe whose rows are written
    table_name : str
        Name of the target table
    """
    # Go through spark so the Cassandra connector handles the write
    writer = convert_to_spark(df).write
    writer = writer.format("org.apache.spark.sql.cassandra")
    writer = writer.options(table=table_name, keyspace="compulsory")
    # "append" adds rows to whatever the table already holds
    writer = writer.mode("append")
    writer.save()

def check_exist_fish(year):
    """Tell whether the fish table already holds rows for a given year.

    Parameters:
    -----------
    year : int
        Year to look for

    Returns:
    --------
    bool: True if at least one row exists for the year, False otherwise
    """
    # Register the Cassandra table as a temp view so it can be queried with SQL
    reader = spark.read.format("org.apache.spark.sql.cassandra")
    fish_table = reader.options(table = 'fish_data_full', keyspace="compulsory").load()
    fish_table.createOrReplaceTempView('fish_data_full')

    rows = spark.sql(f"SELECT count(*) FROM fish_data_full WHERE year = {year}").collect()
    # Single result row, single column: the row count
    row_count = rows[0][0]
    return row_count >= 1

def check_exist_lice(locality, year):
    """Tell whether the lice table already holds rows for a locality and year.

    Parameters:
    -----------
    locality : int
        Locality number
    year : int
        Year to look for

    Returns:
    --------
    bool: True if at least one matching row exists, False otherwise
    """
    # Register the Cassandra table as a temp view so it can be queried with SQL
    reader = spark.read.format("org.apache.spark.sql.cassandra")
    lice_table = reader.options(table = 'lice_data_full', keyspace="compulsory").load()
    lice_table.createOrReplaceTempView('lice_data_full')

    query = f"SELECT count(*) FROM lice_data_full WHERE year = {year} AND localityno = {locality}"
    rows = spark.sql(query).collect()
    # Single result row, single column: the row count
    row_count = rows[0][0]
    return row_count >= 1

def get_one_week_fish_data(year, week, access_token):
    """Function to get fish data from Barentswatch API.

    Parameters:
    -----------
    year : int
        Year of data
    week : int
        Week of data
    access_token : str
        Access token from Barentswatch API

    Returns:
    --------
    json: decoded json body of the response

    Raises:
    -------
    requests.exceptions.RequestException
        If the request fails, times out or the API answers with an error status
    """

    # Set url to correct API address
    url = f"https://www.barentswatch.no/bwapi/v1/geodata/fishhealth/locality/{year}/{week}"

    headers = {"Authorization": "Bearer " + access_token}

    # A timeout keeps a stalled connection from blocking the whole yearly loop
    response = requests.get(url, headers=headers, timeout=30)
    # Surface HTTP errors (e.g. an expired token) here instead of handing an
    # error body back to the caller as if it were data
    response.raise_for_status()
    return response.json()

def get_one_year_fish_data(year, access_token):
    """Fetch one year of fish data from the Barentswatch API and store it in Cassandra.

    Nothing is fetched when the database already holds rows for the year.

    Parameters:
    -----------
    year : int
        Year of data
    access_token : str
        Access token from Barentswatch API

    Returns:
    --------
    None. The data is written to the 'fish_data_full' table; if the write
    fails a RuntimeWarning is issued and the data is not stored.
    """
    if check_exist_fish(year):
        return None

    # Weeks 1-52.
    # NOTE(review): ISO years can have 53 weeks (2015 is one); week 53 is never
    # requested here - confirm whether that is intended.
    frames = []
    for week in range(1, 53):
        payload = get_one_week_fish_data(year = year, week = week, access_token = access_token)
        frame = pd.DataFrame(payload["localities"])
        frame["year"] = year
        frame["week"] = week
        frames.append(frame)

    # Concatenate once instead of growing a frame inside the loop, which
    # re-copies all earlier rows on every iteration
    df = pd.concat(frames, ignore_index=True)

    # Lowercase column names
    df.columns = df.columns.str.lower()
    try:
        write_to_cassandra(df = df, table_name = "fish_data_full")
    except Exception as exc:
        # Stay best-effort (no crash), but make the failure visible instead of
        # silently dropping a whole year of data
        warnings.warn(
            f"Writing fish data for {year} to Cassandra failed: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return None

def get_one_week_lice_data(localty, year, week, access_token):
    """Function to get lice count data from Barentswatch API.

    Parameters:
    -----------
    localty : int
        Locality number
    year : int
        Year of data
    week : int
        Week of data
    access_token : str
        Access token from Barentswatch API

    Returns:
    --------
    json: decoded json body of the response

    Raises:
    -------
    requests.exceptions.RequestException
        If the request fails, times out or the API answers with an error status
    """

    # Set url to correct API address
    url = f'https://www.barentswatch.no/bwapi/v1/geodata/fishhealth/locality/{localty}/{year}/{week}'
    headers = {"Authorization": "Bearer " + access_token}

    # A timeout keeps a stalled connection from blocking the whole yearly loop
    response = requests.get(url, headers=headers, timeout=30)
    # Surface HTTP errors (e.g. an expired token) here instead of handing an
    # error body back to the caller as if it were data
    response.raise_for_status()
    return response.json()

def get_one_year_lice_data(locality, year, access_token):
    """
    Fetch one year of lice count data for one locality from the Barentswatch
    API and store it in Cassandra.

    Nothing is fetched when the database already holds rows for the locality
    and year.

    Parameters:
    -----------
    locality : int
        Locality number
    year : int
        Year of data
    access_token : str
        Access token from Barentswatch API

    Returns:
    --------
    None. The data is written to the 'lice_data_full' table; if the write
    fails a RuntimeWarning is issued and the data is not stored.

    Raises:
    -------
    KeyError
        If a weekly response lacks 'localityWeek' or one of the columns that
        are dropped below.
    """

    if check_exist_lice(locality, year):
        return None

    # Columns that contain purely None or nested dictionaries
    dropped_columns = ["bathTreatments", "cleanerFish", "inFeedTreatments",
                       "mechanicalRemoval", "timeSinceLastChitinSynthesisInhibitorTreatment"]

    # Weeks 1-52.
    # NOTE(review): ISO years can have 53 weeks; week 53 is never requested
    # here - confirm whether that is intended.
    frames = []
    for week in range(1, 53):
        payload = get_one_week_lice_data(localty = locality, year = year, week = week, access_token = access_token)
        week_record = payload["localityWeek"]
        # Wrap every value in a one-element list so the record becomes a
        # single-row dataframe; a new dict is built so the API response is
        # not modified while it is being iterated
        frame = pd.DataFrame({key: [value] for key, value in week_record.items()})
        frame = frame.drop(columns = dropped_columns)
        frame["year"] = year
        frame["week"] = week
        frames.append(frame)

    # Concatenate once instead of growing a frame inside the loop, which
    # re-copies all earlier rows on every iteration
    df = pd.concat(frames, ignore_index=True)

    # Lowercase column names
    df.columns = df.columns.str.lower()
    try:
        write_to_cassandra(df = df, table_name = "lice_data_full")
    except Exception as exc:
        # Stay best-effort (no crash), but make the failure visible instead of
        # silently dropping a whole year of data
        warnings.warn(
            f"Writing lice data for locality {locality}, {year} to Cassandra failed: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return None

def clean_table(table_name):
    """Empty a table in the cassandra database by truncating it

    Parameters:
    -----------
    table_name : str
        Name of the table to truncate
    """
    # Issued through the driver session, not through spark
    statement = f"TRUNCATE {table_name}"
    session.execute(statement)


# Request a token once; the test cells further down pass it to the fetch functions
access_token = get_access_token()

## Testing the functions

### Fish data

In [61]:
# Start from a known state: truncate the fish table, then read it back to
# show that it holds no rows
table_name = 'fish_data_full'
clean_table(table_name)

# Register the Cassandra table as a temp view so it can be queried with SQL
(spark.read.format("org.apache.spark.sql.cassandra")
 .options(table=table_name, keyspace="compulsory")
 .load()
 .createOrReplaceTempView(table_name))

fish_data = spark.sql(f"select * from {table_name}").toPandas()
fish_data.head()

Unnamed: 0,localityweekid,avgadultfemalelice,hascleanerfishdeployed,hasila,hasmechanicalremoval,haspd,hasreportedlice,hassalmonoids,hassubstancetreatments,infilteredselection,...,isonland,isslaughterholdingcage,lat,localityno,lon,municipality,municipalityno,name,week,year


In [62]:
# Next, we fetch data for 2015. The call is repeated on purpose: the second
# call should find the year already stored and return without writing again.
get_one_year_fish_data(year = 2015, access_token = access_token)
get_one_year_fish_data(year = 2015, access_token = access_token)

In [64]:
# Then we read the table again to see that it has been filled (only once)
(spark.read.format("org.apache.spark.sql.cassandra")
 .options(table=table_name, keyspace="compulsory")
 .load()
 .createOrReplaceTempView(table_name))

fish_data = spark.sql(f"select * from {table_name}").toPandas()
fish_data

Unnamed: 0,localityweekid,avgadultfemalelice,hascleanerfishdeployed,hasila,hasmechanicalremoval,haspd,hasreportedlice,hassalmonoids,hassubstancetreatments,infilteredselection,...,isonland,isslaughterholdingcage,lat,localityno,lon,municipality,municipalityno,name,week,year
0,42462,0.06,False,False,False,True,True,True,False,True,...,False,False,64.092247,10398,10.115983,Åfjord,1630,Seiskjæra,33,2015
1,126624,,False,False,False,False,False,True,False,True,...,False,False,64.949883,12730,11.573300,Nærøy,1751,Båfjorden,50,2015
2,178057,0.25,False,False,False,False,True,True,False,True,...,False,False,61.079498,12179,5.809900,Høyanger,1416,Oslandsurda,28,2015
3,76858,0.03,False,False,False,True,True,True,False,True,...,False,False,61.876534,12224,5.722767,Bremanger,1438,Isane,10,2015
4,75282,0.00,False,False,False,False,True,True,False,True,...,False,False,62.348949,23695,6.910767,Stranda,1525,Skotungneset,51,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88659,187742,,False,False,False,False,False,True,False,True,...,False,False,64.750847,31357,11.189033,Nærøy,1751,Geitryggen,17,2015
88660,208755,,False,False,False,False,False,True,False,True,...,False,False,66.348732,32037,12.779417,Lurøy,1834,Grasholman,16,2015
88661,289605,,False,False,False,False,False,True,False,True,...,False,True,66.367302,10978,12.371417,Lurøy,1834,Naustholmen,50,2015
88662,817388,,False,False,False,False,False,False,False,True,...,False,False,70.601913,17057,26.954140,Lebesby,2022,Store Brattholmen,20,2015


About 88k rows — that checks out!

### Lice data

In [67]:
# Start from a known state: truncate the lice table, then read it back to
# show that it holds no rows
table_name = 'lice_data_full'
clean_table(table_name)

# Register the Cassandra table as a temp view so it can be queried with SQL
(spark.read.format("org.apache.spark.sql.cassandra")
 .options(table=table_name, keyspace="compulsory")
 .load()
 .createOrReplaceTempView(table_name))

lice_data = spark.sql(f"select * from {table_name}").toPandas()
lice_data.head()

Unnamed: 0,id,avgadultfemalelice,avgmobilelice,avgstationarylice,hasbathtreatment,hascleanerfishdeployed,hasinfeedtreatment,hasmechanicalremoval,hasreportedlice,hassalmonoids,isfallow,isslaughterholdingcage,localityno,seatemperature,week,year


In [68]:
# Next, we fetch data for 2017 and locality 24175. The call is repeated on
# purpose: the second call should find the data already stored and return
# without writing again.

get_one_year_lice_data(locality = 24175, year = 2017, access_token = access_token)
get_one_year_lice_data(locality = 24175, year = 2017, access_token = access_token)

In [69]:
# Then we read the table again to see that it has been filled (only once)
(spark.read.format("org.apache.spark.sql.cassandra")
 .options(table=table_name, keyspace="compulsory")
 .load()
 .createOrReplaceTempView(table_name))

lice_data = spark.sql(f"select * from {table_name}").toPandas()
lice_data

Unnamed: 0,id,avgadultfemalelice,avgmobilelice,avgstationarylice,hasbathtreatment,hascleanerfishdeployed,hasinfeedtreatment,hasmechanicalremoval,hasreportedlice,hassalmonoids,isfallow,isslaughterholdingcage,localityno,seatemperature,week,year
0,279982,0.0,0.0,0.0,False,False,False,False,True,True,False,False,24175,5.21,2,2017
1,543213,0.46,1.18,0.47,False,False,False,False,True,True,False,False,24175,4.84,52,2017
2,497754,0.02,0.05,0.0,False,False,False,False,True,True,False,False,24175,9.5,28,2017
3,466073,0.01,0.09,0.0,False,False,False,False,True,True,False,False,24175,3.1,12,2017
4,506967,0.05,0.09,0.3,False,False,False,False,True,True,False,False,24175,11.53,33,2017
5,514468,0.1,0.14,0.18,False,False,False,False,True,True,False,False,24175,10.56,37,2017
6,474017,0.06,0.06,0.06,False,False,False,False,True,True,False,False,24175,3.5,16,2017
7,459867,0.02,0.08,0.02,False,False,False,False,True,True,False,False,24175,3.4,9,2017
8,485426,0.02,0.02,0.02,False,False,False,False,True,True,False,False,24175,5.6,22,2017
9,483708,0.03,0.08,0.0,False,False,False,False,True,True,False,False,24175,5.6,21,2017


Checks out!