# Utilities notebook

## Setting up functions

In [75]:
import os
from pyspark.sql import SparkSession
import requests
import pandas as pd
import numpy as np
from cassandra.cluster import Cluster
from exceptions import *


# Set pyspark env

os.environ["PYSPARK_PYTHON"] = "python"

spark = SparkSession.builder.appName('SparkCassandraApp').\
    config('spark.jars.packages', 'com.datastax.spark:spark-cassandra-connector_2.12:3.4.1').\
    config('spark.cassandra.connection.host', 'localhost').\
    config('spark.sql.extensions', 'com.datastax.spark.connector.CassandraSparkExtensions').\
    config('spark.sql.catalog.mycatalog', 'com.datastax.spark.connector.datasource.CassandraCatalog').\
    config('spark.cassandra.connection.port', '9042').getOrCreate()


cluster = Cluster(['localhost'], port=9042)
session = cluster.connect()
session.set_keyspace('compulsory')

In [2]:
# Set up request
def get_access_token():
    """Function to get access token from Barentswatch API

    Returns:
        str: Access token
    """
    url = "https://id.barentswatch.no/connect/token"
    # Read secret key from file
    secret_key = open(r'..\..\..\IND320\No_sync\fish_api', 'r').read()

    # Set up request to get access token
    headers = {
        "Content-Type": "application/x-www-form-urlencoded"
    }
    data = {
        "client_id": "erlend.risvik@gmail.com:fishclient",
        "scope": "api",
        "client_secret": secret_key,
        "grant_type": "client_credentials"
    }

    response = requests.post(url, headers=headers, data=data)
    return response.json()['access_token']

def convert_to_spark(df):
    """Function to convert pandas dataframe to spark dataframe

    Parameters:
    -----------
    df : pandas dataframe
        Dataframe to convert

    Returns:
    --------
    spark dataframe
    """
    return spark.createDataFrame(df)

def write_to_cassandra(df, table_name):
    """Function to write data to cassandra database

    Parameters:
    -----------
    df : pandas dataframe
        Dataframe to write
    table_name : str
        Name of table to write to
    """
    
    df_spark = convert_to_spark(df)
    (df_spark.write
     .format("org.apache.spark.sql.cassandra")
     .options(table=table_name, keyspace="compulsory")
     .mode("append")
     .save())

def check_exist_fish(year):
    """Function to check if data exists in database.'
    Parameters:
    -----------
    year : int
        Year of data

    Returns:
    --------
    bool: True if data exists, False if not
    """

    (spark.read.format("org.apache.spark.sql.cassandra")
    .options(table = 'fish_data_full', keyspace="compulsory")
    .load()
    .createOrReplaceTempView('fish_data_full'))
    
    check = spark.sql(f"SELECT count(*) FROM fish_data_full WHERE year = {year}")   
    return check.collect()[0][0] >= 1 

def check_exist_lice(locality, year):
    """Function to check if data exists in database.'
    Parameters:
    -----------
    locality : int
        Locality number
    year : int
        Year of data

    Returns:
    --------
    bool: True if data exists, False if not
    """

    (spark.read.format("org.apache.spark.sql.cassandra")
    .options(table = 'lice_data_full', keyspace="compulsory")
    .load()
    .createOrReplaceTempView('lice_data_full'))
    
    check = spark.sql(f"SELECT count(*) FROM lice_data_full WHERE year = {year} AND localityno = {locality}")   
    return check.collect()[0][0] >= 1 

def get_one_week_fish_data(year, week, access_token):
    """Function to get fish data from Barentswatch API.
    
    Parameters:
    -----------
    year : int
        Year of data
    week : int
        Week of data
    access_token : str
        Access token from Barentswatch API
    Returns:
    --------
    json: json object with data
    """

    # Set url to correct API address
    url = f"https://www.barentswatch.no/bwapi/v1/geodata/fishhealth/locality/{year}/{week}"

    headers = {
        "Authorization": "Bearer "+ access_token}

    df = requests.get(url, headers = headers).json()
    return df

def get_one_year_fish_data(year, access_token):
    """Function to get all fish data from Barentswatch API limited to one year.

    Parameters:
    -----------
    access_token : str
        Access token from Barentswatch API
    Returns:
    --------
    df: pandas dataframe with data
    """
    if check_exist_fish(year):
        return None

    # Set list of weeks (1-52).
    weeks = np.arange(1, 53)
    df = pd.DataFrame()
    for week in weeks:
        data = get_one_week_fish_data(year = year, week = week, access_token = access_token)["localities"]
        data = pd.DataFrame(data)
        data["year"] = year
        data["week"] = week
        df = pd.concat([df, data], ignore_index=True)

    df.columns = df.columns.str.lower()
    try:
        write_to_cassandra(df = df, table_name = "fish_data_full")
    except:
        return None

def get_one_week_lice_data(localty, year, week, access_token):
    """Function to get lice count data from Barentswatch API.

    Parameters:
    -----------
    localty : int
        Localty number
    year : int
        Year of data
    week : int
        Week of data
    access_token : str
        Access token from Barentswatch API

    Returns:
    --------
    json: json object with data 
    """

    # Set url to correct API address
    url = f'https://www.barentswatch.no/bwapi/v1/geodata/fishhealth/locality/{localty}/{year}/{week}'
    headers = {
        "Authorization": "Bearer "+ access_token}
    
    df = requests.get(url, headers=headers).json()
    return df

def get_one_year_lice_data(locality, year, access_token):
    """
    Function to get all lice count data from Barentswatch API limited to one year.

    Parameters:
    -----------
    localty : int
        Localty number
    year : int
        Year of data
    access_token : str
        Access token from Barentswatch API
    Returns:
    --------
    df: pandas dataframe with data
    """

    if check_exist_lice(locality, year):
        return None

    # Set list of weeks (1-52).
    weeks = np.arange(1, 53)
    df = pd.DataFrame()
    for week in weeks:
        data = get_one_week_lice_data(localty = locality, year = year, week = week, access_token = access_token)["localityWeek"]
        for key, value in data.items():
            # Set to list to make it compatible to convert to pandas dataframe
            data[key] = [value]
        # Dropping columns that contain purely None and nested dictionaries
        data = pd.DataFrame(data).drop(columns = ["bathTreatments", "cleanerFish", "inFeedTreatments", \
                                                  "mechanicalRemoval", "timeSinceLastChitinSynthesisInhibitorTreatment"]) 
        data["year"] = year
        data["week"] = week
        df = pd.concat([df, data], ignore_index=True)
    # Lowercase column names
    df.columns = df.columns.str.lower()
    try:
        write_to_cassandra(df = df, table_name = "lice_data_full")
    except:
        return None

def clean_table(table_name):
    """Function to clean table in cassandra database

    Parameters:
    -----------
    table_name : str
        Name of table to clean
    """
    session.execute(f"TRUNCATE {table_name}")


def get_df(table_name):

    (spark.read.format("org.apache.spark.sql.cassandra")
    .options(table=table_name, keyspace="compulsory")
    .load()
    .createOrReplaceTempView(table_name))

    df = spark.sql(f"select * from {table_name}").toPandas()
    df = df.sort_values(by=['week'])
    # quick fix to fix datatype.
    if table_name == 'fish_data_full':
        df['lat'] = df['lat'].astype(np.float64)
        df['lon'] = df['lon'].astype(np.float64)

    return df

access_token = get_access_token()

In [None]:
clean_table('lice_data_full')

## Testing the functions

### Fish data

In [71]:
# Start by listing the empty table
table_name = 'fish_data_full'
clean_table(table_name)

(spark.read.format("org.apache.spark.sql.cassandra")
 .options(table=table_name, keyspace="compulsory")
 .load()
 .createOrReplaceTempView(table_name))

fish_data = spark.sql(f"select * from {table_name}").toPandas()
fish_data.head()

Unnamed: 0,localityweekid,avgadultfemalelice,hascleanerfishdeployed,hasila,hasmechanicalremoval,haspd,hasreportedlice,hassalmonoids,hassubstancetreatments,infilteredselection,...,isonland,isslaughterholdingcage,lat,localityno,lon,municipality,municipalityno,name,week,year


In [72]:
# Next, we fetch data for 2015. We do it twice to see if it gets updated only once.
get_one_year_fish_data(year = 2015, access_token = access_token)
get_one_year_fish_data(year = 2015, access_token = access_token)

In [73]:
# Then we print the table again to see if it has been updated (only once)
(spark.read.format("org.apache.spark.sql.cassandra")
 .options(table=table_name, keyspace="compulsory")
 .load()
 .createOrReplaceTempView(table_name))

fish_data = spark.sql(f"select * from {table_name}").toPandas()
fish_data

Unnamed: 0,localityweekid,avgadultfemalelice,hascleanerfishdeployed,hasila,hasmechanicalremoval,haspd,hasreportedlice,hassalmonoids,hassubstancetreatments,infilteredselection,...,isonland,isslaughterholdingcage,lat,localityno,lon,municipality,municipalityno,name,week,year
0,764221,,False,False,False,False,False,False,False,True,...,False,False,60.471451,18496,6.764933,Ullensvang,1231,Kaland,10,2015
1,88187,0.32,False,False,False,True,True,True,False,True,...,False,False,60.407501,10338,6.356717,Kvam,1238,Djupevik,20,2015
2,339587,,False,False,False,False,False,True,False,True,...,False,False,61.212399,12158,7.093267,Sogndal,1420,Skjersnes,24,2015
3,43161,0.00,False,False,False,False,True,True,False,True,...,False,False,59.832218,28096,5.984267,Kvinnherad,1224,Slåttenes,39,2015
4,872461,,False,False,False,False,False,False,False,True,...,False,False,63.954182,10248,10.085183,Åfjord,1630,Eidskjæra Nø,51,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88659,5943,,False,False,False,False,False,True,True,True,...,False,False,61.297916,32317,4.660450,Askvoll,1428,Guriøyna,4,2015
88660,31804,,False,False,False,True,False,True,False,True,...,False,False,63.822449,33557,8.740750,Frøya,1620,Langskjæra II,1,2015
88661,799676,,False,False,False,False,False,False,False,True,...,False,False,62.928532,28776,7.580517,Eide,1551,Krekvikbogen,27,2015
88662,300508,,False,False,False,False,False,True,False,True,...,False,False,63.462101,31257,7.856233,Smøla,1573,Hjortholman 2,47,2015


88k rows checks out!

### Lice data

In [134]:
# Start by listing the empty table
table_name = 'lice_data_full'
clean_table(table_name)

(spark.read.format("org.apache.spark.sql.cassandra")
 .options(table=table_name, keyspace="compulsory")
 .load()
 .createOrReplaceTempView(table_name))

lice_data = spark.sql(f"select * from {table_name}").toPandas()
lice_data.head()

Unnamed: 0,id,avgadultfemalelice,avgmobilelice,avgstationarylice,hasbathtreatment,hascleanerfishdeployed,hasinfeedtreatment,hasmechanicalremoval,hasreportedlice,hassalmonoids,isfallow,isslaughterholdingcage,localityno,seatemperature,week,year


In [75]:
# Next, we fetch data for 2015 and locality 24175. We do it twice to see if it gets updated only once.

get_one_year_lice_data(locality = 24175, year = 2017, access_token = access_token)
get_one_year_lice_data(locality = 24175, year = 2017, access_token = access_token)

In [135]:
# Then we print the table again to see if it has been updated (only once)
(spark.read.format("org.apache.spark.sql.cassandra")
 .options(table=table_name, keyspace="compulsory")
 .load()
 .createOrReplaceTempView(table_name))

lice_data = spark.sql(f"select * from {table_name}").toPandas()
lice_data

Unnamed: 0,id,avgadultfemalelice,avgmobilelice,avgstationarylice,hasbathtreatment,hascleanerfishdeployed,hasinfeedtreatment,hasmechanicalremoval,hasreportedlice,hassalmonoids,isfallow,isslaughterholdingcage,localityno,seatemperature,week,year


Checks out!

## Weather data

In [3]:
import ast

SECRET_INFO = open("../../NO_SYNC/weather_api", 'r').read().replace('\n', '')
SECRET_ID = ast.literal_eval(SECRET_INFO)["client_id"]

In [4]:
os.environ["PYSPARK_PYTHON"] = "python"

spark = SparkSession.builder.appName('SparkCassandraApp').\
    config('spark.jars.packages', 'com.datastax.spark:spark-cassandra-connector_2.12:3.4.1').\
    config('spark.cassandra.connection.host', 'localhost').\
    config('spark.sql.extensions', 'com.datastax.spark.connector.CassandraSparkExtensions').\
    config('spark.sql.catalog.mycatalog', 'com.datastax.spark.connector.datasource.CassandraCatalog').\
    config('spark.cassandra.connection.port', '9042').getOrCreate()

(spark.read.format("org.apache.spark.sql.cassandra")
 .options(table="fish_data_full", keyspace="compulsory")
 .load()
 .createOrReplaceTempView("fish_data_full"))

In [136]:
lice_data = get_df('lice_data_full')
fish_data = get_df('fish_data_full')

In [137]:
lice_data

Unnamed: 0,id,avgadultfemalelice,avgmobilelice,avgstationarylice,hasbathtreatment,hascleanerfishdeployed,hasinfeedtreatment,hasmechanicalremoval,hasreportedlice,hassalmonoids,isfallow,isslaughterholdingcage,localityno,seatemperature,week,year
26,1343273,0.0,0.08,0.04,False,False,False,True,True,True,False,False,12224,7.94,1,2022
6,1346842,0.03,0.13,0.1,False,False,False,False,True,True,False,False,12224,7.91,2,2022
8,1349367,0.18,1.38,0.05,False,False,False,False,True,True,False,False,12224,7.43,3,2022
2,1352417,0.25,1.35,0.12,False,False,False,False,True,True,False,False,12224,6.89,4,2022
25,1354943,0.3,2.32,0.0,False,False,False,False,True,True,False,False,12224,6.2,5,2022
47,1357469,0.3,2.92,0.2,False,False,False,False,True,True,False,False,12224,6.0,6,2022
41,1361579,0.43,2.42,0.12,False,False,False,False,True,True,False,False,12224,5.23,7,2022
15,1365695,0.02,0.08,0.08,False,False,False,False,True,True,False,False,12224,5.74,8,2022
50,1369285,0.03,0.55,0.22,False,False,False,False,True,True,False,False,12224,5.74,9,2022
11,1372349,0.1,1.35,0.1,False,False,False,False,True,True,False,False,12224,6.19,10,2022


In [80]:
def get_cords(df, localityno):
    """Function to get coordinates from dataframe

    Parameters:
    -----------
    df : pandas dataframe
        Dataframe to get coordinates from
    localityno : int
        Locality number to get coordinates from

    Returns:
    --------
    list: list of tuples with coordinates
    """
    # there are multiple rows with the same localityno, so we need to get the first one
    subset = df[df["localityno"] == localityno].iloc[0:1]

    return float(subset["lat"]), float(subset["lon"])

def get_nearest_stations(lat, lon):
    """Function to get nearest weather station from frost.met.no

    Parameters:
    -----------
    lat : float
        Latitude
    lon : float
        Longitude

    Returns:
    --------
    json: json object with data
    """
    # Set up parameters

    endpoint = 'https://frost.met.no/sources/v0.jsonld'
    parameters = {
    "geometry" : f"nearest(POINT({lon} {lat}))",
    "nearestmaxcount": 20,
    }

    # Issue an HTTP GET request
    r = requests.get(endpoint, parameters, auth=(SECRET_ID,''))
    # Extract JSON data
    json = r.json()

    # Check if the request worked, print out any errors
    if r.status_code == 200:
        data = json['data']
        # extract the list of source ids and distance as a tuple
        data = [(d['id'], d['distance']) for d in data]
        return data
    else:
        raise FetchDataError(f"Request failed with status code {r.status_code}")
    
def get_daily_data(df, localityno, year):
    """Function to get daily weather data from frost.met.no

    Parameters:
    -----------
    df : pandas dataframe
        Dataframe to get coordinates from
    localityno : int
        Locality number
    year : int
        Year of data

    Returns:
    --------
    df3: pandas dataframe with data
    """ 
    
    lat, lon = get_cords(df = df, localityno = localityno)
    stations = get_nearest_stations(lat, lon)
    ids = [d[0] for d in stations]
    distances = [d[1] for d in stations]
   
    endpoint = 'https://frost.met.no/observations/v0.jsonld'

    for idx, id in enumerate(ids):
        parameters = {
            'sources': id,
            'elements': 'sum(precipitation_amount P1D), mean(air_temperature P1D), mean(wind_speed P1D), mean(relative_humidity P1D)',
            'referencetime': f"{year}-01-01/{year}-12-31",
            'levels' : 'default',
            'timeoffsets': 'default'
        }

    # Issue an HTTP GET request
        r = requests.get(endpoint, parameters, auth=(SECRET_ID,''))
        # Extract JSON data
        json = r.json()

        df = pd.DataFrame()
        try: 
            data = json['data']
            for i in range(len(data)):
                row = pd.DataFrame(data[i]['observations'])
                row['referenceTime'] = data[i]['referenceTime']
                row['sourceId'] = data[i]['sourceId']
                df = pd.concat([df, row], ignore_index=True)

            df = df.reset_index(drop=True)

            columns = ['sourceId','referenceTime','elementId','value','unit','timeOffset']
            df2 = df[columns].copy()
            df2['referenceTime'] = pd.to_datetime(df2['referenceTime']).dt.strftime('%Y-%m-%d')
            
            df3 = df2.pivot(index='referenceTime', columns='elementId', values='value').reset_index()      
            df3.columns = ['date', 'temperature', 'humidity', 'wind_speed', 'precipitation']
        except:
            if idx == len(ids)-1 or distances[idx]>50:
                raise NoDataError("No data available")
            continue
        
        # add the distance as a column
        df3['distance'] = distances[idx]
        df3['localityno'] = localityno
        return df3
    
weather_data = get_daily_data(df = fish_data, localityno = 23695, year = 2015)
weather_data.head()

Unnamed: 0,date,temperature,humidity,wind_speed,precipitation,distance,localityno
0,2015-01-01,1.3,92.0,12.8,0.5,18.074864,23695
1,2015-01-02,-2.4,98.0,3.4,7.9,18.074864,23695
2,2015-01-03,-4.6,97.0,1.8,34.1,18.074864,23695
3,2015-01-04,-5.4,97.0,2.1,52.5,18.074864,23695
4,2015-01-05,-0.7,98.0,6.4,8.5,18.074864,23695


In [86]:
def convert_to_weekly_data(weather_data):
    weather_data['date'] = pd.to_datetime(weather_data['date'])
    weather_data['week'] = weather_data['date'].dt.isocalendar().week
    weather_data['year'] = weather_data['date'].dt.isocalendar().year

    # create the weekly_weather_data_mean DataFrame where we aggregate by weekly means
    weekly_weather_data_mean = pd.DataFrame()
    weekly_weather_data_mean['week'] = weather_data['week']
    weekly_weather_data_mean['humidity'] = weather_data['humidity']
    weekly_weather_data_mean['temperature'] = weather_data['temperature']
    weekly_weather_data_mean['wind_speed'] = weather_data['wind_speed']
    weekly_weather_data_mean = weekly_weather_data_mean.groupby('week').mean()

    # same for precipitation, but we use weekly sum
    weekly_weather_data_sum = pd.DataFrame()
    weekly_weather_data_sum['week'] = weather_data['week']
    weekly_weather_data_sum['precipitation'] = weather_data['precipitation']
    weekly_weather_data_sum = weekly_weather_data_sum.groupby('week').sum()

    # merging the two dataframes
    weekly_weather_data = pd.merge(weekly_weather_data_mean, weekly_weather_data_sum, left_index=True, right_index=True)

    # add the year, week and localityno columns
    weekly_weather_data['year'] = weather_data['year'][0]
    weekly_weather_data['week'] = weekly_weather_data.index
    weekly_weather_data['localityno'] = weather_data['localityno'][0]
    weekly_weather_data = weekly_weather_data.reset_index(drop=True)

    # create a id column that is the concatenation of year_week_localityno
    weekly_weather_data['id'] = weekly_weather_data['year'].astype(str) + '_' + weekly_weather_data['week'].astype(str) + '_' + weekly_weather_data['localityno'].astype(str)

    return weekly_weather_data

weekly_weather_data = convert_to_weekly_data(weather_data)
weekly_weather_data.head()

Unnamed: 0,humidity,temperature,wind_speed,precipitation,year,week,localityno,id
0,96.0,-2.775,5.025,95.0,2015,1,23695,2015_1_23695
1,93.142857,-1.771429,7.042857,34.9,2015,2,23695,2015_2_23695
2,91.428571,-2.828571,8.614286,35.5,2015,3,23695,2015_3_23695
3,72.285714,-4.228571,6.7,6.6,2015,4,23695,2015_4_23695
4,82.0,-2.957143,5.942857,8.4,2015,5,23695,2015_5_23695


In [87]:
# setup cassandra table

weekly_weather_spark = convert_to_spark(weekly_weather_data)

cassandra_types = {
    'temperature': 'float',
    'humidity': 'float',
    'wind_speed': 'float',
    'precipitation': 'float',
    'year': 'int',
    'week': 'int',
    'localityno': 'int'
}

table_name = "weekly_weather_data"
cql_create_table = f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
        id text PRIMARY KEY,
        {', '.join([f"{i} {cassandra_types[i]}" for i in weekly_weather_spark.columns if i != "id"])}
    );
""".replace("\n", " ")

session.execute(f"DROP TABLE IF EXISTS compulsory.{table_name};")
session.execute(cql_create_table)

<cassandra.cluster.ResultSet at 0x2db2c058a10>

In [88]:
(weekly_weather_spark
.write
.format("org.apache.spark.sql.cassandra")
.options(table=table_name, keyspace="compulsory")
.mode("append")
.save())

In [141]:
table_name = "weekly_weather_data"

(spark
.read
.format("org.apache.spark.sql.cassandra")
.options(table=table_name, keyspace="compulsory")
.load()
.createOrReplaceTempView(table_name))

weekly_weather_spark = spark.sql(f"select * from {table_name}").toPandas()
weekly_weather_spark

Unnamed: 0,id,humidity,localityno,precipitation,temperature,week,wind_speed,year
0,2015_27_45157,85.714287,45157,19.299999,13.828571,27,3.400000,2015
1,2015_3_10265,78.714287,10265,12.200000,-5.271429,3,2.871428,2015
2,2015_5_10265,80.142860,10265,31.000000,-0.528571,5,2.757143,2015
3,2021_27_12224,80.428574,12224,22.299999,11.971429,27,1.828571,2021
4,2015_40_12837,78.714287,12837,0.100000,6.757143,40,1.714286,2015
...,...,...,...,...,...,...,...,...
418,2015_24_10265,75.571426,10265,33.500000,7.185714,24,3.457143,2015
419,2015_21_10265,68.571426,10265,17.200001,6.971428,21,2.885714,2015
420,2015_15_12562,65.000000,12562,9.500000,1.885714,15,5.000000,2015
421,2015_53_45157,60.000000,45157,0.000000,3.133333,53,12.333333,2015


In [139]:
weekly_weather_spark

Unnamed: 0,id,avgadultfemalelice,avgmobilelice,avgstationarylice,hasbathtreatment,hascleanerfishdeployed,hasinfeedtreatment,hasmechanicalremoval,hasreportedlice,hassalmonoids,isfallow,isslaughterholdingcage,localityno,seatemperature,week,year
0,1476891,0.44,5.46,0.72,False,False,True,False,True,True,False,False,12224,9.99,45,2022
1,1428977,,,,False,False,False,False,False,True,True,False,12224,,28,2022
2,1466961,0.0,0.27,0.46,False,False,False,False,True,True,False,False,12224,11.1,42,2022
3,1361579,0.43,2.42,0.12,False,False,False,False,True,True,False,False,12224,5.23,7,2022
4,1453065,,,,False,False,False,False,False,True,True,False,12224,,37,2022
5,1374883,0.03,1.62,0.12,False,False,False,False,True,True,False,False,12224,7.54,11,2022
6,1409821,,,,False,False,False,False,False,True,True,False,12224,,22,2022
7,1365695,0.02,0.08,0.08,False,False,False,False,True,True,False,False,12224,5.74,8,2022
8,1440294,,,,False,False,False,False,False,True,True,False,12224,,32,2022
9,1511577,0.23,1.08,0.59,False,False,False,False,True,True,False,False,12224,7.79,52,2022


In [116]:
def check_exist_weather(locality, year):
    """Function to check if data exists in database.'
    
    Parameters:
    -----------
    locality : int
        Locality number
    year : int
        Year of data

    Returns:
    --------
    bool: True if data exists, False if not
    """

    (spark.read.format("org.apache.spark.sql.cassandra")
    .options(table = 'weekly_weather_data', keyspace="compulsory")
    .load()
    .createOrReplaceTempView('weekly_weather_data'))
    
    check = spark.sql(f"SELECT count(*) FROM weekly_weather_data WHERE year = {year} AND localityno = {locality}")   
    return check.collect()[0][0] >= 1 

check_exist_weather(locality = 23695, year = 2015), check_exist_weather(locality = 23695, year = 2016)

(True, False)

In [126]:
def get_one_year_weather_data(df, locality, year):
    """Function to get all weather data from frost.met.no limited to one year.

    Parameters:
    -----------
    localty : int
        Localty number
    year : int
        Year of data
    access_token : str
        Access token from Barentswatch API
    Returns:
    --------
    df: pandas dataframe with data
    """
    if check_exist_weather(locality=locality, year = year):
        raise DataExistsError("Data exists")

    try:
        df = get_daily_data(df = df, localityno = locality, year = year)
    except (FetchDataError, NoDataError):
        raise NoDataError
    
    try: 
        weekly_data = convert_to_weekly_data(df)
    except:
        raise NoDataError("No data available")
    try:
        write_to_cassandra(df = weekly_data, table_name = "weekly_weather_data")
    except:
        raise WritingToDatabaseError("Error writing to database")

get_one_year_weather_data(df = fish_data, locality = 23695, year = 2015)

DataExistsError: Data exists

In [96]:
localities = fish_data['localityno'].unique()
localities

array([12394, 14039, 21179, ..., 45155, 45156, 45157])

In [157]:
get_one_year_weather_data(df = fish_data, locality = 10660, year = 2017)

In [158]:
weather_data = get_df('weekly_weather_data')
weather_data["localityno"].unique()

array([12224, 32938, 10265, 15460, 33937, 45157, 23695, 12837, 10660,
       12562])

In [159]:
weather_data[weather_data['localityno'] == 10660]

Unnamed: 0,id,humidity,localityno,precipitation,temperature,week,wind_speed,year
547,2016_1_10660,75.85714,10660,22.200001,-4.285714,1,6.4,2016
417,2016_2_10660,77.14286,10660,18.700001,-2.257143,2,5.785714,2016
29,2016_3_10660,78.428574,10660,54.299999,-1.314286,3,8.4,2016
230,2016_4_10660,76.0,10660,36.099998,0.842857,4,7.7,2016
77,2016_5_10660,75.571426,10660,23.799999,-1.8,5,5.114286,2016
89,2016_6_10660,80.428574,10660,0.5,-0.7,6,3.842857,2016
69,2016_7_10660,80.85714,10660,56.0,-1.085714,7,7.171429,2016
494,2016_8_10660,80.0,10660,43.0,-4.842857,8,4.757143,2016
97,2016_9_10660,83.14286,10660,28.9,-3.171429,9,4.171429,2016
332,2016_10_10660,76.571426,10660,20.4,-2.528571,10,5.457143,2016


Year is shifted one backwards....

In [160]:
from data_utils import *

locality = 10660
year = 2017
df = get_daily_data(df = fish_data, localityno = locality, year = year)


In [167]:
def convert_to_weekly_data(weather_data):
    weather_data['date'] = pd.to_datetime(weather_data['date'])
    weather_data['week'] = weather_data['date'].dt.isocalendar().week
    weather_data['year'] = weather_data['date'].dt.isocalendar().year

    # create the weekly_weather_data_mean DataFrame where we aggregate by weekly means
    weekly_weather_data_mean = pd.DataFrame()
    weekly_weather_data_mean['week'] = weather_data['week']
    weekly_weather_data_mean['humidity'] = weather_data['humidity']
    weekly_weather_data_mean['temperature'] = weather_data['temperature']
    weekly_weather_data_mean['wind_speed'] = weather_data['wind_speed']
    weekly_weather_data_mean = weekly_weather_data_mean.groupby('week').mean()

    # same for precipitation, but we use weekly sum
    weekly_weather_data_sum = pd.DataFrame()
    weekly_weather_data_sum['week'] = weather_data['week']
    weekly_weather_data_sum['precipitation'] = weather_data['precipitation']
    weekly_weather_data_sum = weekly_weather_data_sum.groupby('week').sum()

    # merging the two dataframes
    weekly_weather_data = pd.merge(weekly_weather_data_mean, weekly_weather_data_sum, left_index=True, right_index=True)

    # add the year, week and localityno columns
    weekly_weather_data['year'] = weather_data['year']
    weekly_weather_data['week'] = weekly_weather_data.index
    weekly_weather_data['localityno'] = weather_data['localityno']
    weekly_weather_data = weekly_weather_data.reset_index(drop=True)

    # create a id column that is the concatenation of year_week_localityno
    weekly_weather_data['id'] = weekly_weather_data['year'].astype(str) + '_' + weekly_weather_data['week'].astype(str) + '_' + weekly_weather_data['localityno'].astype(str)

    return weekly_weather_data

convert_to_weekly_data(df)

Unnamed: 0,humidity,temperature,wind_speed,precipitation,year,week,localityno,id
0,75.857143,-4.285714,6.4,22.2,2016,1,10660,2016_1_10660
1,77.142857,-2.257143,5.785714,18.7,2016,2,10660,2016_2_10660
2,78.428571,-1.314286,8.4,54.3,2016,3,10660,2016_3_10660
3,76.0,0.842857,7.7,36.1,2016,4,10660,2016_4_10660
4,75.571429,-1.8,5.114286,23.8,2016,5,10660,2016_5_10660
5,80.428571,-0.7,3.842857,0.5,2016,6,10660,2016_6_10660
6,80.857143,-1.085714,7.171429,56.0,2016,7,10660,2016_7_10660
7,80.0,-4.842857,4.757143,43.0,2016,8,10660,2016_8_10660
8,83.142857,-3.171429,4.171429,28.9,2016,9,10660,2016_9_10660
9,76.571429,-2.528571,5.457143,20.4,2016,10,10660,2016_10_10660
