# Notebook to generate graphs for main

In [3]:
import os
from pyspark.sql import SparkSession
import pandas as pd
from cassandra.cluster import Cluster

# Tell PySpark which Python executable to use.
os.environ["PYSPARK_PYTHON"] = "python"

# Spark session configured with the DataStax Cassandra connector, pointed at
# the Cassandra node on localhost:9042.
spark = (
    SparkSession.builder
    .appName('SparkCassandraApp')
    .config('spark.jars.packages', 'com.datastax.spark:spark-cassandra-connector_2.12:3.4.1')
    .config('spark.cassandra.connection.host', 'localhost')
    .config('spark.sql.extensions', 'com.datastax.spark.connector.CassandraSparkExtensions')
    .config('spark.sql.catalog.mycatalog', 'com.datastax.spark.connector.datasource.CassandraCatalog')
    .config('spark.cassandra.connection.port', '9042')
    .getOrCreate()
)

# Direct Cassandra driver session against the same node, using the
# 'compulsory' keyspace.
cluster = Cluster(['localhost'], port=9042)
session = cluster.connect()
session.set_keyspace('compulsory')

def _get_df(table_name, keyspace="compulsory"):
    """Load a Cassandra table into a pandas DataFrame.

    The table is read through the Spark Cassandra data source and is also
    registered as a Spark temporary view named ``table_name`` so it stays
    queryable through ``spark.sql``.

    Parameters
    ----------
    table_name : str
        Name of the Cassandra table; also used as the temporary view name.
    keyspace : str, optional
        Keyspace that holds the table. Defaults to ``"compulsory"``.

    Returns
    -------
    pandas.DataFrame
        All rows and columns of the table, collected onto the driver.
    """
    table_df = (
        spark.read.format("org.apache.spark.sql.cassandra")
        .options(table=table_name, keyspace=keyspace)
        .load()
    )

    # Register the view so the table remains available to ad-hoc spark.sql queries.
    table_df.createOrReplaceTempView(table_name)

    # Collect the loaded frame directly rather than interpolating the table
    # name into a "select *" SQL string; the resulting rows/columns are the same.
    return table_df.toPandas()


In [2]:
# Load the 'fish_data_full' table into pandas and display it.
fish_data = _get_df('fish_data_full')
fish_data

Unnamed: 0,localityweekid,avgadultfemalelice,hascleanerfishdeployed,hasila,hasmechanicalremoval,haspd,hasreportedlice,hassalmonoids,hassubstancetreatments,infilteredselection,...,isonland,isslaughterholdingcage,lat,localityno,lon,municipality,municipalityno,name,week,year
0,71269,,False,False,False,False,False,True,False,True,...,False,False,62.597851,20796,7.367717,Rauma,1539,Skarbukta,17,2014
1,1223266,,False,False,False,False,False,True,False,True,...,False,False,68.574753,11241,14.850883,Hadsel,1866,Bergvikodden,12,2021
2,184258,,False,False,False,False,False,True,False,True,...,False,False,66.733047,30097,13.331833,Rødøy,1836,Djupvik,6,2014
3,234585,,False,False,False,False,False,False,False,True,...,False,False,65.108650,10858,12.442233,Bindal,1811,Mulingen,25,2014
4,1493843,,False,False,False,False,False,False,False,True,...,False,False,62.538166,39017,6.102650,Giske,1532,Vikane,46,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635723,87715,0.10,False,False,False,True,True,True,False,True,...,False,False,60.127934,13235,6.149933,Kvinnherad,1224,Hessvik,10,2015
635724,192802,0.00,False,False,False,False,True,True,False,True,...,False,False,59.252201,17575,5.390517,Bokn,1145,Tollaksholmen,47,2014
635725,321029,,False,False,False,False,False,True,False,True,...,True,False,69.682747,10765,18.971338,Tromsø,1902,Nfh-Bygg,33,2016
635726,880628,,False,False,False,False,False,False,False,True,...,True,False,61.850784,10851,12.001136,Engerdal,434,Gårdsbruket Skog,28,2013


In [11]:
import plotly.express as px

# Rows of fish_data for 2015, ordered by week.
# The index is reset *after* sorting so it runs 0..n-1 in week order, and the
# frame is built in one expression (no in-place mutation), so re-running this
# cell always yields the same result.
fish_data_2015 = (
    fish_data[fish_data['year'] == 2015]
    .sort_values(by='week')
    .reset_index(drop=True)
)

In [18]:
# Number of non-null 'haspd' entries per week in the 2015 data.
# NOTE(review): 'count' tallies rows regardless of whether haspd is True or
# False; if the number of PD-positive rows is wanted, 'sum' would be needed —
# confirm the intent.
fish_data_2015_agg = (
    fish_data_2015
    .groupby('week')['haspd']
    .count()
    .reset_index()
)
fish_data_2015_agg

Unnamed: 0,week,haspd
0,1,1731
1,2,1729
2,3,1729
3,4,1723
4,5,1722
5,6,1722
6,7,1722
7,8,1717
8,9,1715
9,10,1714


In [19]:
# Bar chart of the weekly 'haspd' counts computed in fish_data_2015_agg.
fig = px.bar(
    data_frame=fish_data_2015_agg,
    x='week',
    y='haspd',
    title='Weekly Histogram',
)
fig.show()

In [4]:
# Load the 'lice_data_full' table, order it by week and renumber the index.
lice_data = (
    _get_df('lice_data_full')
    .sort_values(by=['week'])
    .reset_index(drop=True)
)

In [34]:
# Display lice_data.
lice_data

Unnamed: 0,id,avgadultfemalelice,avgmobilelice,avgstationarylice,hasbathtreatment,hascleanerfishdeployed,hasinfeedtreatment,hasmechanicalremoval,hasreportedlice,hassalmonoids,isfallow,isslaughterholdingcage,localityno,seatemperature,week,year
0,224217,0.00,0.03,0.00,False,False,False,False,True,True,False,False,15462,4.0,1,2015
1,75232,0.37,1.25,2.72,False,False,False,False,True,True,False,False,23695,7.0,1,2015
2,75233,0.32,5.09,3.97,False,False,False,False,True,True,False,False,23695,8.0,2,2015
3,224218,0.00,0.00,0.00,False,False,True,False,True,True,False,False,15462,3.7,2,2015
4,224219,,,,False,False,False,False,False,True,False,False,15462,3.6,3,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,224266,0.00,0.00,0.00,False,False,False,False,True,True,False,False,15462,6.6,50,2015
100,75282,0.00,0.91,1.27,False,False,False,False,True,True,False,False,23695,8.8,51,2015
101,224267,0.00,0.00,0.00,False,False,False,False,True,True,False,False,15462,6.0,51,2015
102,75283,0.04,0.80,0.59,False,False,False,False,True,True,False,False,23695,8.0,52,2015


In [20]:
# Display lice_data again.
# NOTE(review): the saved output below has a non-sequential index, so it
# presumably comes from a run before the frame's index was reset — re-run the
# notebook top to bottom to confirm.
lice_data

Unnamed: 0,id,avgadultfemalelice,avgmobilelice,avgstationarylice,hasbathtreatment,hascleanerfishdeployed,hasinfeedtreatment,hasmechanicalremoval,hasreportedlice,hassalmonoids,isfallow,isslaughterholdingcage,localityno,seatemperature,week,year
21,75232,0.37,1.25,2.72,False,False,False,False,True,True,False,False,23695,7.0,1,2015
43,75233,0.32,5.09,3.97,False,False,False,False,True,True,False,False,23695,8.0,2,2015
48,75234,0.7,7.83,6.8,False,False,False,False,True,True,False,False,23695,7.7,3,2015
37,75235,0.76,8.01,6.73,False,False,False,False,True,True,False,False,23695,8.5,4,2015
20,75236,1.93,12.43,3.13,False,False,False,False,True,True,False,False,23695,8.2,5,2015
18,75237,3.3,12.23,3.18,False,False,False,False,True,True,False,False,23695,8.5,6,2015
35,75238,0.19,1.12,0.71,False,False,False,False,True,True,False,False,23695,7.7,7,2015
27,75239,0.07,0.55,0.1,False,False,False,False,True,True,False,False,23695,5.2,8,2015
31,75240,0.19,2.28,1.18,False,False,False,False,True,True,False,False,23695,7.0,9,2015
39,75241,0.52,2.9,0.8,False,False,False,False,True,True,False,False,23695,7.0,10,2015


In [None]:
# TODO: unfinished note ("write a ...") — complete it or delete this empty cell.

In [44]:
import json
# Distinct years found in fish_data.
t = fish_data['year'].unique()

# Map each year's position in `t` (as a string key, starting at "0") to the
# year itself as a plain int, e.g. {"0": 2015, "1": 2016}.
year_dict = {str(position): int(year) for position, year in enumerate(t)}
year_dict

{'0': 2015}

In [58]:
# Distinct (year, localityno) pairs in lice_data, as a list of [year, localityno].
# Computed once up front: the de-duplication is the expensive part and does not
# change between iterations.
year_locality_pairs = lice_data[['year', 'localityno']].drop_duplicates().values.tolist()

# Map each pair's position (as a string key, starting at "0") to a dict holding
# its year and locality number as plain ints, e.g.
# {"0": {"year": 2017, "localityno": 24175}}.
year_locality_dict = {
    str(position): {'year': int(year), 'localityno': int(localityno)}
    for position, (year, localityno) in enumerate(year_locality_pairs)
}
year_locality_dict

{'0': {'year': 2017, 'localityno': 24175}}

In [27]:
# Display fish_data.
# NOTE(review): the saved output below shows only 2015 rows ordered by week,
# unlike the first display of fish_data — it presumably reflects a different
# execution state; re-run the notebook top to bottom to confirm.
fish_data

Unnamed: 0,localityweekid,avgadultfemalelice,hascleanerfishdeployed,hasila,hasmechanicalremoval,haspd,hasreportedlice,hassalmonoids,hassubstancetreatments,infilteredselection,...,isonland,isslaughterholdingcage,lat,localityno,lon,municipality,municipalityno,name,week,year
47491,100698,,False,False,False,False,False,True,False,True,...,False,False,61.105118,15456,5.853883,Høyanger,1416,Eidesberget,1,2015
45887,361376,,False,False,False,False,False,True,False,True,...,True,False,69.138664,11333,17.325167,Tranøy,1927,Jøvik,1,2015
61958,149650,,False,False,False,False,False,True,False,True,...,False,False,69.662720,25855,18.177650,Tromsø,1902,Tussøya,1,2015
31333,364834,,False,False,False,False,False,True,False,True,...,False,False,70.215584,33057,22.626583,Alta,2012,Indre Lokkarfjord,1,2015
45910,445432,,False,False,False,False,False,True,False,True,...,True,False,67.916534,13191,15.350374,Steigen,1848,Nordneset,1,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36827,874347,,False,False,False,False,False,False,False,True,...,False,False,59.658268,12837,8.029850,Vinje,834,Vågen Totak,52,2015
22945,825337,,False,False,False,False,False,False,False,True,...,False,False,62.968048,15777,7.775200,Gjemnes,1557,Onsøyholmen,52,2015
22942,418883,,False,False,False,False,False,False,False,True,...,False,False,67.132133,18717,14.312550,Gildeskål,1838,Hammarvika,52,2015
23034,365683,,False,False,False,False,False,True,False,True,...,True,False,61.075619,34157,11.351467,Åmot,429,Kvernen Gård II,52,2015


In [15]:
# Lice measurements to draw, one trace per column.
cols = ['avgadultfemalelice', 'avgmobilelice', 'avgstationarylice']

import plotly.graph_objects as go
import plotly.express as px
import pandas as pd

# Single figure with every column in `cols` plotted against 'week' from
# lice_data, each as a line with markers.
fig = go.Figure()

for col in cols:
    fig.add_trace(
        go.Scatter(
            name=col,
            x=lice_data['week'],
            y=lice_data[col],
            mode='lines+markers',
        )
    )

fig.update_layout(
    title='Lice data',
    xaxis_title='week',
    yaxis_title='Lice count',
)
fig.show()

In [28]:
# Display lice_data once more.
lice_data

Unnamed: 0,id,avgadultfemalelice,avgmobilelice,avgstationarylice,hasbathtreatment,hascleanerfishdeployed,hasinfeedtreatment,hasmechanicalremoval,hasreportedlice,hassalmonoids,isfallow,isslaughterholdingcage,localityno,seatemperature,week,year
0,75232,0.37,1.25,2.72,False,False,False,False,True,True,False,False,23695,7.0,1,2015
1,75233,0.32,5.09,3.97,False,False,False,False,True,True,False,False,23695,8.0,2,2015
2,75234,0.7,7.83,6.8,False,False,False,False,True,True,False,False,23695,7.7,3,2015
3,75235,0.76,8.01,6.73,False,False,False,False,True,True,False,False,23695,8.5,4,2015
4,75236,1.93,12.43,3.13,False,False,False,False,True,True,False,False,23695,8.2,5,2015
5,75237,3.3,12.23,3.18,False,False,False,False,True,True,False,False,23695,8.5,6,2015
6,75238,0.19,1.12,0.71,False,False,False,False,True,True,False,False,23695,7.7,7,2015
7,75239,0.07,0.55,0.1,False,False,False,False,True,True,False,False,23695,5.2,8,2015
8,75240,0.19,2.28,1.18,False,False,False,False,True,True,False,False,23695,7.0,9,2015
9,75241,0.52,2.9,0.8,False,False,False,False,True,True,False,False,23695,7.0,10,2015


In [5]:
# Rows of lice_data for locality number 23695 in the year 2015.
lice_abc = lice_data.loc[
    lice_data['localityno'].eq(23695) & lice_data['year'].eq(2015)
]
lice_abc

Unnamed: 0,id,avgadultfemalelice,avgmobilelice,avgstationarylice,hasbathtreatment,hascleanerfishdeployed,hasinfeedtreatment,hasmechanicalremoval,hasreportedlice,hassalmonoids,isfallow,isslaughterholdingcage,localityno,seatemperature,week,year
0,75232,0.37,1.25,2.72,False,False,False,False,True,True,False,False,23695,7.0,1,2015
4,75233,0.32,5.09,3.97,False,False,False,False,True,True,False,False,23695,8.0,2,2015
8,75234,0.7,7.83,6.8,False,False,False,False,True,True,False,False,23695,7.7,3,2015
9,75235,0.76,8.01,6.73,False,False,False,False,True,True,False,False,23695,8.5,4,2015
14,75236,1.93,12.43,3.13,False,False,False,False,True,True,False,False,23695,8.2,5,2015
16,75237,3.3,12.23,3.18,False,False,False,False,True,True,False,False,23695,8.5,6,2015
18,75238,0.19,1.12,0.71,False,False,False,False,True,True,False,False,23695,7.7,7,2015
23,75239,0.07,0.55,0.1,False,False,False,False,True,True,False,False,23695,5.2,8,2015
26,75240,0.19,2.28,1.18,False,False,False,False,True,True,False,False,23695,7.0,9,2015
28,75241,0.52,2.9,0.8,False,False,False,False,True,True,False,False,23695,7.0,10,2015


In [13]:
import numpy as np
import plotly.express as px

# Line chart (with markers) of 'avgadultfemalelice' per week for lice_abc.
fig = px.line(
    lice_abc,
    x='week',
    y='avgadultfemalelice',
    markers=True,
    labels={'week': 'Week', 'avgadultfemalelice': 'Lice count'},
    title='Average lice count across weeks',
)

# Horizontal x tick labels at every fourth value: 0, 4, ..., 48.
fig.update_xaxes(
    tickmode='array',
    tickvals=np.arange(0, 52, 4),
    tickangle=0,
)
fig.show()

In [38]:
# Per-locality row counts split by 'haspd' value: one row per localityno, one
# column per haspd value, 0 where a combination never occurs. These are raw
# counts (normalize=False), not proportions.
prop = (
    fish_data
    .groupby('localityno')['haspd']
    .value_counts(normalize=False)
    .unstack(fill_value=0)
    .reset_index()
)

In [52]:
# Display the per-locality haspd counts.
prop

haspd,localityno,False,True
0,10027,122,0
1,10029,364,0
2,10037,364,0
3,10039,260,0
4,10040,364,0
...,...,...,...
2558,45153,4,0
2559,45154,4,0
2560,45155,3,0
2561,45156,2,0


In [73]:
# The ten municipalities with the largest share of rows where haspd is True.
# Each row holds the fraction of that municipality's rows with haspd False/True.
top_10 = (
    fish_data
    .groupby("municipality")["haspd"]
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .sort_values(by=True, ascending=False)
    .head(10)
    .reset_index()
)

In [75]:
# Display the top-10 municipalities table.
top_10

haspd,municipality,False,True
0,Surnadal,0.376374,0.623626
1,Stavanger,0.632396,0.367604
2,Osen,0.707358,0.292642
3,Hemne,0.740076,0.259924
4,Stranda,0.754203,0.245797
5,Sandøy,0.761538,0.238462
6,Hitra,0.761918,0.238082
7,Fræna,0.776923,0.223077
8,Frøya,0.784247,0.215753
9,Roan,0.794118,0.205882
