In [1]:
from cassandra.cluster import Cluster
# Hostname(s) handed to the driver as the initial contact point(s).
CASSANDRA_CONTACT_POINTS = ["25-cassandra-db-1"]
cluster = Cluster(CASSANDRA_CONTACT_POINTS)
# `cass` is the session every later cell uses to run CQL statements.
cass = cluster.connect()

In [2]:
# cass.execute("QUERY HERE")

In [4]:
# Materialize the result set into a list so every returned row is displayed.
list(cass.execute("describe keyspaces"))

[Row(keyspace_name='banking', type='keyspace', name='banking'),
 Row(keyspace_name='system', type='keyspace', name='system'),
 Row(keyspace_name='system_auth', type='keyspace', name='system_auth'),
 Row(keyspace_name='system_distributed', type='keyspace', name='system_distributed'),
 Row(keyspace_name='system_schema', type='keyspace', name='system_schema'),
 Row(keyspace_name='system_traces', type='keyspace', name='system_traces'),
 Row(keyspace_name='system_views', type='keyspace', name='system_views'),
 Row(keyspace_name='system_virtual_schema', type='keyspace', name='system_virtual_schema')]

In [5]:
import pandas as pd
# Same query as the previous cell, wrapped in a DataFrame so the rows render as a table.
pd.DataFrame(cass.execute("describe keyspaces"))

Unnamed: 0,keyspace_name,type,name
0,banking,keyspace,banking
1,system,keyspace,system
2,system_auth,keyspace,system_auth
3,system_distributed,keyspace,system_distributed
4,system_schema,keyspace,system_schema
5,system_traces,keyspace,system_traces
6,system_views,keyspace,system_views
7,system_virtual_schema,keyspace,system_virtual_schema


In [7]:
# Select `banking` as the session's keyspace; later statements refer to
# `loans` without a keyspace prefix.
cass.execute("use banking")

<cassandra.cluster.ResultSet at 0x7f26e8067310>

In [8]:
# Remove any `loans` table left over from an earlier run so the CREATE TABLE
# in the next cell starts from scratch.
cass.execute("drop table if exists loans")

<cassandra.cluster.ResultSet at 0x7f26e25f1a80>

In [9]:
# TWIST: within a bank's partition, we want the biggest loans first.
# PRIMARY KEY (bank_id, amount, loan_id): bank_id is the partition key, while
# amount and loan_id are clustering columns that order rows inside a partition.
# bank_name is STATIC, i.e. one value shared by every row of a partition.
# Only amount's ordering is specified here; loan_id is left at its default.
cass.execute("""
CREATE TABLE loans (
    bank_id int,
    bank_name text STATIC,
    loan_id UUID,
    amount int,
    state text,
    PRIMARY KEY (bank_id, amount, loan_id)
) WITH CLUSTERING ORDER BY (amount DESC)
""")

<cassandra.cluster.ResultSet at 0x7f26e25f3970>

In [13]:
# DESCRIBE TABLE returns a row whose `create_statement` field holds the full
# DDL, including table options the CREATE TABLE above did not specify.
print(cass.execute("describe table loans").one().create_statement)

CREATE TABLE banking.loans (
    bank_id int,
    amount int,
    loan_id uuid,
    bank_name text static,
    state text,
    PRIMARY KEY (bank_id, amount, loan_id)
) WITH CLUSTERING ORDER BY (amount DESC, loan_id ASC)
    AND additional_write_policy = '99p'
    AND bloom_filter_fp_chance = 0.01
    AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}
    AND cdc = false
    AND comment = ''
    AND compaction = {'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy', 'max_threshold': '32', 'min_threshold': '4'}
    AND compression = {'chunk_length_in_kb': '16', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}
    AND crc_check_chance = 1.0
    AND default_time_to_live = 0
    AND extensions = {}
    AND gc_grace_seconds = 864000
    AND max_index_interval = 2048
    AND memtable_flush_period_in_ms = 0
    AND min_index_interval = 128
    AND read_repair = 'BLOCKING'
    AND speculative_retry = '99p';


In [15]:
# Write only the partition key and the static column; no clustering columns
# (amount, loan_id) are supplied.
cass.execute("""
INSERT INTO loans (bank_id, bank_name) VALUES (544, 'test')
""")

<cassandra.cluster.ResultSet at 0x7f26e2661840>

In [16]:
def show_table():
    """Return every row currently in the `loans` table as a pandas DataFrame."""
    rows = cass.execute("select * from loans")
    return pd.DataFrame(rows)

show_table()

Unnamed: 0,bank_id,amount,loan_id,bank_name,state
0,544,,,test,


In [18]:
cass.execute("""
INSERT INTO loans (bank_id, bank_name) VALUES (544, 'test')
""")
# A Cassandra INSERT is really an "upsert": it inserts the data or overwrites
# what is already there, so repeating the previous insert still leaves one row.
show_table()

Unnamed: 0,bank_id,amount,loan_id,bank_name,state
0,544,,,test,


In [19]:
# This INSERT is expected to fail: loan_id is supplied but the other clustering
# column, `amount`, is missing. The error is caught and printed so that the
# demonstration does not halt a "Restart & Run All".
from cassandra import InvalidRequest

try:
    cass.execute("""
INSERT INTO loans (bank_id, bank_name, loan_id) VALUES (544, 'test', UUID())
""")
except InvalidRequest as err:
    # Prints e.g. `InvalidRequest: Error from server: code=2200 ...`
    print(f"{type(err).__name__}: {err}")

InvalidRequest: Error from server: code=2200 [Invalid query] message="Some clustering keys are missing: amount"

In [20]:
# Supplying both clustering columns creates a regular row; UUID() has
# Cassandra generate the loan_id. bank_name is omitted, yet the displayed row
# still carries the static value 'test' stored for partition 544 earlier.
cass.execute("""
INSERT INTO loans (bank_id, amount, loan_id) VALUES (544, 300, UUID())
""")
show_table()

Unnamed: 0,bank_id,amount,loan_id,bank_name,state
0,544,300,1f737040-e747-4994-83c5-8da1e197e0b8,test,


In [22]:
# NOW() generates the loan_id as a time-based UUID. Writing bank_name here
# replaces the static value for the whole 544 partition, so the earlier 300
# row displays 'mybank' as well. The 400 row is listed first (amount DESC).
cass.execute("""
INSERT INTO loans
(bank_id, bank_name, amount, loan_id, state)
VALUES
(544, 'mybank', 400, NOW(), 'WI')
""")
show_table()

Unnamed: 0,bank_id,amount,loan_id,bank_name,state
0,544,400,421d5ea0-cfd8-11ed-b382-3b27c3155e80,mybank,WI
1,544,300,1f737040-e747-4994-83c5-8da1e197e0b8,mybank,


In [23]:
# First row of a new partition (bank_id 999) with its own static bank_name.
cass.execute("""
INSERT INTO loans
(bank_id, bank_name, amount, loan_id, state)
VALUES
(999, 'bank2', 500, NOW(), 'IL')
""")
show_table()

Unnamed: 0,bank_id,amount,loan_id,bank_name,state
0,544,400,421d5ea0-cfd8-11ed-b382-3b27c3155e80,mybank,WI
1,544,300,1f737040-e747-4994-83c5-8da1e197e0b8,mybank,
2,999,500,8b9cd470-cfd8-11ed-b8c2-d38a619712d0,bank2,IL


# Custom Types

In [24]:
# IF NOT EXISTS keeps this cell re-runnable: the type belongs to the keyspace
# and is not removed by the earlier `drop table if exists loans`, so a plain
# CREATE TYPE fails the second time the notebook is run.
cass.execute("create type if not exists FullName (first text, last text)")

<cassandra.cluster.ResultSet at 0x7f26e262d720>

In [25]:
# Add a `username` column of the user-defined type FullName; rows that already
# exist display it as empty.
cass.execute("alter table loans add (username FullName)")
show_table()

Unnamed: 0,bank_id,amount,loan_id,bank_name,state,username
0,544,400,421d5ea0-cfd8-11ed-b382-3b27c3155e80,mybank,WI,
1,544,300,1f737040-e747-4994-83c5-8da1e197e0b8,mybank,,
2,999,500,8b9cd470-cfd8-11ed-b8c2-d38a619712d0,bank2,IL,


In [28]:
# A user-defined type value is written as a {field: value} literal.
# bank_name is omitted; the new row shows partition 999's static 'bank2'.
cass.execute("""
INSERT INTO loans
(bank_id, amount, loan_id, state, username)
VALUES
(999, 600, NOW(), 'IL', {first: 'Tyler', last: 'Caraza-Harter'})
""")
show_table()

Unnamed: 0,bank_id,amount,loan_id,bank_name,state,username
0,544,400,421d5ea0-cfd8-11ed-b382-3b27c3155e80,mybank,WI,
1,544,300,1f737040-e747-4994-83c5-8da1e197e0b8,mybank,,
2,999,600,f146fbc0-cfd8-11ed-b382-3b27c3155e80,bank2,IL,"(Tyler, Caraza-Harter)"
3,999,500,8b9cd470-cfd8-11ed-b8c2-d38a619712d0,bank2,IL,


In [29]:
# Select the whole UDT value plus its individual fields via dot notation; the
# field columns come back named username_first / username_last.
pd.DataFrame(cass.execute("""
SELECT username, username.first, username.last
FROM loans
"""))

Unnamed: 0,username,username_first,username_last
0,,,
1,,,
2,"(Tyler, Caraza-Harter)",Tyler,Caraza-Harter
3,,,


# Prepared Statements

In [30]:
# Prepare the statement once so it can be executed repeatedly with different
# values. Each `?` is a bind marker filled in at execute time; bank_id is fixed
# at 544 and loan_id is generated by NOW().
insert_544_bank = cass.prepare("""
INSERT INTO loans
(bank_id, amount, loan_id, username)
VALUES
(544, ?, NOW(), {first: ?, last: ?})
""")

In [31]:
# Values bind positionally to the `?` markers: amount, then first and last name.
cass.execute(insert_544_bank, (321, "Tyler", "Caraza-Harter"))

<cassandra.cluster.ResultSet at 0x7f26e2135de0>

In [32]:
# Confirm the prepared insert added the 321 loan to bank 544.
show_table()

Unnamed: 0,bank_id,amount,loan_id,bank_name,state,username
0,544,400,421d5ea0-cfd8-11ed-b382-3b27c3155e80,mybank,WI,
1,544,321,623ab1a0-cfd9-11ed-9182-279d29117f93,mybank,,"(Tyler, Caraza-Harter)"
2,544,300,1f737040-e747-4994-83c5-8da1e197e0b8,mybank,,
3,999,600,f146fbc0-cfd8-11ed-b382-3b27c3155e80,bank2,IL,"(Tyler, Caraza-Harter)"
4,999,500,8b9cd470-cfd8-11ed-b8c2-d38a619712d0,bank2,IL,


# GROUP BY

In [33]:
# Grouping by the partition key gives one AVG(amount) per bank. bank_name is
# static, so each group has a single value for it.
pd.DataFrame(cass.execute("""
SELECT bank_id, bank_name, AVG(amount)
FROM loans
GROUP BY bank_id
"""))

Unnamed: 0,bank_id,bank_name,system_avg_amount
0,544,mybank,340
1,999,bank2,550


In [34]:
# Expected to fail: Cassandra only supports GROUP BY on primary-key columns and
# `state` is not one. The error is caught and printed so that the demonstration
# does not halt a "Restart & Run All".
from cassandra import InvalidRequest

try:
    avg_by_state = pd.DataFrame(cass.execute("""
SELECT state, AVG(amount)
FROM loans
GROUP BY state
"""))
except InvalidRequest as err:
    # Prints e.g. `InvalidRequest: Error from server: code=2200 ...`
    print(f"{type(err).__name__}: {err}")
else:
    # Only reached if the server accepts the query; show the result as before.
    display(avg_by_state)

InvalidRequest: Error from server: code=2200 [Invalid query] message="Group by is currently only supported on the columns of the PRIMARY KEY, got state"

# Spark

In [35]:
from pyspark.sql import SparkSession
# Build (or reuse) the notebook's Spark session, configured step by step.
session_builder = SparkSession.builder.appName("cs544")
# Maven coordinates of the Cassandra connector to load as a Spark package.
session_builder = session_builder.config('spark.jars.packages', 'com.datastax.spark:spark-cassandra-connector_2.12:3.2.0')
# Register the connector's Spark SQL extensions.
session_builder = session_builder.config("spark.sql.extensions", "com.datastax.spark.connector.CassandraSparkExtensions")
spark = session_builder.getOrCreate()

:: loading settings :: url = jar:file:/usr/local/lib/python3.10/dist-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
com.datastax.spark#spark-cassandra-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e5fa0b88-e785-449f-a7a7-5bdbe3601f9c;1.0
	confs: [default]
	found com.datastax.spark#spark-cassandra-connector_2.12;3.2.0 in central
	found com.datastax.spark#spark-cassandra-connector-driver_2.12;3.2.0 in central
	found com.datastax.oss#java-driver-core-shaded;4.13.0 in central
	found com.datastax.oss#native-protocol;1.5.0 in central
	found com.datastax.oss#java-driver-shaded-guava;25.1-jre-graal-sub-1 in central
	found com.typesafe#config;1.4.1 in central
	found org.slf4j#slf4j-api;1.7.26 in central
	found io.dropwizard.metrics#metrics-core;4.1.18 in central
	found org.hdrhistogram#HdrHistogram;2.1.12 in central
	found org.reactivestreams#reactive-streams;1.0.3 in central
	found com.github.stephenc.jcip#jcip-annotations;1.0-1 in central
	found com.gith

### Catalog (like an external Spark table)

In [36]:
# Register a Spark catalog named `mycat` backed by Cassandra and tell it which
# host to connect to. Settings are applied in the order listed.
catalog_settings = {
    "spark.sql.catalog.mycat": "com.datastax.spark.connector.datasource.CassandraCatalog",
    "spark.sql.catalog.mycat.spark.cassandra.connection.host": "25-cassandra-db-1:9042",
}
for setting_name, setting_value in catalog_settings.items():
    spark.conf.set(setting_name, setting_value)

In [39]:
# The listing is empty: tables reachable through the `mycat` catalog are not
# included in a plain SHOW TABLES.
spark.sql("show tables").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [41]:
# Run the GROUP BY state aggregate through Spark SQL against the Cassandra
# catalog table. Cassandra itself rejected this grouping because `state` is not
# a primary-key column.
spark.sql("""
SELECT state, AVG(amount)
FROM mycat.banking.loans
GROUP BY state
""").toPandas()

                                                                                

Unnamed: 0,state,avg(amount)
0,,310.5
1,WI,400.0
2,IL,550.0


In [43]:
# Expose the Cassandra table as a temporary view named `loans` in this Spark session.
spark.table("mycat.banking.loans").createOrReplaceTempView("loans")

In [44]:
# The temp view now appears in the listing (isTemporary = true).
spark.sql("show tables").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|         |    loans|       true|
+---------+---------+-----------+



In [45]:
# Ask Spark to cache the view's data. In the cells that follow, the aggregate
# keeps returning the same averages after new rows are inserted through
# Cassandra, until `refresh table loans` is run.
spark.table("loans").cache()

DataFrame[bank_id: int, amount: int, loan_id: string, state: string, username: struct<first:string,last:string>, bank_name: string]

In [49]:
# Same aggregate as before, run after cache() was requested on the view; the
# averages match the earlier run (only the row order differs).
spark.sql("""
SELECT state, AVG(amount)
FROM mycat.banking.loans
GROUP BY state
""").toPandas()

Unnamed: 0,state,avg(amount)
0,IL,550.0
1,,310.5
2,WI,400.0


In [50]:
# bunch of inserts
cass.execute(insert_544_bank, (321, "Tyler", "Caraza-Harter"))
cass.execute(insert_544_bank, (321, "Tyler", "Caraza-Harter"))
cass.execute(insert_544_bank, (321, "Tyler", "Caraza-Harter"))

<cassandra.cluster.ResultSet at 0x7f26cfae1cc0>

In [51]:
# The three new inserts are not reflected here: the averages are unchanged from
# the run before the inserts (presumably served from the cached data).
spark.sql("""
SELECT state, AVG(amount)
FROM mycat.banking.loans
GROUP BY state
""").toPandas()

Unnamed: 0,state,avg(amount)
0,IL,550.0
1,,310.5
2,WI,400.0


In [52]:
# Refresh the `loans` view so the next query picks up the rows that were
# inserted through Cassandra.
spark.sql("refresh table loans")

DataFrame[]

In [53]:
# After the refresh, the average for rows without a state reflects the new 321
# loans (316.8 instead of 310.5).
spark.sql("""
SELECT state, AVG(amount)
FROM mycat.banking.loans
GROUP BY state
""").toPandas()

Unnamed: 0,state,avg(amount)
0,,316.8
1,WI,400.0
2,IL,550.0
