In [1]:
from cassandra.cluster import Cluster
cluster = Cluster(["demo-db-1", "demo-db-2", "demo-db-3"])
cass = cluster.connect()

In [2]:
# cass.execute("create keyspace banking with replication={'class': 'SimpleStrategy', 'replication_factor': 2};")

<cassandra.cluster.ResultSet at 0x7fe4a586d840>

In [3]:
cass.execute("use banking")

<cassandra.cluster.ResultSet at 0x7fe4a5797c70>

In [4]:
cass.execute("drop table if exists loans")

<cassandra.cluster.ResultSet at 0x7fe4a586eb30>

In [5]:
cass.execute("""
CREATE TABLE loans (
    bank_id INT,
    bank_name TEXT STATIC,
    loan_id UUID,
    amount INT,
    state TEXT,
    PRIMARY KEY ((bank_id), amount, loan_id)
) WITH CLUSTERING ORDER BY (amount DESC, loan_id ASC)
""")

<cassandra.cluster.ResultSet at 0x7fe4a586df90>

In [6]:
print(cass.execute("describe table loans").one().create_statement)

CREATE TABLE banking.loans (
    bank_id int,
    amount int,
    loan_id uuid,
    bank_name text static,
    state text,
    PRIMARY KEY (bank_id, amount, loan_id)
) WITH CLUSTERING ORDER BY (amount DESC, loan_id ASC)
    AND additional_write_policy = '99p'
    AND bloom_filter_fp_chance = 0.01
    AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}
    AND cdc = false
    AND comment = ''
    AND compaction = {'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy', 'max_threshold': '32', 'min_threshold': '4'}
    AND compression = {'chunk_length_in_kb': '16', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}
    AND memtable = 'default'
    AND crc_check_chance = 1.0
    AND default_time_to_live = 0
    AND extensions = {}
    AND gc_grace_seconds = 864000
    AND max_index_interval = 2048
    AND memtable_flush_period_in_ms = 0
    AND min_index_interval = 128
    AND read_repair = 'BLOCKING'
    AND speculative_retry = '99p';


In [7]:
# INSERT is really UPSERT (meaning update OR insert)
cass.execute("""
INSERT INTO loans (bank_id, bank_name)
VALUES (544, 'test2')
""")

<cassandra.cluster.ResultSet at 0x7fe4a586e410>

In [8]:
import pandas as pd
pd.DataFrame(cass.execute("select * from loans"))

Unnamed: 0,bank_id,amount,loan_id,bank_name,state
0,544,,,test2,


In [9]:
cass.execute("""
INSERT INTO loans (bank_id, amount, loan_id)
VALUES (544, 300, UUID())
""")

<cassandra.cluster.ResultSet at 0x7fe4a586c070>

In [10]:
pd.DataFrame(cass.execute("select * from loans"))

Unnamed: 0,bank_id,amount,loan_id,bank_name,state
0,544,300,5cad1f07-6fbc-4b89-aee4-c0c7bd223423,test2,


In [11]:
# UUID() and NOW() both return a UUID, but NOW() does better at being unique

In [12]:
cass.execute("""
INSERT INTO loans (bank_id, bank_name, amount, loan_id, state)
VALUES (544, 'mybank', 400, NOW(), 'wi')
""")

<cassandra.cluster.ResultSet at 0x7fe4a46ac460>

In [15]:
pd.DataFrame(cass.execute("select * from loans"))

Unnamed: 0,bank_id,amount,loan_id,bank_name,state
0,544,400,e3993dc0-7e48-11ee-a77b-b9ab7b621b2d,mybank,wi
1,544,300,5cad1f07-6fbc-4b89-aee4-c0c7bd223423,mybank,


In [16]:
cass.execute("""
INSERT INTO loans (bank_id, bank_name, amount, loan_id, state)
VALUES (999, 'uwcu', 500, NOW(), 'il')
""")

<cassandra.cluster.ResultSet at 0x7fe486dc28f0>

In [17]:
pd.DataFrame(cass.execute("select * from loans"))

Unnamed: 0,bank_id,amount,loan_id,bank_name,state
0,544,400,e3993dc0-7e48-11ee-a77b-b9ab7b621b2d,mybank,wi
1,544,300,5cad1f07-6fbc-4b89-aee4-c0c7bd223423,mybank,
2,999,500,c3597500-7e4f-11ee-9ac4-714f42f16ef6,uwcu,il


In [18]:
cass.execute("""
create type FullName (
    first TEXT,
    last TEXT
)
""")

<cassandra.cluster.ResultSet at 0x7fe4a586f310>

In [19]:
# fast, because we have sparse table
cass.execute("""
alter table loans add (username FullName)
""")

<cassandra.cluster.ResultSet at 0x7fe4cc95a620>

In [20]:
cass.execute("""
INSERT INTO loans (bank_id, bank_name, amount, loan_id, username)
VALUES (999, 'uwcu', 500, NOW(), {first: 'Tyler', last: 'Caraza-Harter'})
""")

<cassandra.cluster.ResultSet at 0x7fe486dc2e60>

In [26]:
cass.execute("""
INSERT INTO loans (bank_id, bank_name, amount, loan_id, username)
VALUES (999, 'uwcu', 500, NOW(), {first: 'onlyfirst'})
""")

<cassandra.cluster.ResultSet at 0x7fe4cfb0c940>

In [22]:
pd.DataFrame(cass.execute("select username, username.first, username.last from loans"))

Unnamed: 0,username,username_first,username_last
0,,,
1,,,
2,,,
3,"(Tyler, Caraza-Harter)",Tyler,Caraza-Harter


In [23]:
insert_uwcu = cass.prepare("""
INSERT INTO loans (bank_id, bank_name, amount, loan_id, username)
VALUES (999, 'uwcu', ?, NOW(), {first: ?, last: ?})
""")

In [24]:
cass.execute(insert_uwcu, (301, "TestFirst", "TestLast"))

<cassandra.cluster.ResultSet at 0x7fe4cfb0cd00>

In [27]:
pd.DataFrame(cass.execute("select * from loans"))

Unnamed: 0,bank_id,amount,loan_id,bank_name,state,username
0,544,400,e3993dc0-7e48-11ee-a77b-b9ab7b621b2d,mybank,wi,
1,544,300,5cad1f07-6fbc-4b89-aee4-c0c7bd223423,mybank,,
2,999,500,c3597500-7e4f-11ee-9ac4-714f42f16ef6,uwcu,il,
3,999,500,235599c0-7e50-11ee-8835-a9e81ec18905,uwcu,,"(Tyler, Caraza-Harter)"
4,999,500,b16bcea0-7e50-11ee-8835-a9e81ec18905,uwcu,,"(onlyfirst, None)"
5,999,301,85b948a0-7e50-11ee-a77b-b9ab7b621b2d,uwcu,,"(TestFirst, TestLast)"


In [29]:
pd.DataFrame(cass.execute("select bank_id, bank_name, AVG(amount) from loans GROUP BY bank_id"))

Unnamed: 0,bank_id,bank_name,system_avg_amount
0,544,mybank,350
1,999,uwcu,450


In [31]:
# can only group by partition (Cassandra limitation)
# pd.DataFrame(cass.execute("select state, AVG(amount) from loans GROUP BY state"))

In [32]:
from pyspark.sql import SparkSession
spark = (SparkSession.builder
         .appName("cs544")
         .config('spark.jars.packages', 'com.datastax.spark:spark-cassandra-connector_2.12:3.4.0')
         .config("spark.sql.extensions", "com.datastax.spark.connector.CassandraSparkExtensions")
         .getOrCreate())

:: loading settings :: url = jar:file:/usr/local/lib/python3.10/dist-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
com.datastax.spark#spark-cassandra-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-157b523c-42f1-4b66-9583-f7bca20ab43c;1.0
	confs: [default]
	found com.datastax.spark#spark-cassandra-connector_2.12;3.4.0 in central
	found com.datastax.spark#spark-cassandra-connector-driver_2.12;3.4.0 in central
	found com.datastax.oss#java-driver-core-shaded;4.13.0 in central
	found com.datastax.oss#native-protocol;1.5.0 in central
	found com.datastax.oss#java-driver-shaded-guava;25.1-jre-graal-sub-1 in central
	found com.typesafe#config;1.4.1 in central
	found org.slf4j#slf4j-api;1.7.26 in central
	found io.dropwizard.metrics#metrics-core;4.1.18 in central
	found org.hdrhistogram#HdrHistogram;2.1.12 in central
	found org.reactivestreams#reactive-streams;1.0.3 in central
	found com.github.stephenc.jcip#jcip-annotations;1.0-1 in central
	found com.gith

In [33]:
spark.conf.set("spark.sql.catalog.mycat", "com.datastax.spark.connector.datasource.CassandraCatalog")
spark.conf.set("spark.sql.catalog.mycat.spark.cassandra.connection.host", "demo-db-1,demo-db-2,demo-db-3")

In [35]:
spark.sql("""
SELECT *
FROM mycat.banking.loans
""").toPandas()

                                                                                

Unnamed: 0,bank_id,amount,loan_id,state,username,bank_name
0,999,500,c3597500-7e4f-11ee-9ac4-714f42f16ef6,il,,uwcu
1,999,500,235599c0-7e50-11ee-8835-a9e81ec18905,,"(Tyler, Caraza-Harter)",uwcu
2,999,500,b16bcea0-7e50-11ee-8835-a9e81ec18905,,"(onlyfirst, None)",uwcu
3,999,301,85b948a0-7e50-11ee-a77b-b9ab7b621b2d,,"(TestFirst, TestLast)",uwcu
4,544,400,e3993dc0-7e48-11ee-a77b-b9ab7b621b2d,wi,,mybank
5,544,300,5cad1f07-6fbc-4b89-aee4-c0c7bd223423,,,mybank


In [37]:
spark.sql("""
select state, AVG(amount) 
from mycat.banking.loans
GROUP BY state
""").toPandas()

                                                                                

Unnamed: 0,state,avg(amount)
0,il,500.0
1,,400.25
2,wi,400.0


In [38]:
# Hash Partitioning Example: Not Elastic

In [40]:
import pandas as pd
import string

In [41]:
string.ascii_uppercase

'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

In [50]:
df = pd.DataFrame({"letter": list(string.ascii_uppercase)})
df.head()

Unnamed: 0,letter
0,A
1,B
2,C
3,D
4,E


In [51]:
df["partition-before"] = df["letter"].apply(lambda letter: hash(letter) % 4)
df.head()

Unnamed: 0,letter,partition-before
0,A,0
1,B,3
2,C,1
3,D,0
4,E,3


In [52]:
df["partition-after"] = df["letter"].apply(lambda letter: hash(letter) % 5)
df.head()

Unnamed: 0,letter,partition-before,partition-after
0,A,0,0
1,B,3,4
2,C,1,1
3,D,0,3
4,E,3,3


In [54]:
(df["partition-before"] == df["partition-after"]).mean()

0.38461538461538464