In [2]:
%%cql select * from music.tracks_by_album limit 5

### Create a SQL Context

In [3]:
// Build the SQLContext that the later cells use to read DataFrames.
// `sc` is not defined in this notebook; presumably the kernel supplies the SparkContext.
import org.apache.spark.sql.SQLContext
val sqlContext = new SQLContext(sc)

### Create a DataFrame on a Cassandra table

In [4]:
// Source table for the rest of the notebook: keyspace `music`, table `tracks_by_album`,
// read through the Cassandra data source.
val cassandraSource = Map("keyspace" -> "music", "table" -> "tracks_by_album")
val df = sqlContext.read.format("org.apache.spark.sql.cassandra").options(cassandraSource).load()

### Explain the query plan and view some data

In [5]:
// Print the column names and types of the DataFrame.
df.printSchema()

In [6]:
// Print the query plan Spark will use to read the table.
df.explain()

In [7]:
// Print a sample of rows from the table.
df.show()

In [8]:
// List each distinct album_year value present in the table.
df.select(df("album_year")).distinct.show()

In [9]:
// Count the rows of the table for each album_year.
df.groupBy(df("album_year")).count().show()

### Group By Decade
You can use various Spark SQL functions. Let's use *floor* to round each album year down to the start of its decade.

In [10]:
import org.apache.spark.sql.functions._

In [58]:
// Round album_year down to the first year of its decade (e.g. 1987 -> 1980),
// then count rows per decade. The grouping column is left unnamed and uncast here,
// so the output header is the raw expression and the values print as doubles.
val decadeStart = floor(df("album_year") / 10) * 10
df.groupBy(decadeStart).count().show()


+-------------------------------+-----+
|(FLOOR((album_year / 10)) * 10)|count|
+-------------------------------+-----+
|                         2000.0| 9497|
|                         1950.0|  143|
|                         1960.0| 1616|
|                         1970.0| 4346|
|                         1980.0| 6390|
|                         1990.0|14759|
+-------------------------------+-----+



### Clean it up

In [83]:
// Same decade bucketing as before, but cast to int (so 1990.0 becomes 1990)
// and given the readable column name "decade".
val decade = (floor(df("album_year") / 10) * 10).cast("int").as("decade")
val tmp = df.groupBy(decade).count()
tmp.show()

+------+-----+
|decade|count|
+------+-----+
|  1950|  143|
|  1960| 1616|
|  1970| 4346|
|  1980| 6390|
|  1990|14759|
|  2000| 9497|
+------+-----+



In [84]:
// Rename the generated "count" column to "album_count" so the columns line up with
// the target table steve.albums_by_decade (decade, album_count).
// NOTE(review): the source table is tracks_by_album, so each row is presumably one track
// and this value counts tracks rather than distinct albums -- confirm against the schema.
val count_by_decade = tmp.withColumnRenamed("count", "album_count")
count_by_decade.show()

+------+-----------+
|decade|album_count|
+------+-----------+
|  1950|        143|
|  1960|       1616|
|  1970|       4346|
|  1980|       6390|
|  1990|      14759|
|  2000|       9497|
+------+-----------+



### Save to a new table

In [54]:
%%cql create table if not exists steve.albums_by_decade  (decade int primary key, album_count int)

In [85]:
// Persist the per-decade counts into steve.albums_by_decade.
// The save mode is set explicitly: with no mode the writer uses Spark's default
// (ErrorIfExists), under which the Cassandra data source refuses to write into a table
// that already holds rows, so re-running this cell would fail. "append" keeps the cell
// re-runnable, and since `decade` is the table's primary key, writing the same decade
// again replaces that row instead of adding a duplicate.
val targetTable = Map("table" -> "albums_by_decade", "keyspace" -> "steve")
count_by_decade.write.format("org.apache.spark.sql.cassandra").options(targetTable).mode("append").save()

### Check on it

In [87]:
%%cql select * from steve.albums_by_decade

decade,album_count
1960,1616
1950,143
1990,14759
2000,9497
1970,4346
1980,6390
