# Lesson 3 Demo 2: Focus on Primary Key

In [1]:
# Import Apache Cassandra

import cassandra


## Let's create a connection to the database


In [2]:
from cassandra.cluster import Cluster

# Open a connection to Apache Cassandra and start a session.
# The contact point below targets a locally installed instance.
contact_points = ['127.0.0.1']

try:
    cluster = Cluster(contact_points)
    session = cluster.connect()
except Exception as err:
    # Report the failure instead of stopping the notebook.
    print(err)

### Let's create a keyspace to do our work in


In [3]:
# CQL that creates the `udacity` keyspace (if it is missing) using
# SimpleStrategy with a replication factor of 1.
create_keyspace_cql = """
    create keyspace if not exists udacity
    with replication =
    { 'class' : 'SimpleStrategy', 'replication_factor' : 1}"""

try:
    session.execute(create_keyspace_cql)
except Exception as err:
    # Report the failure instead of stopping the notebook.
    print(err)

### Connect to the keyspace. Compare this to how we had to create a new session in PostgreSQL.

In [4]:
# Point the session at the `udacity` keyspace so later statements can
# refer to tables without qualifying them.
keyspace_name = 'udacity'

try:
    session.set_keyspace(keyspace_name)
except Exception as err:
    # Report the failure instead of stopping the notebook.
    print(err)


##### Our query: in this case, I would like to be able to get every album that was released in a particular year.

```bash
select * from music_library WHERE year=1970
```

###### How should we model this data? What should be our Primary Key and Partition Key? Since our query is looking for the __YEAR__, let's start with that. Is partitioning our data by year a good idea? In this case our data is very small, but if we had a larger data set of albums, partitioning by **YEAR** might be a fine choice. We would need to validate this against our dataset: we want an even spread of the data across partitions.


In [5]:
# First attempt at the table: `year` alone is the primary key.
# The two adjacent literals are joined into a single CQL statement.
query = (
    "CREATE TABLE IF NOT EXISTS music_library"
    "(year int, artist_name text, album_name text, city text, PRIMARY KEY (year))"
)

try:
    session.execute(query)
except Exception as err:
    # Report the failure instead of stopping the notebook.
    print(err)


### Let's insert some data into the table



In [6]:
# Parameterised insert for `music_library`; the driver fills in each %s
# placeholder from the tuple passed alongside the statement.
query = (
    "INSERT INTO music_library (year, artist_name, album_name, city)"
    "VALUES (%s, %s, %s, %s)"
)

# Rows to insert, in order: (year, artist_name, album_name, city).
albums = (
    (1965, "The Beatles", "Rubber Soul", "Oxford"),
    (1970, "The Beatles", "Let it Be", "Liverpool"),
    (1965, "The Who", "My Generation", "London"),
    (1966, "The Monkees", "The Monkees", "Los Angeles"),
    (1970, "The Carpenters", "Close To You", "San Diego"),
)

for album in albums:
    # Each insert is attempted independently; a failure is printed and the
    # remaining rows are still tried.
    try:
        session.execute(query, album)
    except Exception as err:
        print(err)


### Validate if the data was inserted 



In [7]:
# Read back every row stored for 1965 from `music_library`.
query = 'SELECT * from music_library where year=1965'

try:
    rows = session.execute(query)
except Exception as e:
    print(e)
else:
    # Only iterate when the query succeeded; otherwise `rows` was never
    # assigned here and looping over it would raise a NameError.
    for row in rows:  # the for loop is for printing, it will not be required if executing in cqlsh
        print(row.year, row.album_name, row.artist_name)

1965 My Generation The Who


###### We were expecting two records but only one was returned. Why is that? Because we didn't create a unique primary key.

#### Let's try again and focus on making the PK unique. Looking at the dataset, do we have anything that is unique for each row? We have a couple of options (City and Album Name), but on their own they will not support the query we need, which looks for albums in a particular year. Let's make a composite key of the YEAR AND ALBUM NAME. This assumes that an album name is unique within the year it was released (not a bad bet). But remember this is just a demo; you will need to understand your dataset fully (no betting).

In [8]:
# Second attempt at the table: the primary key is the composite
# (year, album_name) instead of `year` alone.
query = (
    "CREATE TABLE IF NOT EXISTS music_library1"
    "(year int, artist_name text, album_name text, city text, PRIMARY KEY (year, album_name))"
)

try:
    session.execute(query)
except Exception as err:
    # Report the failure instead of stopping the notebook.
    print(err)


### Let's insert the same data into the new table



In [9]:
# Parameterised insert for `music_library1`; the driver fills in each %s
# placeholder from the tuple passed alongside the statement.
query = (
    "INSERT INTO music_library1 (year, artist_name, album_name, city)"
    "VALUES (%s, %s, %s, %s)"
)

# Rows to insert, in order: (year, artist_name, album_name, city).
albums = (
    (1965, "The Beatles", "Rubber Soul", "Oxford"),
    (1970, "The Beatles", "Let it Be", "Liverpool"),
    (1965, "The Who", "My Generation", "London"),
    (1966, "The Monkees", "The Monkees", "Los Angeles"),
    (1970, "The Carpenters", "Close To You", "San Diego"),
)

for album in albums:
    # Each insert is attempted independently; a failure is printed and the
    # remaining rows are still tried.
    try:
        session.execute(query, album)
    except Exception as err:
        print(err)


### Let's Validate our data model with our original query

`select * from music_library1 WHERE year=1965`

In [10]:
# Read back every row stored for 1965 from `music_library1`.
query = "select * from music_library1 WHERE year=1965"

try:
    rows = session.execute(query)
except Exception as e:
    print(e)
else:
    # Only iterate when the query succeeded; otherwise `rows` would be
    # unbound, or still hold the result of an earlier query, and the
    # printed output would be wrong or raise a NameError.
    for row in rows:  # the for loop is for printing, it will not be required if executing in cqlsh
        print(row.year, row.artist_name, row.album_name)

1965 The Who My Generation
1965 The Beatles Rubber Soul


#### It worked! We created a unique PK that evenly distributed our data

### For the sake of the demo, let's drop the table

In [11]:
# Drop both demo tables. Each drop is attempted independently so that a
# failure on the first table (printed below) does not prevent the second
# from being dropped.
for table_name in ("music_library", "music_library1"):
    query = "drop table " + table_name

    try:
        # The result of a DROP is not needed, so it is not bound to a name
        # (binding it to `rows` would only overwrite earlier query results).
        session.execute(query)
    except Exception as e:
        print(e)

### And Finally close the session and cluster connection

In [12]:
# Close the session first, then the cluster connection. The `finally`
# guarantees the cluster is shut down even if closing the session raises.
try:
    session.shutdown()
finally:
    cluster.shutdown()