In [1]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [2]:
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
import uuid
from os.path import expanduser
from re import findall, compile
from pandas import read_csv, merge, to_datetime, DataFrame

In [3]:
# Every input file (dataset archive, Astra bundle, token CSV) is read from
# the current user's Downloads folder.
home = expanduser('~')
mypath = home + '/Downloads/'

## Connect to DataStax

In [4]:
# Location of the secure-connect bundle handed to the driver's `cloud` option.
cloud_config = {'secure_connect_bundle': mypath + 'secure-connect-bigdataproject2022.zip'}

# The credentials are read from the generated-token CSV rather than being
# hard-coded in the notebook.
token_path = mypath + 'GeneratedToken.csv'
try:
    # open() itself is what raises FileNotFoundError, so it must sit inside the try.
    with open(token_path, 'r', encoding='utf-8') as file:
        content = file.readlines()
except FileNotFoundError as err:
    raise FileNotFoundError(f'File not found: {token_path}') from err

# The first line is skipped (presumably a CSV header); the second line holds
# the double-quoted fields, of which the first two are used.
if len(content) < 2:
    raise ValueError(f'{token_path} has no data line to read the token from')

tokens = findall(r'"(.*?)"', content[1])
if len(tokens) < 2:
    raise ValueError(f'{token_path} does not contain a quoted client id and client secret')

client_id = tokens[0]
client_secret = tokens[1]

In [5]:
# Authenticate with the token pair read above and open a session bound to
# the 'bigdata' keyspace.
auth_provider = PlainTextAuthProvider(username=client_id, password=client_secret)
cluster = Cluster(auth_provider=auth_provider, cloud=cloud_config)
session = cluster.connect(keyspace='bigdata')

## Q1

### Preprocess and denormalize data for Q1

In [6]:
# Load the movie catalogue, then show its dimensions and first rows.
movies = read_csv(filepath_or_buffer=mypath+'archive/movie.csv')
print(movies.shape)
movies.head(5)

(27278, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# Parse `timestamp` while loading: the Q1 insert reads `row.timestamp.year`,
# which needs real datetimes rather than the plain strings read_csv yields
# by default.
ratings = read_csv(
    mypath+'archive/rating.csv',
    usecols=['movieId', 'rating', 'timestamp'],
    parse_dates=['timestamp'],
)
print(ratings.shape)
ratings.head()

(20000263, 3)


Unnamed: 0,movieId,rating,timestamp
0,2,3.5,2005-04-02 23:53:47
1,29,3.5,2005-04-02 23:31:16
2,32,3.5,2005-04-02 23:33:39
3,47,3.5,2005-04-02 23:32:07
4,50,3.5,2005-04-02 23:29:40


In [8]:
# Open question: would grouping the ratings first be better?
# Denormalize: attach the movie title/genres to every individual rating.
movies_ratings = merge(movies, ratings, on='movieId')
print(movies_ratings.shape)
movies_ratings.head(5)

(20000263, 5)


Unnamed: 0,movieId,title,genres,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.5,2009-01-02 01:13:41


### Insert data into table movies_by_date

In [9]:
def insert_into_movies_by_date(df, n):
    """Insert every ``n``-th row of ``df`` into the ``movies_by_date`` table (Q1).

    Parameters
    ----------
    df : DataFrame
        Must provide the columns ``timestamp``, ``movieId``, ``rating`` and
        ``title``. ``timestamp`` may hold datetimes or their string form.
    n : int
        Sampling step: rows ``0, n, 2n, ...`` are inserted (``1`` inserts all).

    Returns
    -------
    bool
        ``True`` when every selected row was inserted (also when nothing was
        selected); ``False`` as soon as one insert fails.
    """
    # One prepared statement is reused for all rows; the values are bound by
    # the driver instead of being spliced into the CQL text.
    insert_stmt = session.prepare(
        "INSERT INTO movies_by_date(year, timestamp, movieid, rating, title) "
        "VALUES(?, ?, ?, ?, ?)"
    )
    for index, row in df[0::n].iterrows():
        # year | timestamp | movieid | rating | title
        rated_at = to_datetime(row.timestamp)
        # Apostrophes are still dropped from the title, exactly as before.
        title = row.title.replace("'", "")
        params = (
            int(rated_at.year),
            rated_at.to_pydatetime(),
            int(row.movieId),
            float(row.rating),
            title,
        )
        try:
            session.execute(insert_stmt, params)
        except Exception as exc:
            # Report the failure instead of swallowing it silently.
            print(f"Insert into movies_by_date failed at row {index}: {exc}")
            return False
    # Returning only after the loop ensures all selected rows are written,
    # not just the first one.
    return True

### Retrieve data from movies_by_date

In [10]:
def select_from_movies_by_date(start_date, end_date):
    """Print title and rating of every row rated between two dates (Q1).

    Parameters
    ----------
    start_date, end_date
        Inclusive bounds; anything ``pandas.to_datetime`` accepts.

    Notes
    -----
    The query filters on a single ``year`` value, so a range that spans
    several calendar years is answered with one query per year. Nothing is
    printed when ``start_date`` falls in a later year than ``end_date``.
    """
    Q1 = "SELECT * FROM movies_by_date WHERE year=? and timestamp >= ? and timestamp <= ?;"
    prepared = session.prepare(Q1)
    sd = to_datetime(start_date)
    ed = to_datetime(end_date)
    # The same timestamp bounds are valid for every year: within one year's
    # rows they simply clip to the part of the range that lies in that year.
    for year in range(sd.year, ed.year + 1):
        rows = session.execute(prepared, (year, sd, ed))
        for row in rows:
            print(f"Title: {row.title} Rating: {row.rating:.1f}")

In [11]:
# Uncomment to (re)load the table before querying:
# insert_into_movies_by_date(movies_ratings, 1)
select_from_movies_by_date(start_date='2013-01-01', end_date='2013-02-01')

Title: Dark Knight, The (2008) Rating: 5.0
Title: Beasts of the Southern Wild (2012) Rating: 3.0
Title: Rise of the Planet of the Apes (2011) Rating: 3.0
Title: Me and You and Everyone We Know (2005) Rating: 4.5
Title: Matrix Revolutions, The (2003) Rating: 3.0
Title: Fight Club (1999) Rating: 3.5
Title: Double Jeopardy (1999) Rating: 3.0
Title: Hannibal Rising (2007) Rating: 4.0
Title: Enduring Love (2004) Rating: 3.5
Title: Vegas Vacation (National Lampoons Las Vegas Vacation) (1997) Rating: 3.0
Title: Nightmare on Elm Street, A (1984) Rating: 4.0
Title: Good Morning, Vietnam (1987) Rating: 3.5
Title: Scarface (1983) Rating: 4.0
Title: Big Night (1996) Rating: 3.0
Title: Best Exotic Marigold Hotel, The (2011) Rating: 3.5


## Q2

### Preprocess and denormalize data for Q2

In [12]:
# Only the movie id and the tag text are needed for Q2.
tags = read_csv(filepath_or_buffer=mypath+'archive/tag.csv', usecols=['movieId', 'tag'])
print(tags.shape)
tags.head(5)

(465564, 2)


Unnamed: 0,movieId,tag
0,4141,Mark Waters
1,208,dark hero
2,353,dark hero
3,521,noir thriller
4,592,dark hero


In [13]:
# Collapse the tag table to one row per movie, holding all of its tags
# (duplicates included) in a single list column named 'tags'.
movies_tag_list = (
    tags
    .groupby(by='movieId')['tag']
    .apply(list)
    .reset_index(name='tags')
)
print(movies_tag_list.shape)
movies_tag_list.head(5)

(19545, 2)


Unnamed: 0,movieId,tags
0,1,"[Watched, computer animation, Disney animated ..."
1,2,"[time travel, adapted from:book, board game, c..."
2,3,"[old people that is actually funny, sequel fev..."
3,4,"[chick flick, revenge, characters, chick flick..."
4,5,"[Diane Keaton, family, sequel, Steve Martin, w..."


In [14]:
# Attach each movie's tag list to every one of its ratings; the timestamp
# column is left out by slicing the columns 'movieId' through 'rating'.
ratings_without_timestamp = movies_ratings.loc[:, 'movieId':'rating']
movies_ratings_tags = merge(movies_tag_list, ratings_without_timestamp, on='movieId')
print(movies_ratings_tags.shape)
movies_ratings_tags.head(5)

(19854570, 5)


Unnamed: 0,movieId,tags,title,genres,rating
0,1,"[Watched, computer animation, Disney animated ...",Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0
1,1,"[Watched, computer animation, Disney animated ...",Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0
2,1,"[Watched, computer animation, Disney animated ...",Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0
3,1,"[Watched, computer animation, Disney animated ...",Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0
4,1,"[Watched, computer animation, Disney animated ...",Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.5


In [15]:
def insert_into_movies_by_title(df, n=5000):
    """Insert every ``n``-th row of ``df`` into the ``movies_by_title`` table (Q2).

    Parameters
    ----------
    df : DataFrame
        Must provide the columns ``title``, ``movieId``, ``genres``,
        ``rating`` and ``tags`` (an iterable of tag values per row).
    n : int, default 5000
        Sampling step: rows ``0, n, 2n, ...`` are inserted (``1`` inserts all).

    Returns
    -------
    bool
        ``True`` when every selected row was inserted (also when nothing was
        selected); ``False`` as soon as one insert fails.
    """
    # Values are bound to a prepared statement, so tag text and titles can
    # no longer break or alter the CQL.
    insert_stmt = session.prepare(
        "INSERT INTO movies_by_title(title, movieid, id, genre, rating, tags) "
        "VALUES(?, ?, ?, ?, ?, ?)"
    )
    sep = '('
    for index, row in df[0::n].iterrows():
        #  title | movieid | id | genre | rating | tags
        # Drop the last 7 characters (" (YYYY)" in the sample data), remove
        # apostrophes, and keep only the text before the first '('.
        # NOTE(review): a title that starts with '(' ends up empty - confirm
        # whether that matters for the dataset.
        title = row.title[:-7].replace("'", "").split(sep, 1)[0]

        # Count how often each tag occurs for this movie; empty tags are skipped.
        tags = {}
        for tag in row.tags:
            t = str(tag).replace("'", "")
            if t == '':
                continue
            tags[t] = tags.get(t, 0) + 1

        params = (
            title,
            int(row.movieId),
            uuid.uuid4(),  # random id, so each inserted rating gets its own row
            row.genres,
            float(row.rating),
            tags,
        )
        try:
            session.execute(insert_stmt, params)
        except Exception as exc:
            # Report the failure instead of swallowing it silently.
            print(f"Insert into movies_by_title failed at row {index}: {exc}")
            return False
    # Returning only after the loop ensures all selected rows are written,
    # not just the first one.
    return True

In [16]:
def select_from_movies_by_title(session, title):
    """Print average rating, genre and the five most frequent tags of a movie (Q2).

    Parameters
    ----------
    session
        Object exposing ``prepare`` and ``execute`` (the connected Cassandra
        session).
    title : str
        Movie title to look up. Apostrophes are removed before the lookup
        because ``insert_into_movies_by_title`` stores titles without them.
    """
    # The title is bound as a parameter, so quotes in it cannot break the query.
    Q2 = "SELECT title, genre, tags, avg(rating) as avg FROM movies_by_title WHERE title=? GROUP BY movieid;"
    prepared = session.prepare(Q2)
    rows = session.execute(prepared, (title.replace("'", ""),))
    for row in rows:
        # A row without tags comes back as None rather than an empty map.
        tag_counts = row.tags or {}
        sorted_tags = sorted(tag_counts.items(), key=lambda x: x[1], reverse=True)
        print(f"Title: {row.title} Rating: {row.avg:.1f} Genre: {row.genre} Most popular tags: {sorted_tags[:5]}")

In [17]:
# Uncomment to (re)load the table before querying:
# insert_into_movies_by_title(movies_ratings_tags, 1)
select_from_movies_by_title(session=session, title="Jumanji")

Title: Jumanji Rating: 3.3 Genre: Adventure|Children|Fantasy Most popular tags: [('Robin Williams', 20), ('fantasy', 11), ('time travel', 11), ('animals', 9), ('board game', 7)]


## Q3

### Insert data into movies_by_genre

In [18]:
def insert_into_movies_by_genre(df, n):
    """Insert every ``n``-th row of ``df`` into the ``movies_by_genre`` table (Q3).

    Parameters
    ----------
    df : DataFrame
        Must provide the columns ``genres``, ``title``, ``movieId`` and
        ``rating``. The release year is taken from the last parenthesised
        number in ``title``.
    n : int
        Sampling step: rows ``0, n, 2n, ...`` are inserted (``1`` inserts all).

    Rows whose title carries no parenthesised number are skipped with a
    message, because no release year can be derived for them.
    """
    pattern = compile(r"\((\d+)\)")
    sep = '('
    # Values are bound to a prepared statement instead of being spliced into
    # the CQL text.
    insert_stmt = session.prepare(
        "INSERT INTO movies_by_genre(genre, release_date, movieid, rating, id, title) "
        "VALUES(?, ?, ?, ?, ?, ?)"
    )
    for index, row in df[0::n].iterrows():
        #  genre | release_date | movieid | rating | id | title
        years = pattern.findall(row.title)
        if not years:
            print(f"Skipping movie {row.movieId}: no release year in title {row.title!r}")
            continue
        release_date = int(years[-1])
        # Slicing off the last 6 characters leaves the blank that preceded
        # "(YYYY)", so the result is stripped to avoid a trailing space.
        # NOTE(review): the whole `genres` string is stored as the genre, so
        # a lookup matches only movies whose genre string is exactly the one
        # searched for - confirm this is intended.
        title = row.title[:-6].replace("'", "").split(sep, 1)[0].strip()
        params = (
            row.genres,
            release_date,
            int(row.movieId),
            float(row.rating),
            uuid.uuid4(),  # random id, so each inserted rating gets its own row
            title,
        )
        session.execute(insert_stmt, params)

### Retrieve data from movies_by_genre

In [20]:
def select_from_movies_by_genre(genre, limit=10):
    """Print title, average rating and release year of movies in a genre (Q3).

    Parameters
    ----------
    genre : str
        Value compared for equality with the stored ``genre`` column.
    limit : int, default 10
        Maximum number of rows to return, ordered by ascending release date.
    """
    # genre and limit are bind markers, so the statement text is the same for
    # every call and caller input never reaches the CQL text.
    Q3 = "SELECT title, avg(rating) AS avg, release_date FROM movies_by_genre WHERE genre=? GROUP BY release_date, movieid ORDER BY release_date ASC LIMIT ?;"
    prepared = session.prepare(Q3)
    rows = session.execute(prepared, (genre, int(limit)))
    for row in rows:
        print(f"Title: {row.title} Rating: {row.avg:.1f} Release Date: {row.release_date}")
    

In [21]:
# Uncomment to (re)load the table before querying:
# insert_into_movies_by_genre(movies_ratings, 1)
select_from_movies_by_genre(genre='Adventure')

Title: Treasure Island  Rating: 4.0 Release Date: 1934
Title: Wee Willie Winkie  Rating: 4.0 Release Date: 1937
Title: If I Were King  Rating: 3.5 Release Date: 1938
Title: Adventures of Tom Sawyer, The  Rating: 3.0 Release Date: 1938
Title: Man in the Iron Mask, The  Rating: 3.2 Release Date: 1939
Title: Mark of Zorro, The  Rating: 3.0 Release Date: 1940
Title: Black Swan, The  Rating: 3.2 Release Date: 1942
Title: Prisoner of Zenda, The  Rating: 4.0 Release Date: 1952
Title: Snows of Kilimanjaro, The  Rating: 3.0 Release Date: 1952
Title: Macao  Rating: 3.8 Release Date: 1952
