# Apache Cassandra

## Insertando datos de Twitter

## Creacción del KeySpace

In [45]:
%load_ext cql

The cql extension is already loaded. To reload it, use:
  %reload_ext cql


In [46]:
%%cql
DROP KEYSPACE twitter;

'No results.'

In [47]:
%%cql
CREATE KEYSPACE twitter 
WITH replication = {'class':'SimpleStrategy', 'replication_factor': 1};

'No results.'

## Uso del KeySpace

Se utiliza USE para cambiar el keyspace por defecto


In [48]:
%cql USE twitter;

'No results.'

## Creacción de tablas

* Los Keyspaces contienen tablas
* Las tablas contienen datos

In [49]:
%%cql
CREATE TABLE users (
    screen_name text,
    created_at timestamp,
    id_str text,
    name text,
    description text,
    profile_image_url text,
    time_zone text,
    geo_enabled boolean,
    favourites_count int,
    followers_count int,
    friends_count int,
    statuses_count int,
    PRIMARY KEY (screen_name)
);

'No results.'

In [50]:
%%cql
CREATE TYPE geo (
  type text,
  coordinates list<float>
)

'No results.'

In [51]:
%%cql
CREATE TABLE tweets (
    id text,
    created_at timestamp,
    entities list<text>,
    favorite_count int,
    retweet_count int,
    source text,
    lang text,
    text text,
    geo  frozen<geo>,
    user_mentions list<text>,
    user text,
    retweet_id text,
    PRIMARY KEY (id)
);

'No results.'

In [52]:
%%cql
CREATE INDEX tweets_hashtag ON tweets(entities)

'No results.'

In [53]:
%%cql
CREATE INDEX tweets_user ON tweets(user)

'No results.'

In [54]:
%%cql
CREATE TABLE tweets_by_user (
    user text,
    created_at timestamp,
    lang text,
    text text,
    id text,
    retweet_id text,
    PRIMARY KEY ((user), created_at)
)
WITH CLUSTERING ORDER BY (created_at DESC);

'No results.'

In [55]:
%%cql
CREATE TABLE tweets_by_hashtag (
    hashtag text,
    created_at timestamp,
    lang text,
    text text,
    user text,
    id text,
    retweet_id text,
    PRIMARY KEY ((hashtag), created_at)
)
WITH CLUSTERING ORDER BY (created_at DESC);

'No results.'

In [56]:
%%cql
CREATE TABLE hashtags (
    hashtag text,
    count counter,
    PRIMARY KEY (hashtag)
);

'No results.'

![png](./images/Cassandra%20Model.png)

## Inserción de la información

In [57]:
from cassandra.cluster import Cluster, BatchStatement, ConsistencyLevel
cluster = Cluster()
session = cluster.connect('twitter')

In [58]:
from datetime import datetime
import dateutil.parser

In [59]:
from collections import namedtuple

Geo = namedtuple('geo', ('type', 'coordinates'))
cluster.register_user_type("twitter", "geo", Geo)

In [60]:
def insert_tweet(tweet_json):
    geo = None
    retweet_id = None
    
    created_at = dateutil.parser.parse(tweet_json["created_at"])
    user = tweet_json["user"]["screen_name"]
    
    if "retweet_status" in tweet_json: retweet_id = tweet_json["retweet_status"]["id_str"]
    if "geo" in tweet_json: Geo(tweet_json["geo"]["type"], tweet_json["geo"]["coordinates"])
          
    session.execute(
"""
INSERT INTO tweets (
id,
created_at,
entities,
favorite_count,
retweet_count,
source,
lang,
text,
geo,
user_mentions,
user,
retweet_id
) VALUES (
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s
)
"""
    , [
        tweet_json["id_str"],
        created_at,
        tweet_json["entities"],
        tweet_json["favorite_count"],
        tweet_json["retweet_count"],
        tweet_json["source"],
        tweet_json["lang"],
        tweet_json["text"],
        geo,
        tweet_json["user_mentions"],
        user,
        retweet_id
    ])
    
    
    #Timeline del usuario
    session.execute(
"""
INSERT INTO tweets_by_user (
    user,
    created_at,
    lang,
    text,
    id,
    retweet_id
) VALUES (
%s,
%s,
%s,
%s,
%s,
%s
)
"""
    , [
        user,
        created_at,
        tweet_json["lang"],
        tweet_json["text"],
        tweet_json["id_str"],
        retweet_id
    ])
    
   

    #Tweets por tag
    for hashtag in tweet_json["entities"]:
        
        session.execute(
"""
INSERT INTO tweets_by_hashtag (
    hashtag,
    created_at,
    lang,
    text,
    user,
    id,
    retweet_id
) VALUES (
%s,
%s,
%s,
%s,
%s,
%s,
%s
)
"""
        , [
            hashtag,
            created_at,
            tweet_json["lang"],
            tweet_json["text"],
            user,
            tweet_json["id_str"],
            retweet_id
        ])

In [61]:
def insert_hashtag(hashtag):
    session.execute("UPDATE hashtags SET count = count + 1 WHERE hashtag = %s", [hashtag])

In [62]:
def insert_user(user_json):
    session.execute("INSERT INTO users (screen_name, name, id_str) VALUES (%s, %s, %s)", [
            user_json["screen_name"],
            user_json["name"],
            user_json["id_str"]
            ])

In [63]:
def insert_user_mencion(user_json):
    session.execute("UPDATE users SET name = %s, id_str=%s WHERE screen_name = %s", [
            user_json["name"],
            user_json["id_str"],
            user_json["screen_name"]
            ])

In [64]:
def insert_user(user_json):
    
    time_zone = None;
    created_at = dateutil.parser.parse(user_json["created_at"])
    if "time_zone" in tweet_json: time_zone = tweet_json["time_zone"]
    
         
    session.execute(
"""
INSERT INTO users (
screen_name,
created_at,
id_str,
name,
description,
profile_image_url,
time_zone,
geo_enabled,
favourites_count,
followers_count,
friends_count,
statuses_count
) VALUES (
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s
)
""", [       
        user_json["screen_name"],
        created_at,
        user_json["id_str"],
        user_json["name"],
        user_json["description"],
        user_json["profile_image_url"],
        time_zone,
        user_json["geo_enabled"],
        user_json["favourites_count"],
        user_json["followers_count"],
        user_json["friends_count"],
        user_json["statuses_count"]
    ])
            

In [65]:
def parse_tweet(tweet_json):
   
    # Simplificamos la estructura user_mentions a un array de nombres de usuario
    user_mencioned_list = []
    for user_mencioned in tweet_json['user_mentions']:
        user_mencioned_list.append(user_mencioned["screen_name"])
        insert_user_mencion(user_mencioned)
        
    tweet_json['user_mentions'] = user_mencioned_list
                 
    for entity in tweet_json['entities']:
        insert_hashtag(entity)
    
    insert_tweet(tweet_json)
    
    user_json = tweet_json['user']
    insert_user(user_json)
    
    #En el caso de que el Tweet tenga un Tweet padre (retweet) lo almenamos como documento independiente 
    if 'retweeted_status' in tweet_json:
        parse_tweet(tweet_json['retweeted_status'])


In [66]:
import json
from pprintpp import pprint as pp
import sys

tweets_data_path = '../data/tweets.json'

tweets_file = open(tweets_data_path, "r")
for line in tweets_file:
    tweet_json = json.loads(line)
    parse_tweet(tweet_json)


## Querys

In [67]:
print "Users", session.execute("SELECT count(*) from users")[0].count
print "Tweets",session.execute("SELECT count(*) from tweets")[0].count
print "Tweets por Usuario",session.execute("SELECT count(*) from tweets_by_user")[0].count
print "Tweets por Hashtag",session.execute("SELECT count(*) from tweets_by_hashtag")[0].count
print "Hashtags",session.execute("SELECT count(*) from hashtags")[0].count

Users 1387
Tweets 2800
Tweets por Usuario 2768
Tweets por Hashtag 9075
Hashtags 629




## Consulta de usuarios

In [68]:
%%cql
SELECT * from users
WHERE screen_name IN  ( 'Grandite', 'pinaldave')

screen_name,created_at,description,favourites_count,followers_count,friends_count,geo_enabled,id_str,name,profile_image_url,statuses_count,time_zone
Grandite,2008-11-28 19:24:48,"Professional Business Process, Data and UML Modeling Tools",0,369,127,False,17717256,Grandite,http://pbs.twimg.com/profile_images/1373004005/G_2400dpi_at_red-tr_normal.png,8468,
pinaldave,2007-05-14 02:07:41,"SQL, SQL Server, MySQL, Big Data and http://t.co/AdcHKNHeDc, Independent Consultant",0,14280,555,False,6020572,Pinal Dave,http://pbs.twimg.com/profile_images/2741455121/a3b2ea7afacceea3098a3bd2fe11a082_normal.png,44873,


## Tweets de un determinado usuario

In [69]:
%%cql
SELECT * from tweets
WHERE user = 'Grandite'
LIMIT 2

id,created_at,entities,favorite_count,geo,lang,retweet_count,retweet_id,source,text,user,user_mentions
658043028339302400,2015-10-24 22:10:58,,0,,en,0,,"<a href=""https://about.twitter.com/products/tweetdeck"" rel=""nofollow"">TweetDeck</a>",.@Grandite's Not Only NoSQL https://t.co/lyVRzT59LD featuring @AxelTroike @Rick345 @DataStax @not_only_NoSQL,Grandite,"[u'Grandite', u'AxelTroike', u'Rick345', u'DataStax', u'not_only_NoSQL']"
657584598008770560,2015-10-23 15:49:20,,0,,en,0,,"<a href=""https://about.twitter.com/products/tweetdeck"" rel=""nofollow"">TweetDeck</a>",.@Grandite's Not Only NoSQL https://t.co/2YuydUtS6F featuring @not_only_NoSQL @Rick345 @craigmullins,Grandite,"[u'Grandite', u'not_only_NoSQL', u'Rick345', u'craigmullins']"


## Hashtags mas populares

In [70]:
result_cursor = session.execute("select * from hashtags")

def getCount(item):
    return item.count

for row in sorted(result_cursor, key=getCount, reverse=True)[0:10]:
    print row.hashtag, "-", row.count

NoSQL - 1830
BigData - 843
Java - 794
MongoDB - 776
SoapUi - 756
Hadoop - 703
nosql - 606
hive - 473
bigdata - 268
MongoDb - 171


## Timeline del usuario

In [71]:
%%cql
SELECT * from tweets_by_user
WHERE user = 'Grandite'
LIMIT 2

user,created_at,id,lang,retweet_id,text
Grandite,2015-10-24 22:10:58,658043028339302400,en,,.@Grandite's Not Only NoSQL https://t.co/lyVRzT59LD featuring @AxelTroike @Rick345 @DataStax @not_only_NoSQL
Grandite,2015-10-23 15:49:20,657584598008770560,en,,.@Grandite's Not Only NoSQL https://t.co/2YuydUtS6F featuring @not_only_NoSQL @Rick345 @craigmullins


## Querys que contienen un determinado tag

In [72]:
%%cql
SELECT * from tweets
WHERE entities CONTAINS 'NoSQL'
LIMIT 2

id,created_at,entities,favorite_count,geo,lang,retweet_count,retweet_id,source,text,user,user_mentions
656958312899092484,2015-10-21 22:20:42,"[u'BigData', u'Java', u'SoapUi', u'Hadoop', u'NoSQL', u'hive', u'MongoDB']",0,,en,2,,"<a href=""https://gamedevbrain.wordpress.com/"" rel=""nofollow"">BigDataTweetBot</a>",RT @geneolot: Hottest in a sense of Technology and women #BigData #Java #SoapUi #Hadoop #NoSQL #hive #MongoDB https://t.co/CD55jAQOx0,BigDataTweetBot,[u'geneolot']
657012074699210752,2015-10-22 01:54:19,"[u'BigData', u'Java', u'SoapUi', u'Hadoop', u'NoSQL', u'hive', u'MongoDB']",0,,sl,1,,"<a href=""http://ifttt.com"" rel=""nofollow"">IFTTT</a>",geneolot: #BigData #Java #SoapUi #Hadoop #NoSQL #hive #MongoDB https://t.co/SlxhhHnaEz,ClearGrip,


## Tweets por hashtag ordernados por fecha de creacción

In [73]:
%%cql
SELECT * from tweets_by_hashtag
WHERE hashtag = 'NoSQL'
    and created_at <= '2014-12-23'
LIMIT 4

hashtag,created_at,id,lang,retweet_id,text,user
NoSQL,2014-12-22 09:47:02,546965124247207936,en,,Quickly get started with #Java and #Cloudant #NoSQL DB service on #IBMBluemix. By @hansb001 http://t.co/CHtsvlOa0b via @wordpressdotcom,HansB001
