# Insertando datos de Twitter en Cassandra

![png](./images/Cassandra.png)

## Creación del KeySpace

In [None]:
%load_ext cql

In [None]:
%%cql
-- IF EXISTS turns this into a no-op (instead of an error) when the keyspace
-- has not been created yet, so the notebook can be run from a clean cluster.
DROP KEYSPACE IF EXISTS twitter;

In [None]:
%%cql
-- Create the "twitter" keyspace with SimpleStrategy and a replication factor
-- of 1, i.e. a single replica of every row.
-- NOTE(review): presumably intended for a single-node development cluster.
CREATE KEYSPACE twitter 
WITH replication = {'class':'SimpleStrategy', 'replication_factor': 1};

## Uso del KeySpace

Se utiliza `USE` para cambiar el keyspace por defecto.


In [None]:
# Line magic: switch the default keyspace to "twitter".
%cql USE twitter;

## Creación de tablas

* Los Keyspaces contienen tablas
* Las tablas contienen datos

In [None]:
%%cql
-- User profiles: one row per user, keyed by screen_name.
-- All columns other than the key are optional at write time; the notebook
-- also updates only name and id_str for users that are merely mentioned.
CREATE TABLE users (
    screen_name text,
    created_at timestamp,
    id_str text,
    name text,
    description text,
    profile_image_url text,
    time_zone text,
    geo_enabled boolean,
    favourites_count int,
    followers_count int,
    friends_count int,
    statuses_count int,
    PRIMARY KEY (screen_name)
);

In [None]:
%%cql
-- User-defined type for a tweet location: a type label plus a list of
-- coordinates. It is used as frozen<geo> by the tweets table.
-- NOTE(review): CQL float is single precision; confirm that is precise
-- enough for the coordinates being stored.
CREATE TYPE geo (
  type text,
  coordinates list<float>
)

In [None]:
%%cql
-- Main tweet table: one row per tweet, keyed by the tweet id.
-- entities holds the tweet's hashtags and user_mentions the mentioned screen
-- names (both as lists of text); user is the author's screen_name and
-- retweet_id is the id of the retweeted tweet, when there is one.
CREATE TABLE tweets (
    id text,
    created_at timestamp,
    entities list<text>,
    favorite_count int,
    retweet_count int,
    source text,
    lang text,
    text text,
    geo  frozen<geo>,
    user_mentions list<text>,
    user text,
    retweet_id text,
    PRIMARY KEY (id)
);

In [None]:
%%cql
-- Secondary index on the entities list so tweets can be filtered with
-- "entities CONTAINS <hashtag>".
CREATE INDEX tweets_hashtag ON tweets(entities)

In [None]:
%%cql
-- Secondary index on the author column so tweets can be filtered with
-- "user = <screen_name>" without knowing the tweet id.
CREATE INDEX tweets_user ON tweets(user)

In [None]:
%%cql
-- Per-user timeline: one partition per user, newest tweets first.
-- The tweet id is part of the clustering key so that two tweets written by
-- the same user with the same created_at value are kept as separate rows;
-- with (user, created_at) alone the second insert would silently overwrite
-- the first, because a CQL INSERT is an upsert on the primary key.
CREATE TABLE tweets_by_user (
    user text,
    created_at timestamp,
    lang text,
    text text,
    id text,
    retweet_id text,
    PRIMARY KEY ((user), created_at, id)
)
WITH CLUSTERING ORDER BY (created_at DESC, id DESC);

In [None]:
%%cql
-- Tweets grouped by hashtag: one partition per hashtag, newest tweets first.
-- The tweet id is part of the clustering key so that different tweets using
-- the same hashtag at the same created_at value are kept as separate rows;
-- with (hashtag, created_at) alone they would overwrite each other, because
-- a CQL INSERT is an upsert on the primary key.
CREATE TABLE tweets_by_hashtag (
    hashtag text,
    created_at timestamp,
    lang text,
    text text,
    user text,
    id text,
    retweet_id text,
    PRIMARY KEY ((hashtag), created_at, id)
)
WITH CLUSTERING ORDER BY (created_at DESC, id DESC);

In [None]:
%%cql
-- Usage counter per hashtag. count is a counter column, so it is only ever
-- modified with "UPDATE ... SET count = count + 1".
CREATE TABLE hashtags (
    hashtag text,
    count counter,
    PRIMARY KEY (hashtag)
);

![png](./images/Cassandra_Model.png)

## Inserción de la información

In [None]:
from cassandra.cluster import Cluster, BatchStatement, ConsistencyLevel
# Create the driver's cluster handle with its default settings.
# NOTE(review): no contact points are given, so this presumably targets a
# Cassandra node on localhost -- confirm for your environment.
cluster = Cluster()
# Open a session whose default keyspace is "twitter".
session = cluster.connect('twitter')

In [None]:
from datetime import datetime
import dateutil.parser

In [None]:
from collections import namedtuple

# Python-side representation of the "geo" user-defined type: a named tuple
# with the same two fields as the CQL type.
Geo = namedtuple('geo', 'type coordinates')

# Register the class for the "geo" type of the "twitter" keyspace.
cluster.register_user_type("twitter", "geo", Geo)

In [None]:
def insert_tweet(tweet_json):
    """Store one tweet in ``tweets``, ``tweets_by_user`` and ``tweets_by_hashtag``.

    Args:
        tweet_json: Tweet dictionary. The keys ``id_str``, ``created_at``,
            ``entities``, ``favorite_count``, ``retweet_count``, ``source``,
            ``lang``, ``text``, ``user_mentions`` and ``user["screen_name"]``
            are required. ``geo`` and ``retweeted_status`` are optional.
            ``entities`` is iterated as the list of hashtags; one
            ``tweets_by_hashtag`` row is written per element.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    created_at = dateutil.parser.parse(tweet_json["created_at"])
    user = tweet_json["user"]["screen_name"]

    # The retweeted tweet lives under "retweeted_status" (the same key that
    # parse_tweet checks); only its id is stored with this tweet.
    retweet_id = None
    if "retweeted_status" in tweet_json:
        retweet_id = tweet_json["retweeted_status"]["id_str"]

    # "geo" may be absent or null; build the UDT value only when it is set.
    geo = None
    geo_json = tweet_json.get("geo")
    if geo_json:
        geo = Geo(geo_json["type"], geo_json["coordinates"])

    # Main tweet row
    session.execute(
"""
INSERT INTO tweets (
id,
created_at,
entities,
favorite_count,
retweet_count,
source,
lang,
text,
geo,
user_mentions,
user,
retweet_id
) VALUES (
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s
)
"""
    , [
        tweet_json["id_str"],
        created_at,
        tweet_json["entities"],
        tweet_json["favorite_count"],
        tweet_json["retweet_count"],
        tweet_json["source"],
        tweet_json["lang"],
        tweet_json["text"],
        geo,
        tweet_json["user_mentions"],
        user,
        retweet_id
    ])

    # User timeline
    session.execute(
"""
INSERT INTO tweets_by_user (
    user,
    created_at,
    lang,
    text,
    id,
    retweet_id
) VALUES (
%s,
%s,
%s,
%s,
%s,
%s
)
"""
    , [
        user,
        created_at,
        tweet_json["lang"],
        tweet_json["text"],
        tweet_json["id_str"],
        retweet_id
    ])

    # Tweets by hashtag: one row per hashtag carried by the tweet
    for hashtag in tweet_json["entities"]:
        session.execute(
"""
INSERT INTO tweets_by_hashtag (
    hashtag,
    created_at,
    lang,
    text,
    user,
    id,
    retweet_id
) VALUES (
%s,
%s,
%s,
%s,
%s,
%s,
%s
)
"""
        , [
            hashtag,
            created_at,
            tweet_json["lang"],
            tweet_json["text"],
            user,
            tweet_json["id_str"],
            retweet_id
        ])

In [None]:
def insert_hashtag(hashtag):
    """Increment the usage counter of ``hashtag`` in the ``hashtags`` table.

    Args:
        hashtag: Value bound to the ``hashtag`` key column.
    """
    increment_cql = "UPDATE hashtags SET count = count + 1 WHERE hashtag = %s"
    session.execute(increment_cql, [hashtag])

In [None]:
def insert_user(user_json):
    """Insert a minimal ``users`` row holding screen name, name and id.

    NOTE: a later cell defines ``insert_user`` again; once that cell has run,
    the name refers to the later, full-profile version.

    Args:
        user_json: Mapping with the keys ``screen_name``, ``name`` and
            ``id_str``.
    """
    row_values = [user_json[key] for key in ("screen_name", "name", "id_str")]
    session.execute(
        "INSERT INTO users (screen_name, name, id_str) VALUES (%s, %s, %s)",
        row_values,
    )

In [None]:
def insert_user_mencion(user_json):
    """Record a mentioned user by setting ``name`` and ``id_str`` on its row.

    Only these two columns are written; the row is addressed by
    ``screen_name``.

    Args:
        user_json: Mapping with the keys ``name``, ``id_str`` and
            ``screen_name``.
    """
    name = user_json["name"]
    id_str = user_json["id_str"]
    screen_name = user_json["screen_name"]
    update_cql = "UPDATE users SET name = %s, id_str=%s WHERE screen_name = %s"
    session.execute(update_cql, [name, id_str, screen_name])

In [None]:
def insert_user(user_json):
    """Insert the full profile of a user into the ``users`` table.

    Args:
        user_json: User dictionary. The keys ``screen_name``, ``created_at``,
            ``id_str``, ``name``, ``description``, ``profile_image_url``,
            ``geo_enabled``, ``favourites_count``, ``followers_count``,
            ``friends_count`` and ``statuses_count`` are required;
            ``time_zone`` is optional and stored as null when absent.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    created_at = dateutil.parser.parse(user_json["created_at"])
    # Read the time zone from the user being inserted, not from whatever
    # tweet happens to be bound to a module-level variable.
    time_zone = user_json.get("time_zone")

    session.execute(
"""
INSERT INTO users (
screen_name,
created_at,
id_str,
name,
description,
profile_image_url,
time_zone,
geo_enabled,
favourites_count,
followers_count,
friends_count,
statuses_count
) VALUES (
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s
)
""", [
        user_json["screen_name"],
        created_at,
        user_json["id_str"],
        user_json["name"],
        user_json["description"],
        user_json["profile_image_url"],
        time_zone,
        user_json["geo_enabled"],
        user_json["favourites_count"],
        user_json["followers_count"],
        user_json["friends_count"],
        user_json["statuses_count"]
    ])
            

In [None]:
def parse_tweet(tweet_json):
    """Store a tweet together with its hashtags, mentions, author and retweet.

    The dictionary is modified in place: ``user_mentions`` is replaced by the
    plain list of mentioned screen names before the tweet is inserted.

    Args:
        tweet_json: Tweet dictionary with at least the keys ``user_mentions``
            (list of user dictionaries), ``entities`` (iterated as hashtags)
            and ``user``, plus whatever ``insert_tweet`` reads.
    """
    # Flatten user_mentions to a list of screen names, recording each
    # mentioned user along the way.
    mentioned_names = []
    for mention in tweet_json['user_mentions']:
        mentioned_names.append(mention["screen_name"])
        insert_user_mencion(mention)
    tweet_json['user_mentions'] = mentioned_names

    for hashtag in tweet_json['entities']:
        insert_hashtag(hashtag)

    insert_tweet(tweet_json)
    insert_user(tweet_json['user'])

    # A retweet embeds its parent tweet; store the parent as an independent
    # tweet as well.
    if 'retweeted_status' not in tweet_json:
        return
    parse_tweet(tweet_json['retweeted_status'])


In [None]:
import json
from pprintpp import pprint as pp
import sys

tweets_data_path = './data/tweets.json'

# The file holds one JSON-encoded tweet per line. The context manager closes
# it even when parsing or an insert fails part-way through.
with open(tweets_data_path, "r", encoding="utf-8") as tweets_file:
    for line in tweets_file:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline)
        tweet_json = json.loads(line)
        parse_tweet(tweet_json)


## Consultas

In [None]:
# Count the rows of the users table; one() returns the single result row.
session.execute("SELECT count(*) from users").one()

In [None]:
# Print the number of rows of every table, one "<label> <count>" line each.
count_queries = (
    ("Users", "SELECT count(*) from users"),
    ("Tweets", "SELECT count(*) from tweets"),
    ("Tweets por Usuario", "SELECT count(*) from tweets_by_user"),
    ("Tweets por Hashtag", "SELECT count(*) from tweets_by_hashtag"),
    ("Hashtags", "SELECT count(*) from hashtags"),
)
for label, query in count_queries:
    print(label, session.execute(query).one().count)

## Consulta de usuarios

In [None]:
%%cql
-- Fetch two users directly by their partition key (screen_name).
SELECT * from users
WHERE screen_name IN  ( 'Grandite', 'pinaldave')

## Tweets de un determinado usuario

In [None]:
%%cql
-- user is not part of the primary key of tweets; this filter relies on the
-- tweets_user secondary index.
SELECT * from tweets
WHERE user = 'Grandite'
LIMIT 2

## Hashtags más populares

In [None]:
def getCount(item):
    """Return the ``count`` field of a ``hashtags`` row (used as sort key)."""
    return item.count


# Read every hashtag counter and sort on the client side, highest count first.
result_cursor = session.execute("select * from hashtags")
ranked_rows = sorted(result_cursor, key=getCount, reverse=True)

# Show the ten most used hashtags.
for row in ranked_rows[:10]:
    print(row.hashtag, "-", row.count)

## Timeline del usuario

In [None]:
%%cql
-- Reads a single partition (user is the partition key); rows come back
-- newest first because the table clusters on created_at in descending order.
SELECT * from tweets_by_user
WHERE user = 'Grandite'
LIMIT 2

## Tweets que contienen un determinado tag

In [None]:
%%cql
-- CONTAINS on the entities list relies on the tweets_hashtag secondary index.
SELECT * from tweets
WHERE entities CONTAINS 'NoSQL'
LIMIT 2

## Tweets por hashtag ordenados por fecha de creación

In [None]:
%%cql
-- Reads one hashtag partition and restricts the created_at clustering column
-- to an upper bound; rows come back newest first (descending clustering
-- order).
SELECT * from tweets_by_hashtag
WHERE hashtag = 'NoSQL'
   and created_at <= '2015-10-25'
LIMIT 4