# Almacenando datos de Twitter en Neo4j

![png](http://d20tdhwx2i89n1.cloudfront.net/image/upload/t_next_gen_article_large_767/btp7c4imyevfdt9icxlo.jpg)

In [None]:
%load_ext cypher

In [None]:
%%cypher
MATCH (n)
// DETACH DELETE removes each node together with all of its relationships
DETACH DELETE n

In [None]:
%cypher CREATE CONSTRAINT ON (t:Tweet) ASSERT t.id IS UNIQUE

In [None]:
%cypher CREATE CONSTRAINT ON (u:User) ASSERT u.username IS UNIQUE

In [None]:
%cypher CREATE CONSTRAINT ON (h:HashTag) ASSERT h.hashtag IS UNIQUE

![png](./../images/Model.png)

In [None]:
from pprintpp import pprint as pp


In [None]:
from py2neo import Graph, Relationship, Node
import json

graph = Graph()

In [None]:
def parse_user(user_json):
    """Merge a Twitter user into the graph and return its node.

    A ``User`` node identified by ``username`` (the payload's
    ``screen_name``) and ``name`` is merged first; the optional profile
    fields found in the payload are then stored on it.

    Args:
        user_json: dict describing a user (a tweet's ``user`` object or a
            user-mention entry).

    Returns:
        The merged ``User`` node.
    """
    node_user = Node("User",
                     username=user_json.get('screen_name'),
                     name=user_json.get('name'))
    graph.merge(node_user)

    profile_fields = (
        'created_at',
        'description',
        'favourites_count',
        'followers_count',
        'friends_count',
        'statuses_count',
        'time_zone',
        'profile_image_url',
    )
    # Keep only the fields the payload actually carries. Sparse payloads
    # (e.g. user mentions, which presumably hold little more than the
    # screen name) then leave an already stored profile untouched.
    # NOTE(review): confirm against py2neo's push semantics.
    properties = {field: user_json[field]
                  for field in profile_fields
                  if user_json.get(field) is not None}

    if properties:
        node_user.update(**properties)
        graph.push(node_user)

    return node_user

In [None]:
def parse_tweet(tweet_json):
    """Store one tweet, its author and its related entities in the graph.

    Merges the author ``User`` node, the ``Tweet`` node and a ``TWEETED``
    relationship between them, then links the tweet to every mentioned
    user (``MENCIONED``) and to every hashtag (``HASHTAG``).  When the
    payload carries a ``retweeted_status``, the tweet is linked to the
    original author (``RETWEET_OF``) and the embedded original tweet is
    stored through a recursive call.

    Two payload layouts are accepted:

    * nested: ``entities`` is a dict holding ``hashtags`` (items with a
      ``text`` field, or plain strings) and ``user_mentions``;
    * flat: ``entities`` is an iterable of hashtag strings and
      ``user_mentions`` sits at the top level of the tweet.

    Args:
        tweet_json: dict decoded from one tweet payload.

    Raises:
        KeyError: if ``tweet_json`` (or its ``retweeted_status``) has no
            ``'user'`` entry.
    """
    node_user = parse_user(tweet_json['user'])

    node_tweet = Node("Tweet",
                      id=tweet_json.get('id'),
                      created_at=tweet_json.get('created_at', None),
                      lang=tweet_json.get('lang', None),
                      retweet_count=tweet_json.get('retweet_count', None),
                      source=tweet_json.get('source', None),
                      text=tweet_json.get('text', None))

    graph.merge(node_tweet)
    graph.merge(Relationship(node_user, "TWEETED", node_tweet))

    entities = tweet_json.get('entities') or {}
    if isinstance(entities, dict):
        # Nested layout: iterating the dict itself would yield its keys
        # ("hashtags", "urls", ...) instead of the actual hashtags.
        user_mentions = (tweet_json.get('user_mentions')
                         or entities.get('user_mentions')
                         or [])
        hashtags = [tag.get('text') if isinstance(tag, dict) else tag
                    for tag in entities.get('hashtags') or []]
    else:
        # Flat layout: entities is already the collection of hashtags.
        user_mentions = tweet_json.get('user_mentions') or []
        hashtags = entities

    for user_mention_json in user_mentions:
        node_user_mencioned = parse_user(user_mention_json)
        graph.merge(Relationship(node_tweet, "MENCIONED", node_user_mencioned))

    for hashtag in hashtags:
        if hashtag is None:
            # A hashtag entry without text cannot identify a HashTag node.
            continue
        node_entity = Node("HashTag", hashtag=hashtag)
        graph.merge(node_entity)
        graph.merge(Relationship(node_tweet, "HASHTAG", node_entity))

    if 'retweeted_status' in tweet_json:
        node_user_retweeted = parse_user(tweet_json['retweeted_status']['user'])
        graph.merge(Relationship(node_tweet, "RETWEET_OF", node_user_retweeted))

        # Store the original tweet (and its own entities) as well.
        parse_tweet(tweet_json['retweeted_status'])

In [None]:
def load_file(tweets_data_path):
    """Load a file of tweets (one JSON document per line) into the graph.

    Each non-blank line is decoded with ``json.loads`` and handed to
    ``parse_tweet``.  The current line index is printed every 500 lines
    as a progress indicator.

    Args:
        tweets_data_path: path of the line-delimited JSON file to read.
    """
    # The context manager closes the file even if a tweet fails to parse.
    with open(tweets_data_path, "r") as tweets_file:
        for index, tweet in enumerate(tweets_file):
            if not tweet.strip():
                # Blank lines carry no tweet and would make json.loads fail.
                continue

            parse_tweet(json.loads(tweet))

            if index % 500 == 0:
                print(index)

In [None]:
load_file('../../data/tweets.json')

In [None]:
%%cypher
MATCH p = ((u:User {username : 'couchbase'})-[r:TWEETED]->(t))
RETURN u.username, t.text, type(r)
LIMIT 10

In [None]:
%%cypher match (n:HashTag)-[r]-() 
return n.hashtag, count(r) as degree 
order by degree desc
limit 10

In [None]:
%matplotlib inline

In [None]:
results = %cypher match (n:HashTag)-[r]-()  return n.hashtag as HashTag, count(r) as Degree order by Degree desc limit 10

In [None]:
results.get_dataframe()

In [None]:
results.pie()

In [None]:
results.plot()

In [None]:
results.bar()

In [None]:
from py2neo import Graph
graph = Graph()

In [None]:
# Hashtags that appear in the same tweets as "neo4j", most frequent first.
# The target node needs the ":HashTag" label; without the colon, "HashTag"
# is only a variable name and the pattern matches nodes of any label.
query = """
    MATCH (h:HashTag)<-[:HASHTAG]-(:Tweet)-[:HASHTAG]->(:HashTag {hashtag:"neo4j"}) 
    WHERE h.hashtag <> "neo4j"
    RETURN h.hashtag AS hashtag, COUNT(*) AS count
    ORDER BY count DESC
    LIMIT 10
"""

results = graph.run(query)
for d in results:
    print(d)



In [None]:
# The ten users with the highest stored follower count.
top_followers_query = """
MATCH (u:User)
WHERE exists(u.followers_count)
return distinct u.username, u.followers_count
order by u.followers_count DESC LIMIT 10
"""

results = graph.run(top_followers_query)

# Print each returned record on its own line.
for record in results:
    print(record)

In [None]:
type(results)

In [None]:
%%cypher
MATCH (u:User)
WHERE exists(u.followers_count)
return distinct u.username, u.followers_count
order by u.followers_count DESC LIMIT 10

In [None]:
%%cypher
match (n)
return distinct labels(n)

In [None]:
result = %cypher MATCH (hashtag:HashTag)<-[:HASHTAG]-(tweet:Tweet) \
                 RETURN hashtag.name AS hashtag, COUNT(tweet) AS tweets \
                 ORDER BY tweets DESC LIMIT 5
        
df = result.get_dataframe()
df.head()