In [1]:
import pymongo 
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError
import json

In [2]:
# Open a client against the MongoDB server on localhost, port 27017.
connection = MongoClient(host='localhost', port=27017)

In [3]:
# Drop the 'twitter' database so every run of the notebook starts empty.
connection.drop_database('twitter')

# Item-style access yields the same database handle as `connection.twitter`.
db = connection['twitter']

In [4]:
def insert_tweet(tweet_json):
    """Store one tweet and the documents derived from it in MongoDB.

    Writes to three collections of the module-level ``db``:

    * ``users``    - one document per mentioned user and per tweet author,
      keyed by ``screen_name``.
    * ``hashtags`` - one counter document per hashtag, keyed by the hashtag.
    * ``tweets``   - the tweet itself, keyed by its ``id_str``.

    If the tweet embeds a ``retweeted_status`` it is stored as an
    independent tweet by calling this function recursively.

    Args:
        tweet_json (dict): tweet with the keys ``id_str``, ``user`` (a dict
            holding ``screen_name``), ``entities`` (iterable of hashtags) and
            ``user_mentions`` (list of dicts holding ``screen_name``).
            The dict is modified in place: ``user_mentions`` is replaced by a
            plain list of screen names.

    Raises:
        KeyError: if one of the keys listed above is missing.
    """
    # Flatten user_mentions to a list of screen names, storing every
    # mentioned user as its own document along the way.
    mentioned_names = []
    for mention in tweet_json['user_mentions']:
        screen_name = mention["screen_name"]
        mentioned_names.append(screen_name)
        mention["_id"] = screen_name
        try:
            db.users.insert_one(mention)
        except DuplicateKeyError:
            # A document for this user already exists; keep it untouched.
            pass
    tweet_json['user_mentions'] = mentioned_names

    # One counter document per hashtag. $inc combined with upsert creates the
    # document on first sight and increments it afterwards.
    # update_one replaces find_and_modify, which is deprecated in PyMongo 3.
    for hashtag in tweet_json['entities']:
        db.hashtags.update_one({"_id": hashtag},
                               {"$inc": {"count": 1}},
                               upsert=True)

    # The tweet ID doubles as the _id of the tweets collection.
    db.tweets.replace_one({"_id": tweet_json["id_str"]}, tweet_json, upsert=True)

    # The author goes into the users collection keyed by screen_name,
    # replacing any document previously stored under that key.
    user_json = tweet_json['user']
    db.users.replace_one({"_id": user_json["screen_name"]}, user_json, upsert=True)

    # A retweet carries its parent tweet; store the parent as a separate document.
    if 'retweeted_status' in tweet_json:
        insert_tweet(tweet_json['retweeted_status'])

In [5]:
tweets_data_path = '../data/tweets.json'

# Each line of the file is parsed as one JSON document and inserted.
# The `with` block closes the file even if parsing or an insert raises.
with open(tweets_data_path, "r") as tweets_file:
    for line in tweets_file:
        if not line.strip():
            # Skip blank lines; json.loads would raise on them.
            continue
        tweet_json = json.loads(line)
        insert_tweet(tweet_json)


![png](./images/Mogodb%20Model.png)

## Contando el número de registros

In [6]:
# Report how many documents each of the three collections holds.
for label, collection in (
        ("Número de Tweets: ", db.tweets),
        ("Número de Usuarios: ", db.users),
        ("Número de Hashtags: ", db.hashtags)):
    print("%s %s" % (label, collection.count()))

Número de Tweets:  2800
Número de Usuarios:  1387
Número de Hashtags:  629


## Encontrar un Tweet que tenga al menos dos Hashtags

In [7]:
# "entities.1" addresses the second array element, so the filter only
# matches tweets whose `entities` array has at least two items.
second_hashtag_exists = {"entities.1": {"$exists": True}}
db.tweets.find_one(second_hashtag_exists)

{u'_id': u'658171535824416768',
 u'created_at': u'Sun Oct 25 06:41:37 +0000 2015',
 u'entities': [u'SQL', u'NoSQL'],
 u'favorite_count': 0,
 u'id': 658171535824416768L,
 u'id_str': u'658171535824416768',
 u'lang': u'en',
 u'retweet_count': 1,
 u'retweeted_status': {u'created_at': u'Sun Oct 25 06:30:02 +0000 2015',
  u'entities': [u'SQL', u'NoSQL'],
  u'favorite_count': 0,
  u'id': 658168624692228096L,
  u'id_str': u'658168624692228096',
  u'lang': u'en',
  u'retweet_count': 1,
  u'source': u'<a href="http://bufferapp.com" rel="nofollow">Buffer</a>',
  u'text': u'Coming Full Circle: Why #SQL now powers the #NoSQL Craze, by Ryan Betts of @VoltDB https://t.co/sj11zLPgEX',
  u'user': {u'created_at': u'Thu Jan 30 09:46:50 +0000 2014',
   u'description': u'We cover #BigData, #FinTech and #IoT - Looking at how data science and connected devices are changing technology.',
   u'favourites_count': 1425,
   u'followers_count': 8801,
   u'friends_count': 2505,
   u'geo_enabled': True,
   u'id': 23

## Usuarios más populares

In [8]:
# Ten users with the most followers; only documents that actually carry a
# `followers_count` field are considered.
cursor_users = (
    db.users
    .find({"followers_count": {"$exists": True}})
    .sort("followers_count", pymongo.DESCENDING)
    .limit(10)
)

for user in cursor_users:
    print("%s - %s" % (user["screen_name"], user["followers_count"]))

googlecloud - 447590
Azure - 383977
craigbrownphd - 368570
DavidPapp - 215114
couchbase - 160978
iamdevloper - 125526
analyticbridge - 117146
docker - 91037
javacodegeeks - 79307
developerWorks - 75534


## Usuarios que más tweets tienen

In [9]:
# Group tweets by author screen name, count them, and keep the ten
# authors with the highest count.
tweets_per_user_pipeline = [
    {"$group": {"_id": "$user.screen_name", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 10},
]
cursor = db.tweets.aggregate(tweets_per_user_pipeline)

for row in cursor:
    print("%s - %s" % (row["_id"], row["count"]))

Dev_Topics - 208
BigDataTweetBot - 147
geneolot - 135
ameanmbot - 102
ClearGrip - 97
retweetjava - 72
vikasjee - 53
NoSqlRR - 38
Pvalsfr - 37
vermanivivek - 24


## Hashtags más populares

In [10]:
# Ten hashtag documents with the highest `count` value.
top_hashtags = db.hashtags.find({}).sort("count", pymongo.DESCENDING).limit(10)
for hashtag in top_hashtags:
    print("%s - %s" % (hashtag["_id"], hashtag["count"]))


NoSQL - 1830
BigData - 843
Java - 794
MongoDB - 776
SoapUi - 756
Hadoop - 703
nosql - 606
hive - 473
bigdata - 268
MongoDb - 171


## Tweets de un determinado Hashtag

In [11]:
# Up to ten tweets whose `entities` field matches the hashtag "NoSQL".
nosql_tweets = db.tweets.find({"entities": "NoSQL"}).limit(10)
for tweet in nosql_tweets:
    print("%s - %s" % (tweet["user"]["screen_name"], tweet["text"]))

MAGISTR_OM - RT @DataconomyMedia: Coming Full Circle: Why #SQL now powers the #NoSQL Craze, by Ryan Betts of @VoltDB https://t.co/sj11zLPgEX
DataconomyMedia - Coming Full Circle: Why #SQL now powers the #NoSQL Craze, by Ryan Betts of @VoltDB https://t.co/sj11zLPgEX
jppastor - Four and a Half Types of #NoSQL Databases and when to use them https://t.co/yZuIPpte3D #plevycom  “@plevy
DutGRG - RT @javacodegeeks: #MongoDB Tutorial: A Scalable #NoSQL DB - MEGA Course https://t.co/sK6twnkBqG https://t.co/42ebFFLNiu
javacodegeeks - #MongoDB Tutorial: A Scalable #NoSQL DB - MEGA Course https://t.co/sK6twnkBqG https://t.co/42ebFFLNiu
setechec - RT @javacodegeeks: #MongoDB Tutorial: A Scalable #NoSQL DB - MEGA Course https://t.co/7nz9NgHFnK https://t.co/EQYQ862oTY
javacodegeeks - #MongoDB Tutorial: A Scalable #NoSQL DB - MEGA Course https://t.co/7nz9NgHFnK https://t.co/EQYQ862oTY
xMAnton - RT @javacodegeeks: #MongoDB Tutorial: A Scalable #NoSQL DB - MEGA Course https://t.co/sK6twnkBqG https://t.co

## Tweets de un determinado usuario

In [12]:
# Up to ten tweets written by the user with screen name "MAGISTR_OM".
tweets_of_user = db.tweets.find({"user.screen_name": "MAGISTR_OM"}).limit(10)
for tweet in tweets_of_user:
    print("%s - %s" % (tweet["user"]["screen_name"], tweet["text"]))

MAGISTR_OM - RT @DataconomyMedia: Coming Full Circle: Why #SQL now powers the #NoSQL Craze, by Ryan Betts of @VoltDB https://t.co/sj11zLPgEX


## Usuarios más mencionados

In [13]:
# Emit one row per entry of `user_mentions`, count the rows per mentioned
# user, and keep the ten users with the highest count.
mentions_pipeline = [
    {"$unwind": "$user_mentions"},
    {"$group": {"_id": "$user_mentions", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 10},
]
cursor = db.tweets.aggregate(mentions_pipeline)

for row in cursor:
    print("%s - %s" % (row["_id"], row["count"]))

geneolot - 168
couchbase - 100
aerospikedb - 64
ClearGrip - 54
patio11 - 52
SiliconArmada - 45
vermanivivek - 34
javacodegeeks - 32
McMcgregory - 30
infoworld - 29


## Tweets donde se menciona a un usuario

In [14]:
# Up to ten tweets whose `user_mentions` field matches "couchbase".
couchbase_mentions = db.tweets.find({"user_mentions": "couchbase"}).limit(10)
for tweet in couchbase_mentions:
    print("%s - %s" % (tweet["user"]["screen_name"], tweet["text"]))

javapsyche - RT @couchbase: 7 reasons why companies are switching from #MongoDB to #Couchbase: https://t.co/0b9yVl14bb #NoSQL #database
P_O_Bourge - RT @couchbase: 7 reasons why companies are switching from #MongoDB to #Couchbase: https://t.co/0b9yVl14bb #NoSQL #database
AlessioCavone - RT @FelixLGriffin: 7 reasons companies switch from MongoDB to #Couchbase https://t.co/FWdMm86JxE @couchbase #opensource #NoSQL #PaaS #SaaS …
FelixLGriffin - 7 reasons companies switch from MongoDB to #Couchbase https://t.co/FWdMm86JxE @couchbase #opensource #NoSQL #PaaS #SaaS #MaaS #IoT #iTPete
jlamigueiro - RT @FelixLGriffin: 7 reasons companies switch from MongoDB to #Couchbase https://t.co/FWdMm86JxE @couchbase #opensource #NoSQL #PaaS #SaaS …
ameanmbot - RT @FelixLGriffin: 7 reasons companies switch from MongoDB to #Couchbase https://t.co/FWdMm86JxE @couchbase #opensource #NoSQL #PaaS #SaaS …
OHughe5 - RT @couchbase: 7 reasons why companies are switching from #MongoDB to #Couchbase: https://t.co/0b9