In [192]:
# Requires pymongo 3.6.0+
from pymongo import MongoClient
import pandas as pd
import datetime
from bson.code import Code

client = MongoClient("mongodb://bigdata-mongodb-04.virtual.uniandes.edu.co:8087/", retryWrites=False)
database = client["Grupo03"]
collection = database["COL_tweets"]
collection_dataset = database["COL_dataset"]

In [133]:
query = {}
query["hashtags"] = {
    u"$gt": {
        u"$size": 0.0
    }
}


projection = {}
projection["created_at"] = 1.0
projection["hashtags"] = 1.0

cursor = collection.find(query, projection = projection)
data = []
try:
    for doc in cursor:
        for i in range(len(doc['hashtags'])):
            tweet_date = datetime.datetime.strptime(doc['created_at'], '%a %b %d %H:%M:%S %z %Y')
            data.append([doc['hashtags'][i]['text'].lower(), tweet_date])
finally:
    client.close()

1935

In [44]:
df = pd.DataFrame(data,columns=['hashtags', 'created_at'])
df.head()

Unnamed: 0,hashtags,created_at
0,ingresosolidario,2020-04-08 02:52:20+00:00
1,cuarentenaporlavida,2020-04-08 02:52:20+00:00
2,ingresosolidario,2020-04-08 02:27:11+00:00
3,cuarentenaporlavida,2020-04-08 01:26:13+00:00
4,alargarlacuarentenaes,2020-04-07 12:31:38+00:00


In [45]:
query = {}
query["replys.id"] = {
    u"$exists": True
}
query["replys.hashtags"] = {
    u"$ne": u""
}

projection = {}
projection["replys.hashtags"] = 1.0
projection["replys.created_at"] = 1.0

cursor = collection.find(query, projection = projection)
data = []
try:
    for doc in cursor:
        for i in range(len(doc['replys'])):
            if len(doc['replys'][i]['hashtags']) > 0:
                tweet_date = datetime.datetime.strptime(doc['replys'][i]['created_at'], '%a %b %d %H:%M:%S %z %Y')
                for j in range(len(doc['replys'][i]['hashtags'])):
                    data.append([doc['replys'][i]['hashtags'][j]['text'].lower(), tweet_date])

finally:
    client.close()

In [46]:
df = pd.DataFrame(data,columns=['hashtags', 'created_at'])
df.head()

Unnamed: 0,hashtags,created_at
0,ingresosolidario,2020-04-08 03:43:18+00:00
1,ingresosolidario,2020-04-08 03:20:32+00:00
2,ingresosolidario,2020-04-08 04:35:15+00:00
3,ingresosolidario,2020-04-08 04:04:44+00:00
4,ingreso,2020-04-08 04:00:07+00:00


In [47]:
query = {}
query["quotes.id"] = {
    u"$exists": True
}
query["quotes.hashtags"] = {
    u"$ne": u""
}

projection = {}
projection["quotes.hashtags"] = 1.0
projection["quotes.created_at"] = 1.0

cursor = collection.find(query, projection = projection)
#data = []
try:
    for doc in cursor:
        for i in range(len(doc['quotes'])):
            if len(doc['quotes'][i]['hashtags']) > 0:
                tweet_date = datetime.datetime.strptime(doc['quotes'][i]['created_at'], '%a %b %d %H:%M:%S %z %Y')                
                for j in range(len(doc['quotes'][i]['hashtags'])):
                    data.append([doc['quotes'][i]['hashtags'][j]['text'].lower(), date])
finally:
    client.close()

In [48]:
df = pd.DataFrame(data,columns=['hashtags', 'created_at'])
df.head()

Unnamed: 0,hashtags,created_at
0,ingresosolidario,2020-04-08 03:43:18+00:00
1,ingresosolidario,2020-04-08 03:20:32+00:00
2,ingresosolidario,2020-04-08 04:35:15+00:00
3,ingresosolidario,2020-04-08 04:04:44+00:00
4,ingreso,2020-04-08 04:00:07+00:00


## MAP REDUCE HASHTAGS

In [200]:
map = Code("function () {"
            "var hashtags_text = this.hashtags;"
            "var hashtags_date = this.created_at.split(' ');"
            "hashtags_text.forEach(function(z) {"
            "var text = z['text'].toLowerCase();"
            "emit(text.concat('#').concat(hashtags_date[1]).concat(hashtags_date[2]).concat(hashtags_date[5]), 1);"
            "});"
           
            "var hashtags_text_reply = this.replys;"
            "hashtags_text_reply.forEach(function(y) {"
            "var hashtags_date = y['created_at'].split(' ');"
            "var hashtags = y['hashtags'];"
            "hashtags.forEach(function(x) {"
            "var text = x['text'].toLowerCase();"           
            "emit(text.concat('#').concat(hashtags_date[1]).concat(hashtags_date[2]).concat(hashtags_date[5]), 1);"
            "});"           
            "});"    

            "var hashtags_text_quotes = this.quotes;"
            "hashtags_text_quotes.forEach(function(m) {"
            "var hashtags_date = m['created_at'].split(' ');"
            "var hashtags = m['hashtags'];"
            "hashtags.forEach(function(n) {"
            "var text = n['text'].toLowerCase();"           
            "emit(text.concat('#').concat(hashtags_date[1]).concat(hashtags_date[2]).concat(hashtags_date[5]), 1);"
            "});"           
            "});"           
            "}")

In [201]:
reduce = Code("function (key, values) {"
               "  var total = 0;"
               "  for (var i = 0; i < values.length; i++) {"
               "    total += values[i];"
               "  }"
               "  return total;"
               "}")

In [202]:
result = database.COL_tweets.map_reduce(map, reduce, "evol_hashtags")

In [261]:
collection = database["evol_hashtags"]

query = {}
projection = {}
projection["_id"] = 1.0
projection["value"] = 1.0

cursor = collection.find(query, projection = projection)
data = []
fecha = datetime.datetime.strptime('2020-04-01', '%Y-%m-%d')
try:
    for doc in cursor:
        dates = doc['_id'].split('#')
        tweet_date = datetime.datetime.strptime(dates[1], '%b%d%Y')
        if tweet_date >= fecha:
            data.append([dates[0], tweet_date, doc['value']])
finally:
    client.close()

df_hashtags = pd.DataFrame(data,columns=['hashtag', 'date', 'value'])
df_hashtags.head(100)

Unnamed: 0,hashtag,date,value
0,08abr,2020-04-09,1.0
1,09abril,2020-04-09,1.0
2,12abril,2020-04-14,1.0
3,6amhoyporhoy,2020-04-08,1.0
4,9abril,2020-04-09,1.0
...,...,...,...
95,blu4p0,2020-04-09,11.0
96,bogota,2020-04-09,1.0
97,bogotasolidariaencasa,2020-04-08,1.0
98,bogotasolidariaencasa,2020-04-20,1.0


In [253]:
df_hashtags = df_hashtags.sort_values(by=['date'])
newdf = df_hashtags.query('hashtag == "bogotasolidariaencasa" & date == "bogotasolidariaencasa"')
newdf.head()

Unnamed: 0,hashtag,date,value
156,bogotasolidariaencasa,2020-03-25,1.0
157,bogotasolidariaencasa,2020-03-26,16.0
158,bogotasolidariaencasa,2020-03-27,1.0
159,bogotasolidariaencasa,2020-03-31,2.0
152,bogotasolidariaencasa,2020-04-08,1.0


In [262]:
import plotly.express as px

fig = px.line(df_hashtags, x="date", y="value", color='hashtag')
fig.show()