In [1]:
from pymongo import MongoClient
from tqdm import tqdm
from bson.json_util import loads, dumps
import os.path
import numpy as np

In [3]:
# VPN must be active\n",

database = ''
client = MongoClient(
    host=[''],
    username="",
    password="",
    authSource='')
db = client[database]
print ("server version:", client.server_info()["version"])

collections = db.list_collection_names()
print(collections)
#print(db.command("collstats", "events"))

server version: 4.2.5
['ngrams', 'users', 'processed', 'tmp', 'all_tweets', 'processed_new', 'system.views', 'week_04061006', 'week_11061706', 'week_07051305', 'view_week1704', 'week_17042204', 'week_30040605', 'week_23042904', 'week_14052005', 'week_21052705', 'week_28050306']


In [4]:
collection = 'week_11061706'
json_file = 'json_collections/'+collection+'.json'

# download the collection locally
if not os.path.exists(json_file):
    file = open(json_file, "w")
    n_docs = db[collection].estimated_document_count()
    cursor = db[collection].find({})
    for document in tqdm(cursor, total=n_docs):
        file.write(dumps(document))
        file.write("\n")

#create a mongoDB locally
client = MongoClient('localhost', 27017)
db = client[database]
if collection not in db.list_collection_names():
    collection = db[collection]
    file_data = []
    count = 0
    for line in open(json_file).readlines(): count += 1
    for line in tqdm(open(json_file, 'r'),total=count):
        file_data.append(loads(line))
    collection.insert_many(file_data)

In [5]:
client = MongoClient('localhost', 27017)
db = client[database]
print ("server version:", client.server_info()["version"])

collections = db.list_collection_names()
print(collections)

server version: 4.2.5
['week_11061706']


## Sentiment profiles

In [6]:
import re
import pandas as pd
import datetime
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
outdir = 'output/sentiment_profiles/'
collection = str(collections[0])

In [8]:
def words_match_regex(words_list):
    words_regex_objs = []
    for wd in words_list:
        regex_wd = '{}'.format(wd)
        words_regex_objs.append(re.compile(regex_wd, re.IGNORECASE))
    return(words_regex_objs)

def time_to_sec(hhmmss):
        [hours, minutes, seconds] = [int(x) for x in hhmmss.split(':')]
        x = datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)
        return(x.seconds/86400)

In [9]:
def create_profile(word,db,collection):
    
    pipeline = { 'complete_text': { '$in': words_match_regex(word) } }
    
    lst = []
    for doc in db[collection].find(pipeline):
        try:
            lst.append([doc['sentiment']['score'],doc['created_at']])
        except:
            pass
    
    df = pd.DataFrame.from_records(lst).dropna()
    df.columns = ['sentiment','date']
    df[['weekday','month','day','time','r1','r2']] = df['date'].str.split(' ',expand=True)
    sorter = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']
    sorterIndex = dict(zip(sorter,range(len(sorter))))
    df['weekday_id'] = df['weekday'].map(sorterIndex)

    sorter = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    sorterIndex = dict(zip(sorter,range(len(sorter))))
    df['month_id'] = df['month'].map(sorterIndex)

    df = df.sort_values(by=['month_id', 'day', 'weekday_id', 'time'])

    df['date_simple'] = df['weekday']+" "+df['month']+" "+df['day']

    sorter = df['date_simple'].unique()
    sorterIndex = dict(zip(sorter,range(len(sorter))))
    df['time_id'] = [ time_to_sec(i) for i in df['time'] ]
    df['time_axis'] = df['date_simple'].map(sorterIndex)+df['time_id']
    df['time_axis'] = df['time_axis'].round(4)

    df = df.sort_values(by='time_axis',ascending=True)
    df.drop(['weekday','month','day','time','r1','r2','weekday_id','month_id','time_id'], axis=1, inplace=True)

    H = 12
    c = [(j+1)/H for j in range(24)]
    lst = []
    for i in df['time_axis']:
        a = 'none'
        for j in c:
            if int(i)+j-(1/H) <= i <= int(i)+j:
                a = int(i)+j-(1/H)/2
        lst.append(a)
    df['time_bin'] = lst
    return(df)

def save_profile_png(df, word, outdir):
    sns.set(rc={'figure.figsize':(15, 5)})
    cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True)
    ax = sns.lineplot(x="time_bin", y="sentiment", data=df, linewidth=0.5, marker='o', linestyle='-', color='firebrick',ci=95)
    ax = sns.scatterplot(x="time_axis", y="sentiment", data=df, palette=cmap, alpha=.2, s=10, color='grey')
    ax.set_xticklabels('')
    ax.set_xlim(-0.5,len(df['date_simple'].unique()))
    ax.set_ylabel('Sentimient of tweets about COVID-19')
    ax.set_xlabel('')
    ax.set_title(word)
    ax.axhline(0, ls='--',color='black')

    k = 0
    for j in range(len(list(ax.get_xticks())[1:-1])): #set([int(i) for i in list(ax.get_xticks())[1:-1]]):
        i = list(ax.get_xticks())[1:-1][j]
        lab = df['date_simple'].unique()[int(i)]

        tot = df[df['date_simple']==lab].shape[0]
        posi = df[(df['date_simple']==lab)&(df['sentiment']>0)].shape[0]
        nega = df[(df['date_simple']==lab)&(df['sentiment']<0)].shape[0]
        percP = 'Sent. posi: '+"{:.2f}".format((posi/tot)*100)+'%'
        percN = 'Sent. nega: '+"{:.2f}".format((nega/tot)*100)+'%'
    
        plt.text(i+0.5,min(df['sentiment'])-0.15,lab,horizontalalignment='center')
        plt.text(i+0.5,min(df['sentiment'])-0.25,percP,horizontalalignment='center',fontsize=10)
        plt.text(i+0.5,min(df['sentiment'])-0.35,percN,horizontalalignment='center',fontsize=10)
        plt.axvline(i, 0, 1,color='grey', linestyle='dashed', linewidth=1)
        if j == 0:
            k = (list(ax.get_xticks())[1:-1][j+1]-i)/2
        #plt.axvline(i+k, 0, 1,color='grey', linestyle='dashed', linewidth=1)

    plt.savefig(outdir+'-'.join(word)+'.png',bbox_inches='tight')
    plt.close()

In [10]:
words = ["año","caso","crisis","gente","gobierno","gracias","madrid","medida","muerto","mundo","país","persona","riesgo","social","vida"]
#words = ['mascarilla','residencia','vacuna','confinamiento','desescalada','trabajo','estado de alarma','UCI','niños','deporte','fase']

for word in tqdm(words):
    df = create_profile([word],db,collection)
    save_profile_png(df,[word],outdir)

100%|██████████| 15/15 [05:23<00:00, 21.54s/it]


## Retweet impact

In [10]:
outdir = 'output/retweet_impact/'
collection = str(collections[0])

In [11]:
match = {
        'retweeted_status': {'$exists': 1}, # it must be a retweet
        'in_reply_to_status_id_str': {'$eq': None}, # it must not be a reply
        'is_quote_status': False # it must not be a quote
        }
pipeline = [{'$match': match}]

lst = []
for doc in db[collection].aggregate(pipeline, allowDiskUse=True):

    user_screen_name = doc['user']['screen_name']
    retweeted_status_id = doc['retweeted_status']['id']
    retweeted_user_screen_name = doc['retweeted_status']['user']['screen_name']

    L = [ user_screen_name, 
         retweeted_status_id,
         retweeted_user_screen_name ]

    lst.append(L)

df = pd.DataFrame.from_records(lst)
df.columns = [ 'user_screen_name',
              'retweeted_status_id',
              'retweeted_user_screen_name' ]

In [12]:
d_retweeted_tweets = df.groupby(['retweeted_user_screen_name'])['retweeted_status_id'].nunique().to_dict()
d_retweeting_users = df.groupby(['retweeted_user_screen_name'])['user_screen_name'].nunique().to_dict()

df_RI = pd.DataFrame()
df_RI['retweeted_user_screen_name'] = df['retweeted_user_screen_name']
df_RI['retweeted_tweets'] = df.retweeted_user_screen_name.map( d_retweeted_tweets )
df_RI['retweeting_users'] = df.retweeted_user_screen_name.map( d_retweeting_users )
df_RI['retweet_impact'] = df_RI['retweeted_tweets'] * np.log(df_RI['retweeting_users'])
df_RI = df_RI.sort_values(by=['retweet_impact'],ascending=False).drop_duplicates()
df_RI['retweet_impact'] = np.log10(df_RI['retweet_impact'])
df_RI = df_RI.replace([np.inf, -np.inf], np.nan).dropna()
df_RI.head(10).to_csv(outdir+'top10_retweet_impact.txt',index=None)

df_RI.hist('retweet_impact')
plt.savefig(outdir+'retweet_impact.png',bbox_inches='tight')
plt.close()

  result = getattr(ufunc, method)(*inputs, **kwargs)
