In [1]:
!python3 -m pip install -U networkx pandas numpy matplotlib plotly scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Requirement already up-to-date: networkx in /home/grudi2/.local/lib/python3.6/site-packages (2.5)
Requirement already up-to-date: pandas in /home/grudi2/.local/lib/python3.6/site-packages (1.1.1)
Requirement already up-to-date: numpy in /home/grudi2/.local/lib/python3.6/site-packages (1.19.1)
Requirement already up-to-date: matplotlib in /home/grudi2/.local/lib/python3.6/site-packages (3.3.1)
Requirement already up-to-date: plotly in /home/grudi2/.local/lib/python3.6/site-packages (4.9.0)
Requirement already up-to-date: scikit-learn in /home/grudi2/.local/lib/python3.6/site-packages (0.23.2)
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
import datetime
import nltk
import numpy as np
import os
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
import pytz
import re

from PIL import Image
from string import punctuation
from textwrap import fill, wrap

In [79]:
STOP_WORDS = set(nltk.corpus.stopwords.words('portuguese') + list(punctuation))
STOP_WORDS.update(['corona', 'covid', '19']) # search terms
STOP_WORDS.update(['tá', 'pra', 'pq', ]) # words that shoudn't be here

DATA_DIR = os.path.expanduser('~/Dados/corona')
SITE_DIR = os.path.expanduser('~/Dados/covid_viz')
BULLETIN_DIR = os.path.join(SITE_DIR, 'boletins')

TIMEZONE = 'America/Sao_Paulo'

# Load dataset

In [4]:
def get_files_from_date(dir_, term, date, ext='csv'):
    if date.weekday() != 6:
        date -= datetime.timedelta(days=(date.weekday() + 1)) # a sunday

    ls = os.listdir(DATA_DIR)

    files = []
    for i in range(7):
        file_name = '{}_{}.{}'.format(term, date.strftime('%y%m%d'), ext)
        if os.path.exists(os.path.join(dir_, file_name)):
            files.append(file_name)
        date += datetime.timedelta(days=1)
    
    return files

def load_week_data(week_date):
    date = datetime.datetime.strptime(week_date, '%d/%m/%Y')
    date = date.replace(tzinfo=pytz.timezone(TIMEZONE))

    files = get_files_from_date(DATA_DIR, 'corona', date)
    files = [os.path.join(DATA_DIR, f) for f in files]
    if not len(files):
        return

    df = pd.concat([pd.read_csv(f, sep=';') for f in files], ignore_index=True)
    df.rename(
        columns={
            'user.screen_name': 'user',
            'retweeted_status.user.screen_name': 'retweeted_user',
            'complete_text': 'text'
        },
        inplace=True
    )

    df = df[df['lang'] == 'pt']
    df['date'] = pd.to_datetime(df['created_at'], format='%a %b %d %H:%M:%S %z %Y')
    df['date'] = df['date'].dt.tz_convert(TIMEZONE)
    df = df[df['date'] >= date].reset_index(drop=True)

    return df

In [5]:
df = load_week_data('16/08/2020')

print(df.shape)
print(df.head())

(691397, 6)
                       created_at       user   retweeted_user  \
0  Sun Aug 16 03:06:01 +0000 2020  caiocozta              NaN   
1  Sun Aug 16 03:06:02 +0000 2020     _gabge              NaN   
2  Sun Aug 16 03:06:02 +0000 2020  coronna__      lemes_kaiky   
3  Sun Aug 16 03:06:03 +0000 2020  sabrrinny  carol_bpinheiro   
4  Sun Aug 16 03:06:03 +0000 2020  _licemelo  carol_bpinheiro   

                                                text lang  \
0              @euotrouxa nem o corona        triste   pt   
1  Soros produzidos por cavalos têm anticorpos po...   pt   
2  Ultimamente eu só quero a vacina do corona é v...   pt   
3  eu🤡achando🤡que🤡o🤡corona🤡ia🤡 acabar🤡antes🤡do🤡 m...   pt   
4  eu🤡achando🤡que🤡o🤡corona🤡ia🤡 acabar🤡antes🤡do🤡 m...   pt   

                       date  
0 2020-08-16 00:06:01-03:00  
1 2020-08-16 00:06:02-03:00  
2 2020-08-16 00:06:02-03:00  
3 2020-08-16 00:06:03-03:00  
4 2020-08-16 00:06:03-03:00  


# Text functions

In [26]:
def remove_url(string):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', string)

def remove_puctuation(string):
    for p in punctuation:
        string = string.replace(p, '')
        
def remove_stop_words(words):
    return [word for word in words if not word in STOP_WORDS]

def wrap_text(text, wrap_width=100, sep='\n'):
    return sep.join(wrap(text, wrap_width))

# Interactive plots

In [7]:
def plot_line(x, y, **kwargs):
    fig = go.Figure(go.Scatter(x=x, y=y))

    fig.update_layout(
        xaxis_tickformat='%d/%m/%y',
        yaxis_rangemode='tozero',
        yaxis_ticksuffix=' mil',
        hovermode='x',
        **kwargs
    )

    return fig

def plot_bar(x, y, **kwargs):
    fig = go.Figure(go.Bar(x=x, y=y))

    fig.update_layout(
        hovermode='x',
        **kwargs
    )

    return fig

def export_html(df, figures):
    dates = df['date'].sort_values().iloc[[0, -1]].to_numpy()
    file_name = '{}.html'.format(dates[0].strftime('%y%m%d'))

    if dates[0].day != dates[1].day:
        periodo = '{} a {}'.format(dates[0].strftime('%d/%m/%Y'), dates[1].strftime('%d/%m/%Y'))
    else:
        periodo = '{}'.format(dates[0].strftime('%d/%m/%Y'))

    with open(os.path.join(BULLETIN_DIR, file_name), 'w') as f:
        f.write('<h1>Covid-19 Twitter {}</h1>'.format(periodo))
        f.write('<h2>Total de tweets coletados: {}</h2>'.format(df.shape[0]))
        for fig in figures:
            f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))

    with open(os.path.join(SITE_DIR, 'index.html'), 'w') as f:
        f.write('<h1>Lista de boletins da Covid-19 no Twitter</h1>')

        for file_ in np.sort(os.listdir(BULLETIN_DIR)):
            with open(os.path.join(BULLETIN_DIR, file_)) as b:
                period = b.read().split('<h1>Covid-19 Twitter ')[1].split('</h1>')[0]
                f.write('<a href="boletins/{}">{}</a><br>'.format(file_, period))

In [8]:
def load_data_generate_html(week_dates):
    for week_date in week_dates:
        date = datetime.datetime.strptime(week_date, '%d/%m/%Y')
        date = date.replace(tzinfo=pytz.timezone(TIMEZONE))

        files = get_files_from_date(DATA_DIR, 'corona', date)
        files = [os.path.join(DATA_DIR, f) for f in files]
        if not len(files):
            continue

        df = pd.concat([pd.read_csv(f, sep=';') for f in files], ignore_index=True)
        df.rename(
            columns={
                'user.screen_name': 'user',
                'retweeted_status.user.screen_name': 'retweeted_user',
                'complete_text': 'text'
            },
            inplace=True
        )

        df = df[df['lang'] == 'pt']
        df['date'] = pd.to_datetime(df['created_at'], format='%a %b %d %H:%M:%S %z %Y')
        df['date'] = df['date'].dt.tz_convert(TIMEZONE)
        df = df[df['date'] >= date].reset_index(drop=True)

        date_series = df.groupby(df['date'].dt.date)['text'].count()
        tweet_dia = plot_line(
            date_series.index,
            date_series.values / 1000,
            title='Tweets por dia',
            xaxis_title='Dia',
            yaxis_title='Quantidade de Tweets',
        )

        top_tweet = df['text'].value_counts(ascending=False).index[0]
        top_tweet_series = df[df['text'] == top_tweet].groupby(df['date'].dt.date)['lang'].count()[:10]
        percurso = plot_line(
            top_tweet_series.index,
            top_tweet_series.values / 1000,
            title='Percurso top tweet',
            xaxis_title='Dia<br>Tweet: "{}"'.format(wrap_text(top_tweet, 100, '<br>')),
            yaxis_title='Quantidade de Retweets',
        )

        text_series = df['text'].value_counts(ascending=False)[:10]
        tweets_populares = plot_bar(
            [wrap_text(text, 50, '<br>') for text in text_series.index],
            text_series.values / 1000,
            title='Tweets mais populares',
            xaxis_title='Tweet',
            xaxis_showticklabels=False,
            yaxis_title='Quantidade de registros',
            yaxis_ticksuffix=' mil',
        )

        rt_user_series = df['retweeted_user'].dropna().value_counts(ascending=False)[:10]
        perfis_retweetados = plot_bar(
            rt_user_series.index,
            rt_user_series.values / 1000,
            title='Perfis mais retweetados',
            xaxis_title='Perfis',
            yaxis_title='Quantidade de Retweets',
            yaxis_ticksuffix=' mil',
        )

        user_series = df['user'].dropna().value_counts(ascending=False)[:10]
        perfis_ativos = plot_bar(
            user_series.index,
            user_series.values,
            title='Perfis mais ativos',
            xaxis_title='Perfis',
            yaxis_title='Quantidade de registros',
        )

        export_html(df, [
            tweet_dia, tweets_populares, percurso, perfis_retweetados,
            perfis_ativos
        ])

# Graph plot

In [84]:
import networkx as nx
from sklearn.feature_extraction.text import CountVectorizer

def bag_of_words(series, max_features=10):
    series = series.apply(remove_url)
    vet = CountVectorizer(
        max_features=max_features,
        stop_words=STOP_WORDS
    )
    bow = vet.fit_transform(series)
    
    return bow, vet.get_feature_names()

def spmatrix_to_df(matrix, columns, index):
    return pd.DataFrame.sparse.from_spmatrix(matrix, columns=columns, index=index)
    
def bigrams(series):
    return series.apply(tokenize).apply(nltk.bigrams).apply(list).explode()

def tokenize(string):
    string = remove_url(string)
    return remove_stop_words(nltk.word_tokenize(string))

def text_to_graph(text_series, max_features=10):
    X, features = bag_of_words(text_series, max_features=max_features)
    adj_matrix = X.T * X
    graph_df = spmatrix_to_df(adj_matrix, features, features)
    
    return nx.draw_networkx(nx.convert_matrix.from_pandas_adjacency(graph_df))

def plot_graph(corpus, node_size_factor=30, n_nodes=15, **kwargs):
    X, features = bag_of_words(corpus, max_features=n_nodes)
    adj_matrix = X.T * X
    adj_matrix.setdiag(0)
    adj_matrix = adj_matrix / adj_matrix.max()
    graph_df = spmatrix_to_df(adj_matrix, features, features)
    g = nx.convert_matrix.from_pandas_adjacency(graph_df)

    pos = nx.spring_layout(g)
    xn, yn = np.array(list(pos.values())).T
    xe = []
    ye = []
    for e in g.edges:
        xe += [pos[e[0]][0], pos[e[1]][0], None]
        ye += [pos[e[0]][1], pos[e[1]][1], None]

    node_size = adj_matrix.sum(axis=1) * node_size_factor
    axis = {'showline': True, 'zeroline': False, 'showgrid': False, 'showticklabels': False, 'title': ''}

    fig = go.Figure(
        data=[
            go.Scatter(
                x=xn, y=yn,
                mode='markers+text', marker_size=node_size,
                text=list(g.nodes), textposition="middle center",
                hoverinfo='none', #TODO: number of records
            ),
            go.Scatter(x=xe, y=ye, mode='lines', line_color='rgba(151, 151, 151, 0.2)', hoverinfo='none')
        ],
        layout=go.Layout(xaxis=axis, yaxis=axis, showlegend=False, **kwargs)
    )
    
    return fig

In [10]:
text = df['text'].dropna().str.lower()
word_list = text.str.split().apply(remove_stop_words)
single_word_list = text.drop_duplicates().str.split().apply(remove_stop_words)

## Feature extraction

In [85]:
plot_graph(text, n_nodes=15, title='Conexão das palavras')

## bigrams

In [49]:
bigrams = bigrams(df['text'])