In [None]:
import os
import re
import time
import requests
import itertools
import numpy as np
import pandas as pd

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
%matplotlib inline

# The star import is presumably what brings Graph (used by build_graph) into scope.
from graphviz import *
# Append the Graphviz binaries folder to PATH for this process.
# NOTE(review): machine-specific Windows install path — confirm/adjust on other machines.
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'

# NOTE(review): newer IPython releases expose display/HTML from IPython.display — confirm this import still works.
from IPython.core.display import display, HTML
# Inject a CSS rule that makes the notebook container use the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))


In [None]:
def load_dfs(word_list, date):
    """Load one CSV per search word and stack them into a single DataFrame.

    Each file is read from ``<date>/<word>.csv`` and every row is tagged with
    the word it was loaded for in a new ``word`` column.

    Parameters
    ----------
    word_list : iterable of str
        Search words; each one names a CSV file inside the ``date`` folder.
    date : str
        Folder that contains the CSV files.

    Returns
    -------
    pandas.DataFrame
        The rows of all files, in ``word_list`` order, with a fresh
        0..n-1 index. An empty DataFrame when ``word_list`` is empty.
    """
    frames = []
    for word in word_list:
        # os.path.join yields a separator that is valid on every OS
        # (the former hand-written '\/' only resolved on Windows).
        dfaux = pd.read_csv(os.path.join(date, word + '.csv'))
        dfaux['word'] = word
        frames.append(dfaux)

    # pd.concat raises on an empty list, so keep the "nothing to load" case explicit.
    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of growing the frame inside the loop
    # (DataFrame.append no longer exists in pandas >= 2.0).
    return pd.concat(frames).reset_index(drop=True)


def get_hashtag(string):
    """Return every hashtag found in ``string``, in order of appearance.

    A hashtag is a ``#`` immediately followed by one or more word characters.
    """
    hashtag_pattern = re.compile(r'\#\w+')
    return hashtag_pattern.findall(string)


def get_intersection(lst1, lst2):
    """Return the distinct elements present in both iterables as a list.

    The order of the returned list is that of the underlying set and is
    therefore not guaranteed.
    """
    first, second = set(lst1), set(lst2)
    return list(first.intersection(second))


def build_graph(word_list, edges, filename, directed=False):
    """Build a Graphviz graph from a list of nodes and a list of edges.

    Parameters
    ----------
    word_list : iterable of str
        Node names; one node is created per entry.
    edges : iterable of sequences
        Each item supplies the two endpoints of an edge as ``edge[0]`` and
        ``edge[1]``.
    filename : str
        Base name of the Graphviz source file; ``'.gv'`` is appended.
    directed : bool, optional
        When True a ``Digraph`` is built, so the order of the endpoints is
        drawn as an arrow. The default (False) builds an undirected ``Graph``,
        which is what this function always did before.

    Returns
    -------
    Graph or Digraph
        The graph object, configured with the ``sfdp`` engine and ``png`` format.
    """
    graph_cls = Digraph if directed else Graph
    g = graph_cls('G', filename=filename + '.gv', engine='sfdp', format='png')

    for node in word_list:
        g.node(node)
    for edge in edges:
        # 'len' is forwarded to Graphviz as an edge attribute.
        g.edge(edge[0], edge[1], len='2.00')
    return g


def get_directional_edges(df, combos):
    """Orient each pair of search words by which one mentions the other more.

    For a pair ``(a, b)`` two counts are taken:
    rows loaded for ``a`` whose text contains ``'#' + b``, and rows loaded for
    ``b`` whose text contains ``'#' + a``. The edge starts at the word with
    the strictly larger count; on a tie the edge is ``(b, a)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'word'`` and ``'tweet.text'``.
    combos : iterable of sequences
        Pairs of words; only ``pair[0]`` and ``pair[1]`` are used.

    Returns
    -------
    list of tuple
        One ``(source, target)`` tuple per pair, in the order of ``combos``.
    """
    words = df['word']
    texts = df['tweet.text']

    def count_mentions(source, target):
        # regex=False: the word is matched literally, so characters such as
        # '.' or '+' in a word cannot alter (or break) the match.
        # na=False: rows without usable text simply do not count.
        mentions = texts.str.contains('#' + target, regex=False, na=False)
        return int(((words == source) & mentions).sum())

    edges = []
    for pair in combos:
        len1 = count_mentions(pair[0], pair[1])
        len2 = count_mentions(pair[1], pair[0])
        if len1 > len2:
            edges.append((pair[0], pair[1]))
        else:
            edges.append((pair[1], pair[0]))
    return edges


def get_nodes_edges_treshold(df, nnodes, threshold):
    """Pick the most used hashtags as nodes and connect co-occurring ones.

    Nodes are the ``nnodes`` most frequent hashtags found in
    ``df['tweet.text']``. For every node, the rows whose text contains that
    hashtag are inspected; each other top hashtag ``a`` appearing in those
    rows yields the edge ``(node, a)`` provided that the number of rows of the
    whole frame whose text contains ``a`` is greater than ``threshold`` and
    the pair is not already connected in either direction.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``'tweet.text'`` column of strings.
    nnodes : int
        How many of the most frequent hashtags to keep as nodes.
    threshold : int
        Minimum (exclusive) number of rows containing the second hashtag.

    Returns
    -------
    (list of str, list of tuple)
        Node names and edges, both with the leading ``#`` stripped.
        Edges are listed in node-frequency order.
    """
    dfaux = df.copy()
    dfaux['#'] = dfaux['tweet.text'].apply(get_hashtag)
    all_hashtags = list(itertools.chain.from_iterable(dfaux['#']))
    merged = pd.Series(all_hashtags).value_counts().iloc[0:nnodes]
    top_hashtags = list(merged.keys())

    # Scan the text once per hashtag; both the row filter and the threshold
    # count below reuse these masks instead of re-scanning inside the loops.
    # Hashtags are '#' plus word characters, so a literal match is equivalent.
    contains = {
        tag: dfaux['tweet.text'].str.contains(tag, regex=False)
        for tag in top_hashtags
    }
    n_rows_with = {tag: int(mask.sum()) for tag, mask in contains.items()}

    edges = []
    connected = set()  # unordered pairs that already have an edge
    for node in top_hashtags:
        # hashtags present in the rows whose text contains this node
        cooccurring = set(itertools.chain.from_iterable(dfaux.loc[contains[node], '#']))
        # Walking top_hashtags (not the set) keeps the edge order reproducible.
        for a in top_hashtags:
            if a == node or a not in cooccurring:
                continue
            pair = frozenset((node[1:], a[1:]))
            if pair in connected:
                continue
            if n_rows_with[a] > threshold:
                edges.append((node[1:], a[1:]))
                connected.add(pair)

    nodes = [x[1:] for x in top_hashtags]
    return nodes, edges


def clean_url(url, timeout=10):
    """Resolve a shortened ``.ly/`` link to the URL it finally lands on.

    URLs that do not contain ``'.ly/'`` are returned untouched and no request
    is made. Resolution is best effort: if the request fails, the original
    URL is returned.

    Parameters
    ----------
    url : str
        The URL to inspect.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10), so a
        single unresponsive shortener cannot block the notebook forever.

    Returns
    -------
    str
        The final URL after the request, or ``url`` itself.
    """
    if '.ly/' not in url:
        return url
    try:
        return requests.get(url, timeout=timeout).url
    except requests.RequestException:
        # Network/HTTP-level failure: fall back to the unresolved link.
        # Unrelated errors (and KeyboardInterrupt) are no longer swallowed.
        return url


def get_list_of_hashtags(df):
    """Return the distinct hashtags of ``df['tweet.text']``, most frequent first.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``'tweet.text'`` column of strings. It is not modified.

    Returns
    -------
    pandas.Index
        Hashtags (including the leading ``#``) ordered by decreasing number
        of occurrences.
    """
    # Apply on the column itself: no copy of the whole frame is needed and an
    # empty frame yields an empty result instead of failing.
    tags_per_tweet = df['tweet.text'].apply(get_hashtag)
    merged = pd.Series(list(itertools.chain.from_iterable(tags_per_tweet))).value_counts()
    return merged.keys()


# Graph with words in "semantic circle"

In [None]:
# Folder holding the CSV files, and the search words to relate to each other.
date = '2019-10-15'
word_list = ['furniture', 'homedecor', 'interiordesign']

# Every unordered pair of words is a candidate edge.
combos = [pair for pair in itertools.combinations(word_list, 2)]

# Load the tweets of all words; missing values are replaced by 0.
df_tweets = load_dfs(word_list, date)
df_tweets = df_tweets.fillna(0)

edges = get_directional_edges(df_tweets, combos)
g = build_graph(word_list, edges, 'test_0.1')

# Last expression of the cell, so the graph is rendered inline.
# (g.view() would open it in an external viewer instead.)
g

# Graph with the most frequent hashtags

In [None]:
# Number of hashtags to use as nodes, and minimum number of tweets a hashtag
# needs in order to be linked.
nnodes    = 10
threshold = 1000
word_list = ['furniture', 'homedecor', 'interiordesign']

df_tweets = load_dfs(word_list, date).fillna(0)
# keep=False removes every tweet whose id occurs more than once.
# NOTE(review): the word-cloud and frequency cells instead keep only the
# repeated occurrences (duplicated()) — confirm which filter is intended.
# Assigning the result (rather than inplace=True) keeps the cell re-runnable.
df_tweets = df_tweets.drop_duplicates(subset=['tweet.id'], keep=False)

nodes, edges = get_nodes_edges_treshold(df_tweets, nnodes, threshold)
g            = build_graph(nodes, edges, 'test_0.2')

# Render the graph and open it in the default viewer.
g.view()

# Cloud of words with most frequent words in tweets' texts

In [None]:
# Word cloud recipe taken from: https://www.datacamp.com/community/tutorials/wordcloud-python
word_list = ['furniture', 'homedecor', 'interiordesign']
df_tweets = load_dfs(word_list, date).fillna(0)

# duplicated() flags the second and later rows of each tweet id; only those rows are kept.
# NOTE(review): confirm that restricting to repeated tweets is intended.
is_repeat = df_tweets.duplicated(subset=['tweet.id'])
df_tweets = df_tweets[is_repeat].copy()

# One long string with all tweet texts, separated by spaces.
text = " ".join(df_tweets['tweet.text'])

stopwords = set(STOPWORDS)
cloud_options = dict(
    stopwords=stopwords,
    max_font_size=50,
    max_words=100,
    background_color="white",
)
wordcloud = WordCloud(**cloud_options).generate(text)

plt.figure(figsize=(15, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Write the word cloud built in the previous cell to a PNG file (relative path, so it lands in the working directory).
wordcloud.to_file("first_words_cloud.png")

# Hourly frequency of the most frequent hashtags

In [None]:
word_list = ['furniture', 'homedecor', 'interiordesign']
df_tweets = load_dfs(word_list, date).fillna(0)
# duplicated() flags the second and later rows of each tweet id; only those rows are kept.
# NOTE(review): confirm that restricting to repeated tweets is intended.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the loaded data.
df_tweets = df_tweets[df_tweets.duplicated(subset=['tweet.id'])].copy()
df_tweets['tweet.created_at'] = pd.to_datetime(df_tweets['tweet.created_at'])
# Hour of the day (0-23) in which each tweet was created.
df_tweets['time_buckets'] = df_tweets['tweet.created_at'].dt.hour

# Count, for each of the most frequent hashtags, the tweets per hour of the day.
nhashtags    = 10
hashtag_list = get_list_of_hashtags(df_tweets)
top_hashtags = hashtag_list[0:nhashtags]

rows = []
for hashtag in top_hashtags:
    # The text scan does not depend on the hour, so do it once per hashtag.
    has_hashtag = df_tweets['tweet.text'].str.contains(hashtag)
    for h in range(24):
        aux = df_tweets[(df_tweets['time_buckets'] == h) & has_hashtag]
        rows.append([hashtag, h, len(aux)])

df_freq = pd.DataFrame(rows, columns=['hashtag', 'hour', 'frequency'])

plt.figure(figsize=(15,10))

# Plot exactly the hashtags that were counted above. (The loop used to read
# `merged`, a name that only exists inside the helper functions, which raised
# a NameError on a fresh kernel.)
for hashtag in top_hashtags:
    plt.plot(np.arange(24), df_freq[df_freq['hashtag'] == hashtag]['frequency'], label=hashtag)
plt.ylabel('# of Tweets with hashtag')
plt.xlabel('Time of day')
plt.legend()
plt.show()