# Aggregate usage of Hashtags

Using our Zenodo data set, we combine files to accumulate data over all 392 days and over smaller subintervals. We combine over smaller intervals both in the temporal order of days and in random order. The shuffled cumulative data is used in the Heaps' and Taylor's law analysis (Section IV) of the paper. Accumulated data and randomised/shuffled data will be stored in a new folder called "Data".

In [None]:
import os, sys, codecs
import numpy as np
import pylab as plt
%matplotlib inline
%load_ext autoreload
%autoreload 2

import matplotlib.cm as cm
import gzip
import pandas as pd
import random

In [None]:
sys.path.append(os.path.abspath(os.path.join(os.pardir,'src')))

from modules_distributor import fit
from general import *

In [None]:
# Input folders holding the hashtag frequency files at day, hour and minute resolution
day_path, hour_path, min_path = (
    "../Data/hashtags_frequency_day/",
    "../Data/hashtags_frequency_hour/",
    "../Data/hashtags_frequency_minutes/",
)

In [None]:
# Getting all the zipped day files and counting them
list_of_files = get_zipped_files(day_path)
num_of_files = len(list_of_files)

# Combining the hashtag data from all 392 days in our data set

In [None]:
# Total count of every hashtag, summed over all day files
data_dict = {}

for file_name in list_of_files:
    hashtag_dict = get_data(day_path, file_name, "day")

    for hashtag, count in hashtag_dict.items():
        # If the hashtag occurred on a previous day we add on the count;
        # the first time it occurs the running total starts from 0.
        data_dict[hashtag] = data_dict.get(hashtag, 0) + count

Writing out results to a file

In [None]:
path_out = '../Data/'
fout_name = 'combined_data4'  # the name of the output file

# create folder if it doesn't already exist
os.makedirs(path_out, exist_ok=True)

# The ".gz" suffix promises gzip content, so the file is written through gzip in
# text mode; the context manager closes it even if a write fails.
with gzip.open(path_out + fout_name + ".csv.gz", "wt") as fout:
    for hashtag, count in data_dict.items():
        fout.write(str(hashtag) + "," + str(count) + "\n")

# Combining data (cumulative data)

Here we gradually accumulate the data, starting from just the first day, then the first two days, the first four days, the first eight days, and so on. In general, we take the first 2^n days, up until our entire data set of 392 days/files.

In [None]:
# The intervals at which we accumulate: the first 1, 2, 4, ..., 256 files and finally
# all files. The intervals can be refined, e.g.
# first 1 day -> first 2 days -> first 3 days -> ... -> first n days,
# but this will be more expensive in computation and in storage.
given_intervals_temp = 2 ** np.arange(9)
given_intervals = np.concatenate(([0], given_intervals_temp, [num_of_files]))

# Results are exported to files so the accumulation does not have to be run again;
# the target folder is created if it doesn't exist yet.
path_out = '../Data/Cumulative/day/'
if not os.path.exists(path_out):
    os.makedirs(path_out)

In [None]:
def get_cum_data(path_in, files, time_format, given_intervals, start, path_out):
    """Accumulate hashtag counts over growing windows of files and save each stage.

    Starting at file index ``start``, stage ``i`` adds the counts of the files
    ``files[start + given_intervals[i] : start + given_intervals[i + 1]]`` to a
    running total, so after stage ``i`` the total covers the first
    ``given_intervals[i + 1]`` files counted from ``start``. After every stage
    the running total is written to ``path_out`` as
    ``data_<i>_cum_start_<start>.txt`` with one ``hashtag,count`` line per
    hashtag (the layout of the Zenodo data set).

    Args:
        path_in: Folder holding the input files; passed on to ``get_data``.
        files: Sequence of input file names, in the order they are accumulated.
        time_format: Time resolution of the files ("day", "hour" or "min" in
            the calls in this notebook); passed on to ``get_data``.
        given_intervals: Increasing cumulative file counts, e.g. ``[0, 1, 2, 4]``.
        start: Index into ``files`` of the first file to accumulate; also part
            of the output file names.
        path_out: Existing output folder, including the trailing separator.
    """
    data_dict = {}
    num_files = len(files)

    for i in range(len(given_intervals) - 1):
        # Only the files that are new in this stage are read: everything before
        # them is already contained in data_dict from the previous stages.
        # The window is clamped so it never runs past the end of `files`.
        first = min(start + int(given_intervals[i]), num_files)
        last = min(start + int(given_intervals[i + 1]), num_files)

        # Combining hashtags and counts (cumulatively)
        for index in range(first, last):
            # get_data is expected to return a dict mapping hashtag -> count
            hashtag_dict = get_data(path_in, files[index], time_format)
            for hashtag, count in hashtag_dict.items():
                # add to the previous total, or start from 0 for a new hashtag
                data_dict[hashtag] = data_dict.get(hashtag, 0) + count

        # writing results to desired folder and file
        fout_name = 'data_' + str(i) + '_cum' + "_start_" + str(start)
        with open(path_out + fout_name + ".txt", "w") as fout:
            for hashtag, count in data_dict.items():
                fout.write(str(hashtag) + "," + str(count) + "\n")

In [None]:
# Accumulate from several start points: every 40th file, skipping index 0 and
# keeping at most nine of them
start_list = list(range(0, num_of_files - 16, 40)[1:10])

for start in start_list:
    get_cum_data(
        path_in=day_path,
        files=list_of_files,
        time_format="day",
        given_intervals=given_intervals,
        start=start,
        path_out=path_out,
    )

Finding type and token count using cumulative data

In [None]:
def count_types_tokens(given_intervals, start, path_in, path_out):
    """Count hashtag types and tokens in each cumulative file of one run.

    For every stage ``i`` in ``range(len(given_intervals) - 1)`` the file
    ``data_<i>_cum_start_<start>.txt`` in ``path_in`` is read; every non-empty
    line is expected to have the form ``hashtag,count``. One line
    ``types,tokens`` per stage is written to
    ``types_tokens_count_start_<start>.txt`` in ``path_out``, where ``types`` is
    the number of hashtag lines and ``tokens`` is the sum of their counts.

    Args:
        given_intervals: The intervals used when accumulating; only its length
            is used here, to know how many stage files exist.
        start: Start point of the run; part of the input and output file names.
        path_in: Folder containing the cumulative data files.
        path_out: Folder for the result file; created if it does not exist.
    """
    os.makedirs(path_out, exist_ok=True)
    out_name = "types_tokens_count_start_" + str(start) + ".txt"

    with open(os.path.join(path_out, out_name), "w") as fout:
        for i in range(len(given_intervals) - 1):
            # The name must match the files written by get_cum_data, which carry
            # the start point as a suffix.
            f_in_name = 'data_' + str(i) + '_cum' + "_start_" + str(start) + ".txt"

            num_types = 0
            num_tokens = 0
            with open(os.path.join(path_in, f_in_name), "r") as f:
                for line in f:
                    line = line.rstrip("\n")
                    if not line:
                        continue  # blank lines cannot be split by ","
                    # Split on the last comma: the count is always the final field.
                    num_tokens += int(line.rsplit(",", 1)[1])
                    num_types += 1

            fout.write(str(num_types) + "," + str(num_tokens) + "\n")

In [None]:
# Note: it would be more efficient to count types and tokens when we are actually accumulating above
# The intervals are rebuilt from num_of_files (rather than a hard-coded 392) so they
# stay identical to the ones used when the cumulative day files were written.
given_intervals_temp = np.power(2, [0, 1, 2, 3, 4, 5, 6, 7, 8])
given_intervals = np.concatenate(([0], given_intervals_temp, [num_of_files]))

path_in = '../Data/Cumulative/day/'
path_out = '../output/Cumulative/day/'
for start in start_list:
    count_types_tokens(given_intervals, start, path_in, path_out)

# Cumulative (minutes)

Accumulating using our minute data in temporal order. We start from various different initial points and compute the mean and standard deviation. Will be used for Taylor's and Heaps' Law plots.

In [None]:
# Output folder for the cumulative minute data, created on first use
path_out = '../Data/Cumulative/min/'
if not os.path.exists(path_out):
    os.makedirs(path_out)

# Accumulate over the first 1, 2, 4, ..., 512 minute files
given_intervals = np.concatenate(([0], 2 ** np.arange(10)))

# Getting all the zipped files
list_of_files = get_zipped_files(min_path)
num_of_files = len(list_of_files)

# change this to vary the start point
start_list = list(range(3000, num_of_files, 3000))

for start in start_list:
    get_cum_data(
        path_in=min_path,
        files=list_of_files,
        time_format="min",
        given_intervals=given_intervals,
        start=start,
        path_out=path_out,
    )

Finding type and token count using cumulative data

In [None]:
# Note: counting types and tokens while accumulating above would be more efficient
path_in, path_out = '../Data/Cumulative/min/', '../output/Cumulative/min/'

for start in start_list:
    count_types_tokens(
        given_intervals=given_intervals,
        start=start,
        path_in=path_in,
        path_out=path_out,
    )

# Cumulative (hours)

In [None]:
path_out = '../Data/Cumulative/hour/'

if not os.path.exists(path_out):
    os.makedirs(path_out)

# Accumulate over the first 1, 2, 4, ..., 128 hour files
given_intervals = np.concatenate(([0], np.power(2, [0, 1, 2, 3, 4, 5, 6, 7])))

# Getting all hour files
list_of_files = get_zipped_files(hour_path)
# Refresh the file count as well: the start points for the hour data are derived
# from num_of_files, which would otherwise still hold the count of an earlier data set.
num_of_files = len(list_of_files)

In [None]:
# Start points: every 1000th file index, beginning at 1000 and staying below num_of_files
start_list = list(range(1000, num_of_files, 1000)) #change this to vary the start point

# Write one set of cumulative files per start point
for start in start_list:
    get_cum_data(hour_path, list_of_files, "hour", given_intervals, start, path_out)

Finding type and token count using cumulative data

In [None]:
# Count hashtag types and tokens for every hourly start point
path_in, path_out = '../Data/Cumulative/hour/', '../output/Cumulative/hour/'

for start in start_list:
    count_types_tokens(
        given_intervals=given_intervals,
        start=start,
        path_in=path_in,
        path_out=path_out,
    )

# Randomised cumulative

Accumulating our data based on a randomised order. Used for Taylor's and Heaps' Law plots.

In [None]:
random.seed(2020) # fixed seed for reproducibility of the random draws below

# Randomised cumulative (minute)

In [None]:
# Output folder for the randomised cumulative minute data, created on first use
path_out = '../Data/Random_Cumulative/min/'
if not os.path.exists(path_out):
    os.makedirs(path_out)

# Accumulate over the first 1, 2, 4, ..., 2048 files
given_intervals = np.concatenate(([0], 2 ** np.arange(12)))

list_of_files = get_zipped_files(min_path)
num_of_files = len(list_of_files)

# number of different times we accumulate, used to find mean and SD
N = 10

In [None]:
for start in range(N):
    # Shuffle a copy of the file list so that every repetition accumulates the
    # files in a fresh random order (reproducible through the seed set above).
    # Passing the unshuffled list would accumulate in temporal order instead.
    shuffled_files = list(list_of_files)
    random.shuffle(shuffled_files)

    # `start` labels the output files of this repetition
    get_cum_data(min_path, shuffled_files, "min", given_intervals, start, path_out)

Finding type and token count using cumulative data

In [None]:
# Read the randomised cumulative minute data, write the counts to the output tree
path_in, path_out = '../Data/Random_Cumulative/min/', '../output/Random_Cumulative/min/'

# make sure the output folder is there
if not os.path.exists(path_out):
    os.makedirs(path_out)

In [None]:
# Count hashtag types and tokens for each of the N randomised accumulations
for start in range(N):
    count_types_tokens(given_intervals, start, path_in, path_out)

# Randomised cumulative (hour)

In [None]:
# Output folder for the randomised cumulative hour data, created on first use
path_out = '../Data/Random_Cumulative/hour/'
if not os.path.exists(path_out):
    os.makedirs(path_out)

# Accumulate over the first 1, 2, 4, ..., 1024 files
given_intervals = np.concatenate(([0], 2 ** np.arange(11)))

list_of_files = get_zipped_files(hour_path)
num_of_files = len(list_of_files)

# number of different times we accumulate, used to find mean and SD
N = 10

In [None]:
for start in range(N):
    # Shuffle a copy of the file list so that every repetition accumulates the
    # files in a fresh random order (reproducible through the seed set above).
    # Passing the unshuffled list would accumulate in temporal order instead.
    shuffled_files = list(list_of_files)
    random.shuffle(shuffled_files)

    # `start` labels the output files of this repetition
    get_cum_data(hour_path, shuffled_files, "hour", given_intervals, start, path_out)

Finding type and token count using cumulative data

In [None]:
# Read the randomised cumulative hour data, write the counts to the output tree
path_in, path_out = '../Data/Random_Cumulative/hour/', '../output/Random_Cumulative/hour/'

# make sure the output folder is there
if not os.path.exists(path_out):
    os.makedirs(path_out)

In [None]:
# Count hashtag types and tokens for each of the N randomised accumulations
for start in range(N):
    count_types_tokens(given_intervals, start, path_in, path_out)

# Randomised cumulative (day)

In [None]:
# Output folder for the randomised cumulative day data, created on first use
path_out = '../Data/Random_Cumulative/day/'
if not os.path.exists(path_out):
    os.makedirs(path_out)

# Accumulate over the first 1, 2, 4, ..., 256 files
given_intervals = np.concatenate(([0], 2 ** np.arange(9)))

list_of_files = get_zipped_files(day_path)
num_of_files = len(list_of_files)

# number of times we accumulate in order to find mean/SD
N = 10

In [None]:
for start in range(N):
    # Shuffle a copy of the file list so that every repetition accumulates the
    # files in a fresh random order (reproducible through the seed set above).
    # Passing the unshuffled list would accumulate in temporal order instead.
    shuffled_files = list(list_of_files)
    random.shuffle(shuffled_files)

    # `start` labels the output files of this repetition
    get_cum_data(day_path, shuffled_files, "day", given_intervals, start, path_out)

In [None]:
# Read the randomised cumulative day data, write the counts to the output tree
path_in, path_out = '../Data/Random_Cumulative/day/', '../output/Random_Cumulative/day/'

# make sure the output folder is there
if not os.path.exists(path_out):
    os.makedirs(path_out)

In [None]:
# Count hashtag types and tokens for each of the N randomised accumulations
for start in range(N):
    count_types_tokens(given_intervals, start, path_in, path_out)