# Fitting rank-frequency distribution of Hashtags

In this notebook we fit the rank-frequency distribution of hashtags with the 8 generalizations of Zipf's law described in Section III of the paper.

We fit the models using our original Zenodo data set and the cumulative data sets in the "Data" folder (obtain these by running the aggregate_data notebook). We write out our fitted results such as parameter estimates and log likelihoods to the "output" folder.

In [1]:
import os, sys, codecs
import numpy as np
import pylab as plt
%matplotlib inline
%load_ext autoreload
%autoreload 2

import matplotlib.cm as cm
import gzip
import pandas as pd

In [2]:
# Make the sibling `src` directory (one level above the working directory) importable.
sys.path.append(os.path.abspath(os.path.join(os.pardir,'src')))

# `fit` is the routine called once per model in the fitting cells below.
from modules_distributor import fit
# NOTE(review): this wildcard import hides which names are actually used.
# `num_of_files` (used in the cumulative-data section) is not defined anywhere
# in this notebook and presumably comes from here — confirm.
from general import *

In [3]:
# Directory prefix for every results file written by the cells below.
path_out = '../output/'

In [4]:
# Names of the models that are passed, one at a time, to `fit` in the cells below.
# See paper for exact details regarding the models
models = ['simple', 'double_powerlaw', 'lognormal', 'naranan', 'expcutoff', 'weibull', 'shifted', 'double_2gammas']

# Small dataset 
(1st day, only Hashtags with more than 100 appearances)

In [5]:
path = "../Data/"
fout_name = "combined_data_firstDay.csv" # name of the input file read in this section (small, first-day data set)

# Read the file inside a context manager so the handle is closed even if
# reading fails.
with open(path + fout_name, "r") as f:
    contents = f.read()

# The second comma-separated field of every line is parsed as an integer count.
# Blank lines (e.g. the empty string left after a trailing newline) are
# skipped, so a file without a final newline no longer loses its last record
# and a stray empty line no longer raises an IndexError.
all_counts = [
    int(line.split(",")[1])
    for line in contents.split("\n")
    if line.strip()
]

Fitting the different models to the small dataset, output is:

([model parameters], -log(likelihood), number of successful fits)

In [7]:
nrep = 10 # number of repetitions

# Run `fit` once per model on the small data set and show the model name
# followed, on the next line, by whatever `fit` returned.
for model in models:
    res = fit(counts = all_counts, model = model, nrep = nrep)
    print(model, res, sep="\n")


simple
([1.1533865383467519], 8.846853420042262, 10)


  C = zeta_minmax(gamma1,kmin,km+1) + km**(gamma2-gamma1)*zeta_minmax(gamma2,km+1,kmax)
  C = zeta_minmax(gamma1,kmin,km+1) + km**(gamma2-gamma1)*zeta_minmax(gamma2,km+1,kmax)


double_powerlaw
([75.76997432617122, 13265.578857606195], 8.280642728782166, 10)
lognormal
([5.948848643283158, 2.401872522607375], 8.249867348529552, 10)
naranan
([1.2519828071518573, 5.907264755700497], 8.558211699831606, 10)
expcutoff
([0.6872449468460942, 0.00016133186370129277], 8.156616332700626, 10)
weibull
([-0.30815327144784604, 3.795951196760681], 9.465460836853136, 10)
shifted
([1.6078258074448946, 180.78218712386004], 8.34313356097208, 10)
double_2gammas
([0.8080437065556514, 24.093825188343484, 12430.672293067993], 8.145755505287767, 10)


# Combined data for all 392 days

In [None]:
path = "../Data/"
fout_name = "combined_data.csv.gz" # name of the gzip-compressed input file that contains combined data for 392 days

# The ".gz" file has to be decompressed while reading: a plain open(..., "r")
# would try to decode the raw compressed bytes as text. Mode "rt" returns str,
# which is what the split below expects.
# NOTE(review): assumes the file really is gzip-compressed, as its extension
# indicates — confirm against the aggregate_data notebook that writes it.
with gzip.open(path + fout_name, "rt") as f:
    contents = f.read()

res_list_temp = contents.split("\n")

In [None]:
# Parse the integer count (second comma-separated field) of every line in
# `res_list_temp`. Blank entries are skipped explicitly instead of assuming
# that exactly the last element is empty, so the last record is kept when the
# input has no trailing newline and empty lines cannot raise an IndexError.
all_counts = [
    int(line.split(",")[1])
    for line in res_list_temp
    if line.strip()
]

The output shown here corresponds to the fitted results after a single repetition, i.e. `nrep = 1`. Note that the cell below is configured with `nrep = 10`.

In [None]:
nrep = 10 # number of repetitions

# Make sure the output directory exists before opening a file inside it.
os.makedirs(path_out, exist_ok=True)

# Writing results to a file. The context manager closes (and flushes) the
# file even if one of the fits raises part-way through the loop.
with open(path_out + "combined_data" + "-nrep" + str(nrep) + ".txt", "w") as fout:
    for model in models:
        res = fit(model = model, counts = all_counts, nrep = nrep)
        print(model + "\n" + str(res))
        # One line per model, in the order of `models`.
        fout.write(str(res) + "\n")

# Cumulative data

Consecutive days in temporal order. Eg: 2 days in a row, 7 days in a row.

In [None]:
path_in = '../Data/Cumulative/day/'
folder_out = 'Cumulative/Zipf_fitted/'

# IMPORTANT: These need to match the given_intervals in aggregate_data.ipynb
# Powers of two from 2**0 up to 2**8 ...
given_intervals_temp = np.power(2, np.arange(9))
# ... framed by 0 at the front and `num_of_files` at the end.
given_intervals = np.concatenate(
    ([0], given_intervals_temp, [num_of_files])
)

In [None]:
# Create the output folder if it does not exist yet; exist_ok makes the
# separate existence check unnecessary.
os.makedirs(path_out + folder_out, exist_ok=True)

nrep = 10 # number of repetitions (identical for every interval, so set once)

# One input file and one output file per consecutive pair in `given_intervals`.
for i in range(len(given_intervals)-1):
    f_in_name = 'data_' + str(i) +  '_cum' # name of the input file (contains the data)

    # Context manager guarantees the input file is closed even if reading fails.
    with open(path_in + f_in_name + ".txt", "r") as f:
        contents = f.read()

    hashtags_temp = []
    counts_temp = []

    # Every non-blank line is split on ","; field 0 is stored as the hashtag
    # and field 1 is parsed as its integer count. Skipping blank lines keeps
    # the last record when the file has no trailing newline.
    for line in contents.split("\n"):
        if not line.strip():
            continue
        split_result = line.split(",")
        hashtags_temp.append(split_result[0])
        counts_temp.append(int(split_result[1]))

    # Model fitting and writing results to a file. The output file is closed
    # (and flushed) even if a fit raises part-way through.
    with open(path_out + folder_out + "fit" + str(i) + "_cum" +  "-nrep" + str(nrep) + ".txt", "w") as fout:
        for model in models:
            res = fit(model = model, counts = counts_temp, nrep = nrep)
            fout.write(str(res) + "\n")

This procedure is easily extended to fitting the models for every day, hour or minute. We can use data from the original Zenodo database and from the "Data" folder.