### 39. Methodenseminar
## Big Data Module II: Introduction to Social Network Science with Python
# 3.2 Scale-Free Networks (Demo)
**Author**: <a href='https://www.gesis.org/person/haiko.lietz'>Haiko Lietz</a>, GESIS - Leibniz Institute for the Social Sciences

**Date**: 17 July 2019

**Library versions**: ``networkx`` 2.2 ([documentation](https://networkx.github.io/documentation/))

***
## Table of Contents
#### [3.2.1 Fitting Degree Distributions](#3_2_1)
#### [3.2.2 Alternative Distributions](#3_2_2)
#### [3.2.3 Plausibility of Power Law Fit](#3_2_3)
#### [3.2.4 Citation In Social Network Science](#3_2_4)
[3.2.4.1 Fitting the Citation Distribution](#3_2_4_1) |
[3.2.4.2 Measuring Preferential Attachment](#3_2_4_2)
***

### 3.2.1 Fitting Degree Distributions <a name='3_2_1'></a>

#### Erdős-Rényi Graph At Phase Transition

#### Barabási–Albert Model From Preferential Attachment

#### Estimation of Lower Cutoff

### 3.2.2 Alternative Distributions <a name='3_2_2'></a>

In [1]:
# Empirical PDF of fit_er as unconnected markers, overlaid with the fitted
# candidate distributions (drawn in the order listed below).
fit_er.plot_pdf(marker='o', ls='')
for _dist_attr, _dist_label in [
    ('exponential', 'Exponential'),
    ('stretched_exponential', 'Stretched Exponential'),
    ('lognormal_positive', 'Lognormal'),
    ('power_law', 'Power Law'),
    ('truncated_power_law', 'Truncated Power Law'),
]:
    getattr(fit_er, _dist_attr).plot_pdf(label=_dist_label)
plt.legend()

NameError: name 'fit_er' is not defined

In [None]:
# Empirical PDF of fit_ba as unconnected markers, overlaid with the fitted
# candidate distributions (drawn in the order listed below).
fit_ba.plot_pdf(marker='o', ls='')
for _dist_attr, _dist_label in [
    # ('exponential', 'Exponential'),  # left disabled, as in the original cell
    ('stretched_exponential', 'Stretched Exponential'),
    ('lognormal_positive', 'Lognormal'),
    ('power_law', 'Power Law'),
    ('truncated_power_law', 'Truncated Power Law'),
]:
    getattr(fit_ba, _dist_attr).plot_pdf(label=_dist_label)
plt.legend()

#### Identifying the Best Fit

In [None]:
def compare_functions(f):
    """Compare every pair of candidate distributions of a fit.

    Calls ``f.distribution_compare(first, second)`` for each ordered pair
    (including a distribution with itself) of the six candidate names and
    collects the two returned values in two square tables.

    Parameters
    ----------
    f : object
        Fit object exposing ``distribution_compare(name_a, name_b)`` that
        returns a pair ``(R, p)`` (presumably a ``powerlaw.Fit``, where R is
        the log-likelihood ratio and p its significance -- confirm against
        the powerlaw documentation).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(R_table, p_table)``; rows are the first distribution of each
        comparison, columns the second.
    """
    from numpy import zeros
    from pandas import DataFrame

    function = [
        'exponential',
        'stretched_exponential',
        'lognormal',
        'lognormal_positive',
        'power_law',
        'truncated_power_law',
    ]
    size = len(function)
    ratios = zeros((size, size), dtype=float)
    significances = zeros((size, size), dtype=float)
    for row, first in enumerate(function):
        for col, second in enumerate(function):
            ratios[row, col], significances[row, col] = f.distribution_compare(first, second)
    ratio_table = DataFrame(ratios, index=function, columns=function)
    significance_table = DataFrame(significances, index=function, columns=function)
    return ratio_table, significance_table

### 3.2.3 Plausibility of Power Law Fit <a name='3_2_3'></a>

In [None]:
def p_value(f, sims=2500):
    """Estimate how plausible the power-law fit ``f`` is via simulation.

    For each simulation a synthetic data set with as many values as
    ``f.data_original`` is generated: with probability
    ``f.n_tail/len(f.data_original)`` a value is drawn from a discrete power
    law with the fitted ``xmin`` and ``alpha``; otherwise it is drawn at
    random from the observed values below ``xmin``. A power law is then
    fitted to the synthetic data and its Kolmogorov-Smirnov distance is
    compared with the distance of the empirical fit. (This looks like the
    semi-parametric bootstrap of Clauset, Shalizi & Newman -- confirm.)

    Parameters
    ----------
    f : powerlaw.Fit
        Fit of the empirical data; must provide ``data_original``, ``xmin``,
        ``n_tail`` and ``power_law``.
    sims : int, optional
        Number of synthetic data sets to generate (default 2500).

    Returns
    -------
    float
        Fraction of simulations whose power-law fit has a larger KS distance
        than the empirical fit.

    Raises
    ------
    ValueError
        If ``sims`` is smaller than 1.
    """
    from random import choice, random
    from powerlaw import Fit, Power_Law

    if sims < 1:
        raise ValueError('sims must be at least 1')

    n_observations = len(f.data_original)
    prob = f.n_tail/n_observations
    body = [x for x in f.data_original if x < f.xmin]

    # Loop invariants: the generating tail model and the empirical KS distance
    # do not change between simulations, so build/compute them only once.
    tail_model = Power_Law(discrete=True, xmin=f.xmin, parameters=[f.power_law.alpha])
    ks_empirical = f.power_law.KS()

    n_larger = 0
    for _ in range(sims):
        x = []
        for _ in range(n_observations):
            if random() <= prob:
                # generate_random(1) returns a 1-element array; index it before
                # converting, since int() on a non-0-d array is deprecated in NumPy.
                x.append(int(tail_model.generate_random(1)[0]))
            else:
                x.append(choice(body))
        x_fit = Fit(x, discrete=True).power_law
        n_larger += bool(x_fit.KS() > ks_empirical)
    return n_larger/sims

In [None]:
# Plausibility of the power-law fit for fit_er (presumably the Erdős-Rényi fit from 3.2.1);
# runs only 10 simulations instead of the function's default of 2500.
p_value(fit_er, sims=10)

In [None]:
# Plausibility of the power-law fit for fit_ba (presumably the Barabási–Albert fit from 3.2.1);
# runs only 10 simulations instead of the function's default of 2500.
p_value(fit_ba, sims=10)

### 3.2.4 Citation In Social Network Science <a name='3_2_4'></a>

In [None]:
import pandas as pd

In [None]:
# Read the tab-separated citation links and reference table.
citations = pd.read_csv(
    '../data/sns/citations.txt', header='infer', delimiter='\t', encoding='utf-8'
)
references = pd.read_csv(
    '../data/sns/references.txt', header='infer', delimiter='\t', encoding='utf-8'
)
# Join both tables on 'reference_id', count the rows per 'reference' and
# list the most frequently cited references first.
cited_references = (
    pd.merge(left=citations, right=references, on='reference_id')
    .groupby('reference')
    .size()
    .reset_index(name='citations')
    .sort_values('citations', ascending=False)
)
cited_references.head()

#### 3.2.4.1 Fitting the Citation Distribution <a name='3_2_4_1'></a>

#### 3.2.4.2 Measuring Preferential Attachment <a name='3_2_4_2'></a>

In [None]:
import numpy as np

In [None]:
# Read the three tab-separated tables.
citations = pd.read_csv(
    '../data/sns/citations.txt', header='infer', delimiter='\t', encoding='utf-8'
)
references = pd.read_csv(
    '../data/sns/references.txt', header='infer', delimiter='\t', encoding='utf-8'
)
publications = pd.read_csv(
    '../data/sns/publications.txt', header='infer', delimiter='\t', encoding='utf-8'
)
# Un-aggregated citation links (one row per citing publication and reference).
cited_references = pd.merge(left=citations, right=references, on='reference_id')
# Coarsen 'time' into bins of width 3, labelled 3*floor(time/3)+2
# (e.g. 2016, 2017 and 2018 all become 2018 -- assuming 'time' holds years).
publications['time'] = (3*np.floor(publications['time']/3)+2).astype('int')
# Attach the binned time of the citing publication, then count the rows per
# time bin and reference.
cited_references_time = (
    pd.merge(
        left=cited_references,
        right=publications[['publication_id', 'time']],
        on='publication_id',
    )
    .groupby(['time', 'reference'])
    .size()
    .reset_index(name='citations')
)
cited_references_time.head()

In [None]:
# Pair the citation counts of each reference in two consecutive time bins.
# NOTE(review): `years` is not defined in the visible part of the notebook --
# presumably the ordered list of time bins; verify it exists before this cell.
t = 27
cited_references_t_0 = cited_references_time.loc[cited_references_time['time'] == years[t-1]]
cited_references_t = cited_references_time.loc[cited_references_time['time'] == years[t]]
# Inner join: only references cited in both bins are kept.
preferential_attachment = pd.merge(
    left=cited_references_t_0,
    right=cited_references_t,
    on='reference',
)
# The five merged columns are renamed by position (earlier bin first), then
# reordered so that both time columns lead.
preferential_attachment.columns = ['time_0', 'reference', 'citations_0', 'time', 'citations']
preferential_attachment = preferential_attachment.loc[
    :, ['time_0', 'time', 'reference', 'citations_0', 'citations']
]
preferential_attachment.head()

#### Fitting A Scaling Law

In [None]:
def ols_reg(a):
    """Fit the scaling law ``y = d * x**beta`` by linear regression in log-log space.

    Parameters
    ----------
    a : numpy.ndarray
        Two-dimensional array; column 0 holds the x values and column 1 the
        y values. Both are passed through ``log10``, so they are expected to
        be positive.

    Returns
    -------
    beta : float
        Slope of the regression line in log10 space (the scaling exponent).
    r2 : float
        Coefficient of determination of the fit, computed in log10 space.
    a_fit : numpy.ndarray
        2x2 array with the fitted curve evaluated at the smallest and the
        largest x value: ``[[x_min, y_fit(x_min)], [x_max, y_fit(x_max)]]``.
    """
    import sklearn.linear_model as sk_lm
    from sklearn.metrics import r2_score

    x_values = a[:, 0]
    y_values = a[:, 1]

    # sklearn expects column vectors: one row per observation
    log_x = np.log10(x_values).reshape(-1, 1)
    log_y = np.log10(y_values).reshape(-1, 1)

    # straight line in log space: log10(y) = log10(d) + beta*log10(x)
    model = sk_lm.LinearRegression().fit(log_x, log_y)
    log_y_predicted = model.predict(log_x)

    beta = model.coef_[0][0]
    d = 10**model.intercept_[0]
    r2 = r2_score(log_y, log_y_predicted)

    # two end points are enough to draw the fitted line on log-log axes
    end_points = (min(x_values), max(x_values))
    a_fit = np.array([[x_end, d*x_end**beta] for x_end in end_points])
    return beta, r2, a_fit

#### Fitting A Scaling Law On Averaged Data

In [None]:
# Scatter the point sets `a` and `a_mean` (first column on x, second on y) ...
for _points in (a, a_mean):
    plt.scatter(_points[:, 0], _points[:, 1])
# ... and draw the line stored in `a_fit_mean` on top.
plt.plot(a_fit_mean[:, 0], a_fit_mean[:, 1])
# logarithmic scaling on both axes
for _set_scale in (plt.xscale, plt.yscale):
    _set_scale('log')