In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import community as community_louvain
import networkx.algorithms.community as nx_com
from sklearn import preprocessing
import matplotlib.colors as colors
import matplotlib.cm as cmx

## Information for ID in ***amazon-meta.txt***

In [None]:
# Read the raw metadata dump into a list of lines, each with its leading and
# trailing whitespace (including the newline) stripped.
fname = "amazon-meta.txt"
with open(fname, encoding='utf8') as meta_file:
    content = [raw_line.strip() for raw_line in meta_file]

In [None]:
# Preview only the first few lines: `content` holds every line of the input
# file, which can be far too much to render as cell output.
content[:20]

In [None]:
# Flatten the product records held in `content` into "preprocess.csv", one row
# per product with the columns listed in `header`.
#
# A record starts at an "Id:" line. The fields that follow are collected in
# the order they appear, and the record is written once the next "Id:" line
# (or the end of the input) is reached. Records that did not yield exactly six
# fields are skipped.
header = ['Id', 'title', 'group', 'categories', 'totalreviews', 'avgrating']


def write_record(out, record):
    """Write `record` as one comma-separated row if it has exactly six fields."""
    if len(record) == 6:
        out.write(','.join(record))
        out.write("\n")


# The context manager closes the file even if parsing raises part-way through.
with open("preprocess.csv", "w", encoding='utf8') as file:
    # Seeding with the column names makes the first flush emit the header row.
    previouslines = list(header)
    for line in content:
        lines = line.split(':')
        key = lines[0]

        if key == "Id":
            # A new record begins: flush the previous one and start over.
            write_record(file, previouslines)
            previouslines = [lines[1].strip()]

        elif key == "title":
            # The title itself may contain ':', so re-join the pieces. Commas
            # are replaced because ',' is the column separator of the output.
            title = ':'.join(lines[1:]).strip().replace(',', ' ').replace('\n', ' ').strip()
            previouslines.append(title)

        elif key in ("group", "categories"):
            # Stored value is the text right after the first ':' on the line.
            previouslines.append(lines[1].strip())

        elif key == "reviews" and lines[1].strip() == "total":
            # totalreviews: second space-separated token of the third piece.
            previouslines.append(lines[2].split(' ')[1])
            # avgrating: the fifth ':'-separated piece.
            previouslines.append(lines[4].strip())

    # No "Id:" line follows the last record, so it must be flushed explicitly;
    # otherwise the final product would be missing from the CSV.
    write_record(file, previouslines)

In [None]:
# Load the flattened product table written to "preprocess.csv".
df1 = pd.read_csv("preprocess.csv", delimiter=',')
df1

## Node list (`node_list.csv`)

In [None]:
# Load the node list and give its two columns shorter names.
id_ = pd.read_csv("node_list.csv", delimiter=',').rename(
    columns={'# FromNodeId': 'from_id', 'ToNodeId': 'to_id'}
)
id_

## Subset of dataframe

In [None]:
# Keep the first 100 products of group 'Video' that have at least 100 reviews.
is_video = df1['group'] == 'Video'
has_many_reviews = df1['totalreviews'] >= 100
df = df1[is_video & has_many_reviews].reset_index(drop=True).head(100)
df

# Test network

In [None]:
# Keep only the edges whose source node is one of the products in the subset.
# Only `from_id` is filtered, so an edge's target may lie outside the subset.
index = df.Id.values
net = id_[id_['from_id'].isin(index)]

# Directed graph with one edge per remaining (from_id, to_id) row.
G = nx.from_pandas_edgelist(net, source='from_id', target='to_id', create_using=nx.DiGraph())
# nx.info() was removed in NetworkX 3.0, so the summary is printed explicitly;
# this form works on old and new NetworkX versions alike.
print(f"{type(G).__name__} with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

In [None]:
# Label-encode the `categories` value of every node of G (used as node colour).
#
# Values are looked up by product Id. Pairing G.nodes with df1 rows by
# position would attach the categories of unrelated products, because the node
# order of G comes from the edge list, not from df1's row order.
le = preprocessing.LabelEncoder()
categories_by_id = df1.drop_duplicates('Id').set_index('Id')['categories']
# Nodes without a row in df1 get the sentinel -1.
# NOTE(review): assumes `categories` is numeric — confirm against df1.dtypes.
node_categories = categories_by_id.reindex(list(G.nodes)).fillna(-1)
legend = dict(zip(G.nodes, node_categories.values))
enc = le.fit_transform(list(legend.values()))

In [None]:
# Node positions for drawing. spring_layout starts from random positions, so a
# fixed seed is needed to get the same picture on every run.
pos = nx.spring_layout(G, seed=42)

# Degree centrality of every node, flattened to a float array in the order the
# centrality dict yields its values.
deg_centrality = nx.degree_centrality(G)
centrality = np.fromiter(deg_centrality.values(), float)

In [None]:
# Draw the network: node colour follows the encoded label in `enc`, node size
# follows degree centrality.
cNorm = colors.Normalize(vmin=0, vmax=max(enc))
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=plt.get_cmap('jet'))

f = plt.figure(1, figsize=(10, 10))
ax = f.add_subplot(1, 1, 1)

# One legend entry per encoded label. The proxy artists are created with empty
# data so they only feed the legend and do not leave stray markers at (0, 0)
# in the middle of the graph.
for label in sorted(set(enc)):
    ax.scatter([], [], color=scalarMap.to_rgba(label), label=label)

# Pass the axes explicitly instead of relying on pyplot's current axes.
nx.draw_networkx(G, pos, ax=ax, arrowstyle='->',
                 node_size=centrality*6e3,
                 node_color=enc,
                 with_labels=False,
                 cmap=plt.get_cmap('jet'))

box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width, box.height])
# NOTE(review): the legend is titled 'Average Rate', but `enc` is built from
# the `categories` column — confirm which of the two is intended.
ax.legend(loc='upper left', bbox_to_anchor=(1., .5), title='Average Rate',
          ncol=1, fancybox=True, shadow=True)
plt.show()

In [None]:
# Degree statistics of G: a log-log degree-rank plot and a degree histogram.
degree_sequence = sorted([d for n, d in G.degree()], reverse=True)
dmax = max(degree_sequence)
# Ranks start at 1: a rank of 0 cannot be placed on a logarithmic axis, so
# plotting against the 0-based list index would hide the highest-degree node.
ranks = np.arange(1, len(degree_sequence) + 1)

fig = plt.figure("Degree of a graph", figsize=(14, 14))
# 5x4 grid; both plots sit side by side in the bottom two rows.
axgrid = fig.add_gridspec(5, 4)

ax1 = fig.add_subplot(axgrid[3:, :2])
ax1.plot(ranks, degree_sequence, "b-", marker="o")
ax1.set_title(" LOG Degree Rank Plot")
ax1.set_ylabel("Degree")
ax1.set_xlabel("Rank")
# Set the scales on ax1 itself rather than on pyplot's implicit current axes.
ax1.set_xscale("log")
ax1.set_yscale("log")

ax2 = fig.add_subplot(axgrid[3:, 2:])
# Bar heights are the number of nodes having each distinct degree.
ax2.bar(*np.unique(degree_sequence, return_counts=True))
ax2.set_title("Degree histogram")
ax2.set_xlabel("Degree")
ax2.set_ylabel("# of Nodes")
# Only degrees 0-20 are shown; larger degrees fall outside the view.
ax2.set_xlim(0, 20)

fig.tight_layout()

plt.show()