In [None]:
import pandas as pd
import numpy as np
import time

In [None]:
# Size of the synthetic count table: D documents (columns) by W words (rows).
D = 1000
W = 500

# Row labels w0..w{W-1}, column labels doc0..doc{D-1}.
word_labels = ["w%d" % w for w in range(W)]
doc_labels = ["doc%d" % d for d in range(D)]

# Uniform random integer counts in [0, 100), one per (word, document) cell.
random_counts = np.random.randint(0, 100, size=(W, D))

df = pd.DataFrame(data=random_counts, index=word_labels, columns=doc_labels)
df.head()

In [None]:
class Graph:
    """Minimal edge-list graph used to benchmark graph construction.

    Edges are always stored as a plain Python list of ``(first, second)``
    tuples, so ``add_edge`` and ``add_edge_list`` can be mixed in any order.
    """

    def __init__(self):
        self.edges = []        # (first, second) tuples in insertion order
        self.size = 0          # number of stored edges; kept equal to len(self.edges)
        self.name = "default"  # label shown by __repr__

    def add_edge(self, first, second):
        """Append a single edge and return its index in ``self.edges``."""
        self.edges.append((first, second))
        self.size += 1
        return self.size - 1

    def add_edge_list(self, edges):
        """Append many edges at once.

        ``edges`` is any array-like whose flattened content is a sequence of
        endpoint pairs: a list of 2-tuples, an ``(n, 2)`` array, or a flat
        sequence of even length. An empty input is a no-op.

        Raises:
            ValueError: if the number of endpoints is odd.
        """
        # reshape(-1, 2) pairs up endpoints whatever the input nesting was;
        # tolist() hands back native Python scalars rather than numpy ones.
        pairs = np.asarray(edges).reshape(-1, 2)
        self.edges.extend(tuple(pair) for pair in pairs.tolist())
        self.size = len(self.edges)

    def __repr__(self):
        return f"Graph {self.name} with {self.size} edges"

In [None]:
g = Graph()
def make_graph(df, g, counts = True):
    """Add one edge per non-empty (document, word) cell of ``df`` to ``g``.

    Columns of ``df`` are documents and rows are words. Document ``i_d`` is
    node ``i_d``; word ``i_w`` is node ``D + i_w`` where ``D`` is the number
    of columns. Cells whose count is below 1 are skipped.

    Args:
        df: table of counts, words on the rows and documents on the columns.
        g: any object exposing ``add_edge(first, second)``.
        counts: if True, add a single edge per cell and record the cell's
            count as that edge's weight; if False, add the edge ``count``
            times instead.

    Returns:
        When ``counts`` is True, a float array holding the weight of every
        edge added by this call, in insertion order. Otherwise None.
    """
    n_docs = df.shape[1]
    weights = [] if counts else None
    for i_d in range(n_docs):
        # Positional access: a label lookup would return a whole DataFrame
        # when two columns share the same name.
        column = df.iloc[:, i_d]
        for i_w, count in enumerate(column):
            if count < 1:
                continue
            if counts:
                g.add_edge(i_d, n_docs + i_w)
                weights.append(count)
            else:
                for _ in range(count):
                    g.add_edge(i_d, n_docs + i_w)
    if counts:
        return np.asarray(weights, dtype=float)
    return None
%timeit make_graph(df,Graph())

In [None]:
# multiprocessing.Pool entry points to try: map(), apply(), starmap(), and their *_async() variants.
import multiprocessing as mp
# Number of worker processes handed to mp.Pool in the cells below (processes, despite the name).
threads = 4
print("Number of processors: ", mp.cpu_count())

In [None]:
def func(x,message="hello"):
    """Return a one-element list holding a random pair of ids drawn from [0, D).

    Both arguments are ignored; they only give the function a signature that
    map/starmap-style callers can drive. ``D`` is read from the notebook
    namespace.
    """
    first = np.random.randint(D)
    second = np.random.randint(D)
    return [(first, second)]

# Ten evenly spaced dummy work items, 1.0 through 10.0.
data = np.linspace(start=1, stop=10, num=10)

In [None]:
# Start from an empty graph (g must already exist for the del to succeed).
del g
g = Graph()
start = time.time()
pool = mp.Pool(threads)
# starmap unpacks each (x, "apply") tuple into func(x, message); every call
# returns a one-element list holding one pair.
results = np.array(pool.starmap(func,((x, "apply") for x in data)))
pool.close()
pool.join()
# Collapse the per-call singleton lists so there is one row per pair.
results=results.reshape(int(len(results)),2)
g.add_edge_list(results)
end = time.time()
print(results, g)
# Elapsed wall-clock seconds, including pool start-up and shutdown.
print(f"{end-start}")
g.edges

In [None]:
%timeit [func(x) for x in data]

In [None]:
%timeit list(map(func, data))

In [None]:
# Serial baseline: time one make_graph run on a fresh Graph.
start = time.time()
g = Graph()
make_graph(df,g)
end = time.time()
print(g)
# Elapsed wall-clock seconds.
print(f"{end-start}")

In [None]:
def func(i_w, i_d, count, offset=None):
    """Return the (document, word) edge for one table cell, or None if empty.

    Must stay a module-level function so multiprocessing can pickle it.

    Args:
        i_w: row (word) position of the cell.
        i_d: column (document) position of the cell.
        count: value stored in the cell; an edge is produced only if >= 1.
        offset: value added to ``i_w`` to form the word node id. When omitted,
            the notebook-level ``D`` is used, as before.

    Returns:
        ``(i_d, offset + i_w)`` when ``count >= 1``, otherwise None.
    """
    if count >= 1:
        if offset is None:
            # Backward-compatible fallback for callers passing three arguments.
            offset = D
        return (i_d, offset + i_w)
    return None


def make_parallel_graph(df, g, counts = True):
    """Add one edge per non-empty cell of ``df`` to ``g`` using a process pool.

    Columns are processed one at a time; the cells of each column are mapped
    over ``func`` in a pool of ``threads`` worker processes, and the resulting
    edges are handed to ``g.add_edge_list`` in one batch per column.

    Args:
        df: table of counts, words on the rows and documents on the columns.
        g: any object exposing ``add_edge_list(edges)``.
        counts: accepted for signature parity with ``make_graph``; not used.
    """
    # Word node ids are offset by this table's own column count, not by
    # whatever the notebook-level D happens to be.
    n_docs = df.shape[1]
    # One pool for the whole table instead of a new pool per column.
    pool = mp.Pool(threads)
    try:
        # DataFrame.items() replaces iteritems(), which pandas 2.0 removed.
        for i_d, (_title, text) in enumerate(df.items()):
            cells = ((i_w, i_d, count, n_docs) for i_w, count in enumerate(text))
            # func returns None for empty cells; drop those so only real
            # (document, word) pairs reach the graph.
            edges = [edge for edge in pool.starmap(func, cells) if edge is not None]
            if edges:
                g.add_edge_list(edges)
    finally:
        pool.close()
        pool.join()

In [None]:
# Time the multiprocessing variant on a fresh Graph.
start = time.time()
#del g
g = Graph()
make_parallel_graph(df,g)
end = time.time()
print(g)
# Elapsed wall-clock seconds.
print(f"{end-start}")

In [None]:
# Re-run the serial builder on a fresh Graph, for comparison with the parallel timing.
start = time.time()
# Requires g to exist already; it is rebound on the next line.
del g
g = Graph()
make_graph(df,g)
end = time.time()
print(g)
# Elapsed wall-clock seconds.
print(f"{end-start}")

In [None]:
# Number of non-zero cells in the table. make_graph skips cells with count < 1,
# so this equals its edge count (counts=True) only for non-negative integer data.
(df!=0).astype(int).sum().sum()

In [None]:
g.edges

In [None]:
g.edges[-1]

In [None]:
pool.close()

# Mazzolini

In [None]:
import os
import pandas as pd
# NOTE(review): relative chdir changes the kernel's working directory for every
# later cell; re-running this cell resolves the path against the already-changed
# directory.
os.chdir("datasets/gtex/10")
# Rebinds df; the first CSV column becomes the row index.
df = pd.read_csv("mainTable_counts.csv", index_col=0)
df.shape

In [None]:
import mazzolini, importlib
# Reload so that edits to the mazzolini module are picked up without restarting the kernel.
importlib.reload(mazzolini)
# Re-import the class after the reload so the name refers to the reloaded definition.
from mazzolini import Mazzolini
model = Mazzolini(df)
    
#%timeit model.run()
#%timeit mazzolini_tf()
#%timeit model.run_parallel()
%timeit model.run_parallel_async()

In [None]:
import os
import pandas as pd
# NOTE(review): hard-coded absolute path — only valid on the machine/container
# this notebook was written on.
os.chdir("/home/jovyan/work/phd/datasets/gtex/10")
# Rebinds df; the first CSV column becomes the row index.
df = pd.read_csv("mainTable_counts_hv.csv", index_col=0)
print(df.shape)

# Mazzolini must already have been imported by an earlier cell.
model = Mazzolini(df)

start = time.time()
# The argument 4 is presumably the worker count — TODO confirm in mazzolini.py.
# The returned table is written, with its index and header, into the current directory.
model.run_parallel_async(4).to_csv("mainTable_counts_hv_null.csv", index=True, header=True)
# Elapsed wall-clock seconds for the run plus the CSV write.
print(time.time()-start)

In [None]:
# Per-row total over all columns, normalised so the values sum to one.
f = df.sum(1)
f = f/f.sum()
# The null table was saved with index=True, so its first CSV column is the row
# index. Read it back as the index so it is not treated as a data column when
# summing across each row.
f_null = pd.read_csv("mainTable_counts_hv_null.csv", index_col=0).sum(1)

In [None]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
# Values of f sorted in descending order (thick, semi-transparent line).
ax.plot(np.sort(f.values)[::-1], lw=5, alpha=0.7)
# Same ordering for f_null, normalised here so that it sums to one.
ax.plot(np.sort((f_null/f_null.sum()).values)[::-1])


# Log-log axes.
ax.set_xscale("log")
ax.set_yscale("log")

plt.show()