# Load models

Flags for the models (vocabulary size: 30000)

m0 : model for the whole Kanseki Repository

m1 : models for six parts (６部)

m2 : models for 75 divisions (75部類)

Vocabulary size: 20000

m5 : models for 75 divisions (75部類)


In [None]:
import krp_sp as k
import matplotlib.pyplot as plt
%matplotlib inline
import networkx as nx
import pandas as pd
import numpy as np


# Load the models selected by flag "m2" (listed above as the 75 divisions).
# Judging from how the results are used below: ms[i] is the label of model i,
# md[i] is the model object and mv[i] is its vocabulary list.
# NOTE(review): the original note here read "load the log values : , log=True";
# presumably passing log=True makes loadmodels return log values -- confirm in krp_sp.
ms, md, mv = k.loadmodels(flag="m2",  use_krp_names=False)
# Number of models loaded (shown as the cell output).
len(md)

Load random samples from the data files.

In [None]:
# Load the samples via krp_sp. The evaluation loop further down reads each
# element s as s[0] (looked up as a krp_names value) and s[1] (iterated as sentences).
smp = k.loadsamples()

In [None]:
# Inspect the second element of the loaded samples.
print(smp[1])

In [None]:
def getbest(md, sntc, mv=None, cnt=3):
    """Rank the models in md by their mean per-token score for a sentence.

    Args:
        md: sequence of models exposing ``encode_as_ids`` and ``GetScore``.
        sntc: the sentence to score.
        mv: optional per-model score tables; when given (and non-empty),
            ``mv[i][token_id]`` is used instead of ``md[i].GetScore(token_id)``.
        cnt: how many of the best-scoring models to return.

    Returns:
        The indices into md of the (at most) cnt best models, best first.
        A model that encodes the sentence to no tokens has no mean score and
        is left out, so an empty sentence yields an empty list instead of
        raising ZeroDivisionError.
    """
    res = []
    for i, mx in enumerate(md):
        ids = mx.encode_as_ids(sntc)
        if len(ids) == 0:
            # Nothing to average for this model.
            continue
        if mv:
            total = sum(mv[i][a] for a in ids)
        else:
            total = sum(mx.GetScore(a) for a in ids)
        res.append((i, total / len(ids)))
    # Highest mean first; the sort is stable, so ties keep model order.
    res.sort(key=lambda x: x[1], reverse=True)
    return [a[0] for a in res[:cnt]]


In [None]:
# Sanity check: split one sample sentence with the second model and sum its scores.
test = smp[1][1][0]
print(test)
probe = md[1]
# The sentence as a sequence of pieces.
t2 = probe.encode_as_pieces(test)
# Sum (not mean) of the scores of the ids the sentence encodes to.
t1 = sum(probe.GetScore(tok) for tok in probe.encode_as_ids(test))
print(t1, t2)

In [None]:
# Ask for the single best model, using the krp_sp module's own getbest
# (not the getbest defined in this notebook) and no score table.
# NOTE(review): smp[1][0][1] indexes into the first field of the sample, whereas
# the probe cell reads a sentence from smp[1][1][0] -- confirm which is intended.
k.getbest(md, smp[1][0][1], None, 1)

In [None]:
from collections import defaultdict

# Reverse lookup: krp_names value -> krp_names key.
kn = {name: code for code, name in k.krp_names.items()}
smpres = []
for s in smp:
    label = s[0]
    expected = kn[label]
    # Points collected per model label over all sentences of this sample.
    tally = defaultdict(int)
    for sentence in s[1]:
        # getbest returns model indices best first; rank 0, 1, 2 earns 3, 2, 1 points.
        for rank, idx in enumerate(getbest(md, sentence)):
            tally[ms[idx]] += 3 - rank
    ranked = sorted(tally.items(), key=lambda item: item[1], reverse=True)
    # Points that went to the label this sample is expected to match.
    hits = sum(points for name, points in ranked if name == expected)
    summary = ",".join("%s:%s" % pair for pair in ranked)
    smpres.append(("%s%s" % (expected, label), hits, summary))

In [None]:
# Tabulate the per-sample results: combined label, points for the expected
# label, and the full "label:points" ranking string.
df=pd.DataFrame(smpres)
print(df)

In [None]:
# Length of the first model's vocabulary list.
len(mv[0])

How many vocabulary entries occur in every vocab list? (For each number of lists, count the entries that occur that many times.)

In [None]:
from collections import defaultdict

# vd: vocabulary entry -> total number of occurrences across the lists in mv.
vd = defaultdict(int)
for vocab in mv:
    for piece in vocab:
        vd[piece] += 1
# vtab: occurrence count -> number of entries having exactly that count.
vtab = defaultdict(int)
for occurrences in vd.values():
    vtab[occurrences] += 1

In [None]:
# (occurrence count, number of entries) rows, highest occurrence count first.
vds = list(vtab.items())
vds.sort(key=lambda row: row[0], reverse=True)
df = pd.DataFrame(vds, columns=["OccNo", "Count"])
print(df)

Invert the vocabulary list of lists: for each sentence piece (sp), list the models whose vocabulary contains it

In [None]:
from collections import defaultdict

# vx: vocabulary entry -> labels (taken from ms) of the models listing it.
vx = defaultdict(list)
for model_idx, vocab in enumerate(mv):
    for piece in vocab:
        vx[piece].append(ms[model_idx])


Look at vocab entries that occur in exactly 2 models

In [None]:
# Entries of vx that are listed under exactly two models.
v2 = [entry for entry in vx.items() if len(entry[1]) == 2]
# vd2[a][b]: number of such entries whose two models are a and b,
# in the order the models were appended to vx.
vd2 = defaultdict(lambda: defaultdict(int))
for _, (first, second) in v2:
    vd2[first][second] += 1


In [None]:
# Keep the model pairs that share more than 40 two-model entries,
# as (model, model, weight) triples.
o = [
    (g1, g2, w)
    for g1, partners in vd2.items()
    for g2, w in partners.items()
    if w > 40
]

In [None]:
# Build a multigraph whose weighted edges are the (node, node, weight) triples in o.
G = nx.MultiGraph()
# Earlier one-line attempt at building o with a threshold of 10, kept for reference:
#o=[(g1, g2, vd2[g1][g2]) for g2 in vd2[g1] for g1 in vd2 if vd2[g1][g2] > 10]
G.add_weighted_edges_from(o)
        

In [None]:
# nx.draw() has no figsize option; the figure size is a matplotlib setting.
# Create the figure first (size in inches), then draw the graph into it.
plt.figure(figsize=(20, 20))
nx.draw(G, with_labels=True)

In [None]:
# Print every node together with its neighbours and their edge data.
# The loop variable must not be called k: that would rebind the krp_sp module
# alias (import krp_sp as k) and break every later k.krp_names / k.getbest use.
for node in G.nodes():
    print (node, G[node].items())

In [None]:
# Check that matplotlib can render CJK text with a Noto Sans CJK font file.
import matplotlib.font_manager as mfm
import matplotlib.pyplot as plt

# Absolute path to the font file on this machine; adjust if it lives elsewhere.
font_path = "/usr/share/fonts/opentype/noto/NotoSansCJK-Thin.ttc"
prop = mfm.FontProperties(fname=font_path)
# Draw a sample string at coordinates (0.5, 0.5) using that font.
plt.text(0.5, 0.5, s='测试', fontproperties=prop, size='40')
plt.show()

In [None]:
# Earlier variant that appended the krp_names entry to each node label:
#kn=k.krp_names
#o=[("%s%s" % (a[0],kn[a[0]]), "%s%s" % (a[1], kn[a[1]]), a[2]) for a in o]
# Sort the (node, node, weight) triples by weight, heaviest first, and tabulate them.
o=sorted(o, key = lambda x : x[2], reverse=True)
df=pd.DataFrame(o, columns=["Node1", "Node2", "W"])

In [None]:
# Show the most recently built DataFrame.
print(df)

In [None]:
# Print each node with its adjacency mapping (neighbour -> edge data).
for node, nbrsdict in G.adj.items():
    print (node, nbrsdict)

In [None]:
# Score the first model reports for id 33.
md[0].GetScore(33)

In [None]:
import math
# Exponentiate the same score.
# NOTE(review): presumably the score is a log-probability, making this a probability -- confirm.
math.exp(md[0].GetScore(33))

In [None]:
from collections import defaultdict

# Rebuild vx as: vocabulary entry -> labels (from ms) of the models listing it.
vx = defaultdict(list)
for pos, entries in enumerate(mv):
    for entry in entries:
        vx[entry].append(ms[pos])


In [None]:
# List the attributes and methods available on a model object.
dir(md[0])

In [None]:
# Look up the id the first model assigns to the entry at position 22 of its
# vocabulary list.
# NOTE(review): presumably a check that list position equals piece id -- confirm.
md[0].PieceToId(mv[0][22])

In [None]:
from collections import defaultdict

# vx: vocabulary entry -> [(model index, score of that entry in that model)].
vx = defaultdict(list)
for model_idx, vocab in enumerate(mv):
    for piece_id, piece in enumerate(vocab):
        # The position in the vocabulary list is used as the piece's id in this model.
        vx[piece].append((model_idx, md[model_idx].GetScore(piece_id)))


In [None]:
# v2 is built as a list of (entry, hits) pairs by the cells that filter vx and
# as a dict by another cell; v2.keys() only works for the dict form.
# dict(v2) accepts both and keeps the entry order.
vk = list(dict(v2))
vk[10]

In [None]:
# Number of models recorded in vx for the entry at position 10 of vk.
len(vx[vk[10]])

In [None]:
import numpy as np
from collections import defaultdict

# Only entries listed by more than `lo` and fewer than `up` models are used.
up = 60
lo = 1
# cnt == 1: count shared entries per model pair;
# any other value: accumulate the summed scores of the shared entries instead.
cnt = 1
count_only = cnt == 1
# One row/column per loaded model rather than a hard-coded 75.
nm = len(ms)
sx = np.zeros((nm, nm))

# populate vx: vocabulary entry -> [(model index, score in that model)]
vx = defaultdict(list)
for i, vl in enumerate(mv):
    for j, v in enumerate(vl):
        # j, the position in the vocabulary list, is used as the id of this sp
        # in this model
        vx[v].append((i, md[i].GetScore(j)))
# limit to our target
v2 = [a for a in vx.items() if lo < len(a[1]) < up]
for v in v2:
    for bu, p in v[1]:
        for bu1, p1 in v[1]:
            # Fill only the upper triangle of the matrix.
            if bu < bu1:
                sx[bu, bu1] += 1 if count_only else p + p1
# evaluate the result: for every model, the ten best partner models
res = {}
for n in range(nm):
    # sx is only filled above the diagonal, so the smaller index goes first.
    o = [(ms[i], sx[min(n, i), max(n, i)]) for i in range(nm)]
    # Counts rank highest-first; summed scores rank lowest-first.
    o = sorted(o, key=lambda x: x[1], reverse=count_only)
    res[ms[n]] = o[0:10]
# print the result: label, its krp_names entry, and the five top partners
kr = sorted(res)
for r in kr:
    rx = ["%s:%s" % (a[0], k.krp_names[a[0]]) for a in res[r]]
    print (r, k.krp_names[r], ",".join(rx[0:5]))

In [None]:
import numpy as np

# Pair matrix of summed scores: for every entry in v2 and every pair of models
# listing it, add both models' scores for that entry to the pair's cell.
sx = np.zeros((75, 75))
for v in v2:
    hits = v[1]
    for bu, p in hits:
        for bu1, p1 in hits:
            # Only the upper triangle of the matrix is filled.
            if bu >= bu1:
                continue
            sx[bu, bu1] += p + p1
    

In [None]:
# Print the pair values of model 33 against every model index. The matrix is
# read from its upper triangle, so the smaller index always goes first.
n = 33
for i in range(75):
    row, col = (n, i) if n < i else (i, n)
    print (n, i, sx[row, col])
        

In [None]:
import numpy as np

# Reset the pair matrix to a 75 x 75 array of zeros.
sx=np.zeros((75,75))

In [None]:
# For every model, rank all models by their value in the pair matrix sx
# (lowest value first) and keep the first ten.
# The size comes from ms instead of a hard-coded 75, and the dead `n=10`
# store that the loop overwrote immediately is gone.
res = {}
size = len(ms)
for n in range(size):
    # sx is read from its upper triangle, so the smaller index goes first.
    o = [(ms[i], sx[min(n, i), max(n, i)]) for i in range(size)]
    o = sorted(o, key=lambda x: x[1], reverse=False)
    res[ms[n]] = o[0:10]

In [None]:
# Build a multigraph from the weighted edge triples in o3.
# NOTE(review): o3 is only assigned in a cell further down the notebook, so that
# cell has to be run before this one.
G = nx.MultiGraph()
# Earlier one-line attempt at building an edge list, kept for reference:
#o=[(g1, g2, vd2[g1][g2]) for g2 in vd2[g1] for g1 in vd2 if vd2[g1][g2] > 10]
G.add_weighted_edges_from(o3)

In [None]:
# nx.draw() has no figsize option; the figure size is a matplotlib setting.
# Create the figure first (size in inches), then draw the graph into it.
plt.figure(figsize=(20, 20))
nx.draw(G, with_labels=True)

In [None]:
# Sort o by its third field, largest first, and keep the first ten items.
# NOTE(review): this needs o to hold 3-tuples (the edge triples); the ranking
# cells rebind o to 2-tuples, for which x[2] raises IndexError.
o2 = sorted(o, key = lambda x : x[2], reverse=True)
o3 = o2[0:10]

In [None]:
# Print, for every label in res in sorted order, its krp_names entry and the
# first five of its ranked partners as "label:name".
kr = sorted(res)
for r in kr:
    names = k.krp_names
    labelled = ["%s:%s" % (a[0], names[a[0]]) for a in res[r]]
    print (r, names[r], ",".join(labelled[:5]))

In [None]:
# Label of the first model.
ms[0]

In [None]:
# Show whatever the loop variable v was last bound to.
v

In [None]:
# Hand-copied example in the shape of a vx item: one entry mapped to
# (model index, score) pairs.
x1=    {'好施' :
    [(13, -11.637325286865234),
    (16, -11.83456039428711),
    (18, -10.804060935974121),
    (20, -11.320064544677734),
    (40, -12.115601539611816),
    (41, -11.592039108276367),
    (42, -11.932478904724121),
    (56, -12.036428451538086),
    (72, -12.097436904907227)]}
x1

In [None]:
# Entries of vx listed by more than lo and fewer than up models, as a dict
# mapping entry -> recorded hits.
v2 = {entry: hits for entry, hits in vx.items() if lo < len(hits) < up}

In [None]:
# Number of entries that passed the lo/up filter.
len(v2)

In [None]:
# Scratch arithmetic: mean of 23 and 25.
(23 + 25) / 2

In [None]:
# Unpack the two dimensions of the pair matrix.
s1, s2 = sx.shape

In [None]:
# Second dimension of the pair matrix.
s2