In [6]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances #cosine similarity is 1-cosine distance
import os
import re
import pandas as pd
import numpy as np

pd.options.display.max_rows = 20

# Data read and filter

First, I read in the data. Here, I've also filtered the data such that we only keep the rows where the first 3 letters of the 'lang' column are 'sux', corresponding to Sumerian and its dialects.

In [27]:
alltex = pd.read_csv('alltexts copy.csv')
alltex = alltex.loc[[str(i)[0:3] == 'sux' for i in alltex.lang], :]
alltex['Term'] = [str(alltex.cf[i]) + "[" + str(alltex.gw[i]) + "]" + str(alltex.pos[i]) for i in alltex.index]
alltex

Unnamed: 0.1,Unnamed: 0,cf,extent,form,gw,id_text,lang,line_no,line_ref,pos,status,text_name,version,Term
0,0,dubsaŋ,,dub-saŋ-ta,first,c.0.1.1,sux,1,1,AJ,,Ur III catalogue from Nibru (N1),,dubsaŋ[first]AJ
1,1,Enki,,{d}en-ki,1,c.0.1.1,sux,2,2,DN,,Ur III catalogue from Nibru (N1),,Enki[1]DN
2,2,unu,,unu₂,dwelling,c.0.1.1,sux,2,2,N,,Ur III catalogue from Nibru (N1),,unu[dwelling]N
3,3,gal,,gal,big,c.0.1.1,sux,2,2,V/i,,Ur III catalogue from Nibru (N1),,gal[big]V/i
4,4,ed,,im-ed₃,ascend,c.0.1.1,sux,2,2,V/i,,Ur III catalogue from Nibru (N1),,ed[ascend]V/i
5,5,anzag,,an-zag-še₃,horizon,c.0.1.1,sux,3,3,N,,Ur III catalogue from Nibru (N1),,anzag[horizon]N
6,6,anŋi,,an-ŋi₆,eclipse,c.0.1.1,sux,4,4,N,,Ur III catalogue from Nibru (N1),,anŋi[eclipse]N
7,7,zu,,zu,know,c.0.1.1,sux,4,4,V/t,,Ur III catalogue from Nibru (N1),,zu[know]V/t
8,8,ama,,ama,mother,c.0.1.1,sux,4,4,N,,Ur III catalogue from Nibru (N1),,ama[mother]N
9,9,tu,,tu₆,incantation,c.0.1.1,sux,4,4,N,,Ur III catalogue from Nibru (N1),,tu[incantation]N


In [38]:
sum(alltex.form == "X") #check for Xs

16201

Then, I remove all rows that have a NA in either the 'gw' or 'pos' column.

In [39]:
alltex = alltex.loc[np.logical_and(~pd.isnull(alltex.gw), ~pd.isnull(alltex.pos)), :]
alltex

Unnamed: 0.1,Unnamed: 0,cf,extent,form,gw,id_text,lang,line_no,line_ref,pos,status,text_name,version,Term
0,0,dubsaŋ,,dub-saŋ-ta,first,c.0.1.1,sux,1,1,AJ,,Ur III catalogue from Nibru (N1),,dubsaŋ[first]AJ
1,1,Enki,,{d}en-ki,1,c.0.1.1,sux,2,2,DN,,Ur III catalogue from Nibru (N1),,Enki[1]DN
2,2,unu,,unu₂,dwelling,c.0.1.1,sux,2,2,N,,Ur III catalogue from Nibru (N1),,unu[dwelling]N
3,3,gal,,gal,big,c.0.1.1,sux,2,2,V/i,,Ur III catalogue from Nibru (N1),,gal[big]V/i
4,4,ed,,im-ed₃,ascend,c.0.1.1,sux,2,2,V/i,,Ur III catalogue from Nibru (N1),,ed[ascend]V/i
5,5,anzag,,an-zag-še₃,horizon,c.0.1.1,sux,3,3,N,,Ur III catalogue from Nibru (N1),,anzag[horizon]N
6,6,anŋi,,an-ŋi₆,eclipse,c.0.1.1,sux,4,4,N,,Ur III catalogue from Nibru (N1),,anŋi[eclipse]N
7,7,zu,,zu,know,c.0.1.1,sux,4,4,V/t,,Ur III catalogue from Nibru (N1),,zu[know]V/t
8,8,ama,,ama,mother,c.0.1.1,sux,4,4,N,,Ur III catalogue from Nibru (N1),,ama[mother]N
9,9,tu,,tu₆,incantation,c.0.1.1,sux,4,4,N,,Ur III catalogue from Nibru (N1),,tu[incantation]N


In [49]:
np.where(np.in1d(alltex.form, ["X", "(X)"])),np.where(np.in1d(alltex.gw, ["X", "(X)"])) 

((array([42007, 42083], dtype=int64),),
 (array([  1038,   1935,   2546,   2549,   2866,  13006,  13009,  13047,
          13061,  13090,  13092,  25547,  25809,  25812,  26163,  27116,
          29903,  32884,  39512,  42007,  42083,  48680,  52906,  56808,
          56835,  57786,  59071,  59931,  65078,  65098,  65443,  66464,
          66616,  67685,  68178,  68710,  71112,  73332,  73645,  76839,
          79738,  84776,  86051,  88135,  88387,  88393,  88400,  88409,
          88415,  88427,  88477,  88488,  88490,  91430,  92990,  93012,
          93634,  93791,  93805,  93813,  93825,  93844,  93846,  93859,
          93866,  93963,  94009,  94032,  94063,  94068,  95184,  97083,
          99410,  99420, 100026, 100027, 100613, 101217, 101267, 101296,
         102368, 104050, 105302, 105375, 105378, 105396, 109016, 109024,
         110705, 110937, 112422, 113292, 113528, 113561, 115017, 117502,
         117900, 123123, 125443, 125445, 125451, 125452, 126024, 127550,
         12

Then, I want to filter out the nonsense X's.

In [None]:
alltex = alltex.loc[np.logical_and(~pd.isnull(alltex.gw), ~pd.isnull(alltex.pos)), :]
alltex

In [100]:
len(np.unique(alltex.Term))

4353

In [101]:
bagofwords = {}
for i in alltex.index:
    if alltex.id_text[i] in bagofwords.keys():
        bagofwords[alltex.id_text[i]] = np.append(bagofwords[alltex.id_text[i]], alltex.Term[i])
    else:
        bagofwords[alltex.id_text[i]] = np.array([alltex.Term[i]])


In [102]:
#sanity checks
np.in1d(list(bagofwords.keys()), np.unique(alltex.id_text))
len(list(bagofwords.keys())) == np.unique(alltex.id_text).shape[0]

True

In [103]:
stopwords = np.array(pd.read_csv("../stopwords_top21.txt", header=None)[0])
stopdocs = np.array(pd.read_csv("../stopdocuments_less50.txt", header = None)[0])

In [104]:
strings = [" ".join(i) for j,i in bagofwords.items() if j not in stopdocs]

In [105]:
cv = CountVectorizer(stop_words= list(stopwords))
dtm = cv.fit_transform(strings)
tt = TfidfTransformer(norm='l1',use_idf=True)
dtm_tf = tt.fit_transform(dtm)

In [106]:
pd.DataFrame(tfidf.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5083,5084,5085,5086,5087,5088,5089,5090,5091,5092
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
1,0.106845,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
5,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.15179,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
6,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
7,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
8,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
9,0.047386,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0


To get distances, just use euclidian_distances or cosine_similarity

In [80]:
euclidean_distances(tfidf.toarray()[1, :], tfidf.toarray()[0,:])



array([[ 1.37105333]])

In [81]:
cosine_similarity(tfidf.toarray()[1, :], tfidf.toarray()[0,:])



array([[ 0.06010638]])