In [1]:
import pandas as pd
import numpy as np
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec

Split COLUMN_NAME

In [2]:
# Split by UpperCase
def splitAtUpperCase(s):
    for i in range(len(s)-1)[::-1]:
        if s[i].isupper() and s[i+1].islower():
            s = s[:i]+' '+s[i:]
        if s[i].isupper() and s[i-1].islower():
            s = s[:i]+' '+s[i:]
            
    return s[1:]  #s.split()

# e.g.
splitAtUpperCase('TheLongANDWindingRoad')

'The Long AND Winding Road'

In [12]:
def process_data(df):

    df = df[['COLUMN_NAME', 'TABLE_NAME', 'ORDINAL_POSITION']]

    for index, row in df.iterrows():
        df.loc[index,'COLUMN_NAME'] = splitAtUpperCase(row['COLUMN_NAME'])
        
    df = df.applymap(str)

    # Grouping one form -> one row of dataframe
    gdf = df.groupby('TABLE_NAME')
    return gdf

Read Train Data

In [13]:
df = pd.read_csv('data/Form relationships.csv')
gdf = process_data(df)
gdf.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Unnamed: 0,COLUMN_NAME,TABLE_NAME,ORDINAL_POSITION
0,Data Protection Key Id,__CDDataProtectionKeys,1
1,Friendly Name,__CDDataProtectionKeys,2
2,Xml,__CDDataProtectionKeys,3
3,System Name,__CDDataProtectionKeys,4
4,Migration Id,__MigrationHistory,1
...,...,...,...
2567,User Profile Event Id,UserProfileEvent,1
2568,Event Type,UserProfileEvent,2
2569,Event Date Time,UserProfileEvent,3
2570,Event Payload,UserProfileEvent,4


Read Test Data

In [11]:
ddf = pd.read_csv('data/Form relationships_dummy tables.csv')
gdf_test = process_data(ddf)
gdf_test.get_group('A')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Unnamed: 0,COLUMN_NAME,TABLE_NAME,ORDINAL_POSITION
0,Ontology Audit Id,A,1
1,Ontology Event Id,A,2
2,Action,A,3
3,Action Date,A,4
4,Action User Profile Id,A,5
5,Field,A,6
6,Field Name,A,7
7,Previous Value,A,8
8,New Value,A,9
9,Ontology Id,A,10


Dataframe to Sentence

In [33]:
# Dataframe to one paragraph

def stringify(df, option_tb = 0, option_p = 0):
    
    if option_tb == 0 or option_tb == 1: 
        df = df.drop(columns=['TABLE_NAME'])

    if option_p == 0: 
        df = df.drop(columns=['ORDINAL_POSITION'])
            
    lst = df.values.tolist()

    sentence = ''
    for row in lst:
        for el in row:
            if el != 'nan':
                sentence = sentence + el + ' ' #''.join(dictionary etc)
    return sentence

# iterate form -> sentence -> concatenat into a list [,,]
def df_to_sentence(gdf, option_tb = 0, option_p = 0):
    res = []
    
    for table_name, df in gdf:
        sentence = stringify(df,option_tb, option_p)
        if option_tb == 1:
            sentence = sentence + table_name
        res.append({table_name: sentence})
    return res

# Tokenizing / Tagging Data
# Don't need to stringify -> tokenize it again because we have tokenized dataset already
def tagging_data(data):
    res = []
    
    for i, _d in enumerate(data):
        for x in _d:
            res.append(TaggedDocument(words=word_tokenize(_d[x]), tags=[x]))
    return res


Processing Train Dataset

In [78]:
# option_tb 2 : keep table name multiple times
# option_tb 1 : keep table name once
# option_tb 0 : drop table name

# option_p 1 : keep ordinal position
# option_p 0 : drop ordinal position

train_data = df_to_sentence(gdf,option_tb=1,option_p=0)
train_data[0]

{'AgentAction': 'Agent Action Id Micro Service Agent Id Name Description ML Data Type Id Micro Service Implementation AgentAction'}

In [79]:
tagged_data_train = tagging_data(train_data)
tagged_data_train[0]


TaggedDocument(words=['Agent', 'Action', 'Id', 'Micro', 'Service', 'Agent', 'Id', 'Name', 'Description', 'ML', 'Data', 'Type', 'Id', 'Micro', 'Service', 'Implementation', 'AgentAction'], tags=['AgentAction'])

Processing Test Dataset

In [80]:
# iterate form -> sentence -> concatenat into a list [,,]
test_data = df_to_sentence(gdf_test,1,0)
test_data[0]

{'A': 'Ontology Audit Id Ontology Event Id Action Action Date Action User Profile Id Field Field Name Previous Value New Value Ontology Id A'}

In [81]:
tagged_data_test= tagging_data(test_data)
tagged_data_test[0]

TaggedDocument(words=['Ontology', 'Audit', 'Id', 'Ontology', 'Event', 'Id', 'Action', 'Action', 'Date', 'Action', 'User', 'Profile', 'Id', 'Field', 'Field', 'Name', 'Previous', 'Value', 'New', 'Value', 'Ontology', 'Id', 'A'], tags=['A'])

Combine Train + Test Data

In [84]:
comb_data = train_data+test_data
tagged_data_comb = tagging_data(comb_data)

Train w/ Train data

In [93]:
def train(tagged_data):
    model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=500)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=500)
    model.save("data/d2v.model")
    return model

#model = train(tagged_data_train) 
model = train(tagged_data_comb) 


Result

In [86]:
# Pulling similar form ranking of form 'OntologyAudit'
similar_doc = model.docvecs.most_similar('OntologyAudit')
similar_doc

[('A', 0.9907042980194092),
 ('A4', 0.9862944483757019),
 ('A2', 0.9831196665763855),
 ('A1', 0.9797560572624207),
 ('A3', 0.9779426455497742),
 ('A+B', 0.8618450164794922),
 ('OntologyAttachment', 0.7445945143699646),
 ('OntologyEntityAudit', 0.7303792834281921),
 ('B', 0.7225223183631897),
 ('B1', 0.7136251330375671)]

In [None]:
# Pulling similar form ranking of form 'UserProfile'
similar_doc = model.docvecs.most_similar('UserProfile')
similar_doc

: 

Load saved model

In [40]:
loaded_model = Doc2Vec.load("data/d2v.model")

Pairwise matrix for train data

In [41]:
#gdf = odf.groupby('TABLE_NAME')
tables = pd.DataFrame(gdf).loc[:,0]
tables

0                         AgentAction
1                    AgentActionAudit
2                    AgentActionEvent
3           AgentActionImplementation
4      AgentActionImplementationAudit
                    ...              
339                       UserProfile
340                  UserProfileAudit
341                  UserProfileEvent
342            __CDDataProtectionKeys
343                __MigrationHistory
Name: 0, Length: 344, dtype: object

In [42]:
print (loaded_model.docvecs.similarity('UserProfile', 'UserPreference'))
print (loaded_model.docvecs.distance('UserProfile', 'UserPreference'))

0.83749205
0.16250795125961304


Pairwise matrix for test data

In [43]:
test_tables = pd.DataFrame(gdf_test).loc[:,0]
test_tables

0      A
1    A+B
2     A1
3     A2
4     A3
5     A4
6      B
7     B1
8      C
9      D
Name: 0, dtype: object

In [44]:
dummy_table = {}

for i, val in enumerate(test_tables):
    dummy_table[val] = i

dummy_table['A']

0

Calculate dist A-A1 method 1

In [52]:
model.docvecs.similarity_unseen_docs(model, [test_data[dummy_table['A2']]['A2']], [test_data[dummy_table['A4']]['A4']])

-0.0031496563

Calculate dist A-A1 method 2 - result is the same

In [53]:
from numpy import zeros, random, sum as np_sum, add as np_add, concatenate, \
    repeat as np_repeat, array, float32 as REAL, empty, ones, memmap as np_memmap, \
    sqrt, newaxis, ndarray, dot, vstack, dtype, divide as np_divide

from gensim import utils, matutils 

alpha=0.1
min_alpha=0.0001
steps=5

d1 = model.infer_vector(doc_words=[test_data[dummy_table['A2']]['A2']], alpha=alpha, min_alpha=min_alpha, steps=steps)
d2 = model.infer_vector(doc_words=[test_data[dummy_table['A4']]['A4']] , alpha=alpha, min_alpha=min_alpha, steps=steps)

similarity_score = dot(matutils.unitvec(d1), matutils.unitvec(d2))

similarity_score

-0.0031496563

In [None]:
    #discussion on unseen_docs
           
    # Gensim code https://tedboy.github.io/nlps/_modules/gensim/models/doc2vec.html
    # discussion https://stackoverflow.com/questions/55924378/doc2vec-finding-document-similarity-in-test-data
    
    # def similarity(self, d1, d2):
    #     """
    #     Compute cosine similarity between two docvecs in the trained set, specified by int index or
    #     string tag. (TODO: Accept vectors of out-of-training-set docs, as if from inference.)
    #     """
    #     return dot(matutils.unitvec(self[d1]), matutils.unitvec(self[d2]))
    
    # def similarity_unseen_docs(self, model, doc_words1, doc_words2, alpha=0.1, min_alpha=0.0001, steps=5):
        # """
        # Compute cosine similarity between two post-bulk out of training documents.

        # Document should be a list of (word) tokens.
        # """
        # d1 = model.infer_vector(doc_words=doc_words1, alpha=alpha, min_alpha=min_alpha, steps=steps)
        # d2 = model.infer_vector(doc_words=doc_words2, alpha=alpha, min_alpha=min_alpha, steps=steps)
        # return dot(matutils.unitvec(d1), matutils.unitvec(d2))

: 

Matrix for Train data

In [54]:
# table x table score matrix
lst = []

for x in tables:
    score_x = []
    for y in tables:
        if x == y:
            score_x.append(0) # res dist is not exactly 0. Set to 0 here.
        else:
            score_x.append(loaded_model.docvecs.distance(x,y)) 
            
    lst.append(score_x)
    

In [55]:
# list to df
matrix = pd.DataFrame.from_records(lst)
matrix.set_axis(tables, axis=1, inplace=True)
matrix.rename(tables, inplace=True)
matrix.head()

Unnamed: 0,AgentAction,AgentActionAudit,AgentActionEvent,AgentActionImplementation,AgentActionImplementationAudit,AgentActionImplementationEvent,AgentObservable,AgentObservableAudit,AgentObservableEvent,AgentVersion,...,TreeVariableDeclaration,TreeVariableDeclarationAudit,TreeVariableDeclarationEvent,TreeVariableEvent,UserPreference,UserProfile,UserProfileAudit,UserProfileEvent,__CDDataProtectionKeys,__MigrationHistory
AgentAction,0.0,0.519217,0.757169,0.253857,0.389246,0.569316,0.266613,0.383427,0.499094,0.443747,...,0.733058,0.725617,0.854158,0.977543,0.706001,0.886102,1.016597,1.146404,0.448697,0.406012
AgentActionAudit,0.519217,0.0,0.192006,0.205027,0.1174,0.204123,0.371758,0.105497,0.253541,0.388877,...,0.956809,0.652752,0.833195,0.914196,0.745245,0.966017,0.620342,0.797183,0.797306,0.541934
AgentActionEvent,0.757169,0.192006,0.0,0.462492,0.375644,0.126873,0.649904,0.371671,0.168967,0.598176,...,1.171511,0.920973,0.67842,0.715443,0.955958,1.192743,0.720954,0.511798,1.108008,0.783279
AgentActionImplementation,0.253857,0.205027,0.462492,0.0,0.132976,0.305611,0.17766,0.189047,0.331681,0.276178,...,0.776394,0.732144,0.884635,1.030616,0.509477,0.733494,0.785005,0.93907,0.504218,0.338081
AgentActionImplementationAudit,0.389246,0.1174,0.375644,0.132976,0.0,0.169465,0.249242,0.096488,0.302046,0.352695,...,0.74425,0.566929,0.79651,0.93454,0.60076,0.818109,0.635289,0.855649,0.748829,0.527671


In [90]:
# df to csv
matrix.to_csv('result/result_matrix.csv')

Matrix for Test data

In [59]:
# table x table score matrix
# distance = 1 - similarity score
lst_test = []

for x in test_tables:
    score_x = []
    for y in test_tables:
        if x == y:
            score_x.append(0) # res dist is not exactly 0. Set to 0 here.
        else:
            dist = 1 - (loaded_model.docvecs.similarity_unseen_docs(loaded_model, [x],[y])) #np.abs
            score_x.append(dist) 
            
    lst_test.append(score_x)

In [60]:
# list to df
matrix_test = pd.DataFrame.from_records(lst_test)
matrix_test.set_axis(test_tables, axis=1, inplace=True)
matrix_test.rename(test_tables, inplace=True)
matrix_test.head()

Unnamed: 0,A,A+B,A1,A2,A3,A4,B,B1,C,D
A,0.0,0.947958,1.163782,1.231541,0.968117,0.904569,0.965783,1.022006,1.17108,0.522372
A+B,0.947958,0.0,0.89897,0.824369,0.971226,0.809922,1.125264,0.927892,1.052653,0.85241
A1,1.163782,0.89897,0.0,0.828293,0.891251,1.006988,0.907615,0.907327,1.05393,0.974571
A2,1.231541,0.824369,0.828293,0.0,1.063617,0.942924,0.902616,0.852147,0.850634,1.151293
A3,0.968117,0.971226,0.891251,1.063617,0.0,1.07061,0.98951,0.496658,1.072097,0.98311


Distance Tree for Training Data

In [61]:
from skbio import DistanceMatrix
from skbio.tree import nj
#run: pip install scikit-bio

In [62]:
dm = DistanceMatrix(matrix, tables)
tree = nj(dm)
str_tree = tree.ascii_art()
#print(str_tree)

In [92]:
text_file = open("result/tree_train.txt", "w")
text_file.write(str_tree)
text_file.close()

Distance Tree for Test Data

In [91]:
dm = DistanceMatrix(matrix_test, test_tables)
tree = nj(dm)
str_tree = tree.ascii_art()

text_file = open("result/tree_test.txt", "w")
text_file.write(str_tree)
text_file.close()