In [1]:
import pandas as pd
from sklearn.metrics import f1_score

# Load the MSRP training pairs from a tab-separated file; '^' is passed as the
# quote character.
msrp = pd.read_csv('data/processed/msrp_train_translated_id.txt', sep='\t', quotechar='^')

In [2]:
def inspect(row):
    """Print both sentences of a pair, then its label, one value per line.

    ``row`` is anything indexable by the keys 'sentence1', 'sentence2' and
    'label' (for example a DataFrame row).
    """
    for field in ('sentence1', 'sentence2', 'label'):
        print(row[field])
    

In [3]:
# Print the sentence pair and label of the row whose index label is 101.
inspect(msrp.loc[101])

Lampu keamanan juga telah dipasang dan polisi telah menyapu lahan untuk perangkap booby.
Lampu keamanan juga telah dipasang di gudang di dekat gerbang depan.
0


# 2.A. Jaccard Score

In [4]:
from mpstemmer import MPStemmer
import string

# Stemmer instance used by preprocess_sentence below.
stemmer = MPStemmer()

def preprocess_sentence(list_of_sentences):
    """Strip punctuation from, lowercase and stem every sentence.

    Each sentence is printed twice: once after punctuation removal and once
    after stemming.

    Returns a list with one processed string per input sentence, in input
    order.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    results = []
    for raw in list_of_sentences:
        without_punct = raw.translate(strip_punctuation)
        print(without_punct)
        stemmed = stemmer.stem_kalimat(without_punct.lower())
        print(stemmed)
        results.append(stemmed)
    return results

In [5]:
# One-row working example (positional row 101); copied so that columns added
# to it do not touch `msrp`.
instance = msrp.iloc[101:102].copy()

In [6]:
# Store the punctuation-free, lowercased, stemmed form of each sentence in new
# columns next to the raw text.
instance['preprocessed_sentence1'] = preprocess_sentence(instance['sentence1'])
instance['preprocessed_sentence2'] = preprocess_sentence(instance['sentence2'])

Lampu keamanan juga telah dipasang dan polisi telah menyapu lahan untuk perangkap booby
lampu aman juga telah pasang dan polisi telah sapu lahan untuk perangkap booby
Lampu keamanan juga telah dipasang di gudang di dekat gerbang depan
lampu aman juga telah pasang di gudang di dekat gerbang depan


In [24]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import jaccard_score

def extract_feat_1(df_train, df_test, inductive=True):
    """Compute the token-level Jaccard similarity of every sentence pair.

    A binary bag-of-words vocabulary is fitted on the preprocessed sentences
    (training sentences only when ``inductive`` is true, training and test
    sentences otherwise). Each sentence becomes a 0/1 vector and the feature
    of a pair is the Jaccard score of its two vectors.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Must provide the columns 'preprocessed_sentence1',
        'preprocessed_sentence2' and 'label'.
    inductive : bool, default True
        Whether the vocabulary is fitted on the training frame only.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Train and test frames with the columns 'Jaccard_Score' and 'label'.
        Rows are in input order and carry a fresh 0..n-1 index.
    """
    if inductive:
        all_sentences = np.concatenate([df_train['preprocessed_sentence1'], df_train['preprocessed_sentence2']])
    else:
        all_sentences = np.concatenate([df_train['preprocessed_sentence1'], df_train['preprocessed_sentence2'],
                                        df_test['preprocessed_sentence1'], df_test['preprocessed_sentence2']])
    vec = CountVectorizer(binary=True)
    vec.fit(all_sentences)

    X1_train = vec.transform(df_train['preprocessed_sentence1']).toarray()
    X2_train = vec.transform(df_train['preprocessed_sentence2']).toarray()
    print(X1_train)
    print(X2_train)

    X1_test = vec.transform(df_test['preprocessed_sentence1']).toarray()
    X2_test = vec.transform(df_test['preprocessed_sentence2']).toarray()

    feat_1_train = [jaccard_score(x1, x2, average='binary') for x1, x2 in zip(X1_train, X2_train)]
    feat_1_test = [jaccard_score(x1, x2, average='binary') for x1, x2 in zip(X1_test, X2_test)]
    print(feat_1_train)

    # The feature frames have a fresh 0..n-1 index, while the inputs may carry
    # any index (e.g. a slice of a larger frame). Assigning the label Series
    # directly would align on index and yield NaN labels, so copy the values
    # positionally instead.
    df_feat_train = pd.DataFrame(feat_1_train, columns=['Jaccard_Score'])
    df_feat_train['label'] = df_train['label'].to_numpy()

    df_feat_test = pd.DataFrame(feat_1_test, columns=['Jaccard_Score'])
    df_feat_test['label'] = df_test['label'].to_numpy()

    return df_feat_train, df_feat_test

In [25]:
# Try the Jaccard feature on the single example row, passing it as both the
# training and the test frame.
train, test = extract_feat_1(instance, instance, inductive=True)

[[1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1]]
[[1 0 0 1 1 1 1 1 1 0 1 1 0 0 0 1 0]]
[0.29411764705882354]


# 2.B. SMATCH

In [7]:
def inspect_2_b(row):
    """Print the 'amr1' entry of ``row`` followed by its 'amr2' entry."""
    for column in ('amr1', 'amr2'):
        print(row[column])

In [4]:
# Load the AMR graphs of the training pairs; the 'amr1' and 'amr2' columns are
# the ones read by the cells below.
amr = pd.read_csv('data/processed/amr_msrp_train.csv')

In [30]:
# Print both AMR graphs of the row whose index label is 101.
inspect_2_b(amr.loc[101])

# ::id 101
# ::annotator indoamrbart-mbart-fted
# ::snt Lampu keamanan juga telah dipasang dan polisi telah menyapu lahan untuk perangkap booby.
(z0 / dan
    :op1 (z1 / pasang-01
             :ARG1 (z2 / lampu
                       :mod (z3 / keamanan))
             :time (z4 / juga))
    :op2 (z5 / menyapu-01
             :ARG0 (z6 / polisi)
             :ARG1 (z7 / lahan)
             :purpose (z8 / perangkap
                          :mod (z9 / booby))))
# ::id 101
# ::annotator indoamrbart-mbart-fted
# ::snt Lampu keamanan juga telah dipasang di gudang di dekat gerbang depan.
(z0 / pasang-01
    :ARG0 (z1 / orang
              :ARG0-of (z2 / memiliki-peran-org-91
                           :ARG2 (z3 / polisi)))
    :ARG1 (z4 / lampu
              :mod (z5 / keamanan))
    :location (z6 / gudang
                  :location (z7 / dekat
                                :op1 (z8 / pintu
                                         :mod (z9 / depan))))
    :mod (z10 / juga))


In [9]:
import pandas as pd

def load_amr_entries_from_df(df, column_name):
    """Return the AMR graph strings of ``df[column_name]`` without metadata.

    The column values are concatenated with a blank line between them, every
    line that starts with '#' is removed, and the remaining text is split on
    blank lines. Each piece is stripped of surrounding whitespace and empty
    pieces are discarded.
    """
    combined = '\n\n'.join(df[column_name].tolist())
    without_comments = '\n'.join(
        line for line in combined.splitlines() if not line.startswith('#')
    )
    stripped_chunks = (chunk.strip() for chunk in without_comments.split('\n\n'))
    return [chunk for chunk in stripped_chunks if chunk]

In [32]:
from amrlib.evaluate.smatch_enhanced import compute_smatch

# Extract the graph strings of both AMR columns for the example row.
amr1_entries = load_amr_entries_from_df(amr.iloc[101:102], 'amr1')
amr2_entries = load_amr_entries_from_df(amr.iloc[101:102], 'amr2')

# The two lists are paired positionally below, so they must be the same length.
assert len(amr1_entries) == len(amr2_entries), "Number of amr 1 and amr 2 must be the same"

# One SMATCH F-score per pair, in pair order.
smatch_scores = []

# i counts the pairs processed so far and is printed as a progress indicator.
i = 0
for amr1_graph, amr2_graph in zip(amr1_entries, amr2_entries):
    # compute_smatch is called with single-element lists, so each call scores
    # exactly one pair.
    amr1_graph = [amr1_graph]
    amr2_graph = [amr2_graph]
    precision, recall, f_score = compute_smatch(amr1_graph, amr2_graph)
    smatch_scores.append(f_score)
    i = i+1
    print(i)

1


In [33]:
# Display the collected SMATCH F-scores.
smatch_scores

[0.380952380952381]

# 2.C. AMR Concept Jaccard Score


In [5]:
from mpstemmer import MPStemmer
import string
import penman

# Stemmer instance; extract_concepts and extract_feat below do not use it.
stemmer = MPStemmer()


def extract_concepts(amr):
    """Collect the concept labels and selected attribute values of an AMR.

    The result starts with the concept of every instance triple, in graph
    order. It is followed, in triple order, by:

    * the target of every role starting with ':op' or ':time', unless that
      target is 'z' followed only by digits (a variable reference);
    * the target of every role starting with ':wiki', unless it is '-'.
    """
    graph = penman.decode(amr)
    concepts = [instance[2] for instance in graph.instances()]
    for _, role, target in graph.triples:
        if role.startswith((":op", ":time")):
            is_variable = target[0] == "z" and target[1:].isdigit()
            if not is_variable:
                concepts.append(target)
        elif role.startswith(":wiki") and target != "-":
            concepts.append(target)
    return concepts

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import jaccard_score   

def extract_feat(df_train, df_test, inductive=True):
    """Compute the Jaccard similarity of the AMR concept sets of every pair.

    A binary bag-of-words vocabulary is fitted on the space-joined concept
    strings (training rows only when ``inductive`` is true, training and test
    rows otherwise). The token pattern keeps hyphenated tokens such as
    'pasang-01' in one piece. The feature of a pair is the Jaccard score of
    its two 0/1 concept vectors.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Must provide the columns 'concepts_amr1', 'concepts_amr2', 'label'
        and 'feat_smatch'.
    inductive : bool, default True
        Whether the vocabulary is fitted on the training frame only.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Train and test frames with the columns 'Jaccard_Score', 'label' and
        'feat_smatch'. Rows are in input order and carry a fresh 0..n-1 index.
    """
    if inductive:
        all_sentences = np.concatenate([df_train['concepts_amr1'], df_train['concepts_amr2']])
    else:
        all_sentences = np.concatenate([df_train['concepts_amr1'], df_train['concepts_amr2'],
                                        df_test['concepts_amr1'], df_test['concepts_amr2']])
    vec = CountVectorizer(binary=True, token_pattern=r'\b\w[\w-]*\b')
    vec.fit(all_sentences)

    X1_train = vec.transform(df_train['concepts_amr1']).toarray()
    X2_train = vec.transform(df_train['concepts_amr2']).toarray()
    print(X1_train)
    print(X2_train)

    X1_test = vec.transform(df_test['concepts_amr1']).toarray()
    X2_test = vec.transform(df_test['concepts_amr2']).toarray()

    feat_1_train = [jaccard_score(x1, x2, average='binary') for x1, x2 in zip(X1_train, X2_train)]
    feat_1_test = [jaccard_score(x1, x2, average='binary') for x1, x2 in zip(X1_test, X2_test)]
    print(feat_1_train)

    # The feature frames have a fresh 0..n-1 index, while the inputs may carry
    # any index (e.g. a slice of a larger frame). Assigning the Series directly
    # would align on index and yield NaN values, so copy the carried-over
    # columns positionally instead.
    df_feat_train = pd.DataFrame(feat_1_train, columns=['Jaccard_Score'])
    df_feat_train['label'] = df_train['label'].to_numpy()
    df_feat_train['feat_smatch'] = df_train['feat_smatch'].to_numpy()

    df_feat_test = pd.DataFrame(feat_1_test, columns=['Jaccard_Score'])
    df_feat_test['label'] = df_test['label'].to_numpy()
    df_feat_test['feat_smatch'] = df_test['feat_smatch'].to_numpy()

    return df_feat_train, df_feat_test


In [35]:
# One-row working example (positional row 101) of the AMR frame, copied so new
# columns do not touch `amr`.
instance_2_c = amr.iloc[101:102].copy()

In [37]:
# Extract the concept list of each AMR graph and store it as one
# space-separated string per row.
instance_2_c['concepts_amr1'] = instance_2_c['amr1'].apply(extract_concepts).apply(' '.join)
instance_2_c['concepts_amr2'] = instance_2_c['amr2'].apply(extract_concepts).apply(' '.join)

['dan', 'pasang-01', 'lampu', 'keamanan', 'juga', 'menyapu-01', 'polisi', 'lahan', 'perangkap', 'booby']
['pasang-01', 'orang', 'memiliki-peran-org-91', 'polisi', 'lampu', 'keamanan', 'gudang', 'dekat', 'pintu', 'depan', 'juga']


In [41]:
# Show the space-joined concept strings of the row with index label 101.
print(instance_2_c.loc[101]['concepts_amr1'])
print(instance_2_c.loc[101]['concepts_amr2'])

dan pasang-01 lampu keamanan juga menyapu-01 polisi lahan perangkap booby
pasang-01 orang memiliki-peran-org-91 polisi lampu keamanan gudang dekat pintu depan juga


In [None]:
# NOTE(review): extract_feat also reads the 'label' and 'feat_smatch' columns;
# the AMR frame displayed in this notebook does not appear to contain them --
# confirm they are added to instance_2_c before running this cell.
train, test = extract_feat(instance_2_c, instance_2_c, inductive=True)

# 3.A. TF-IDF + LSA on AMR Concepts


In [11]:
# Display the sentence-level example row.
instance

Unnamed: 0,sentence1,sentence2,label,preprocessed_sentence1,preprocessed_sentence2
101,Lampu keamanan juga telah dipasang dan polisi ...,Lampu keamanan juga telah dipasang di gudang d...,0,lampu aman juga telah pasang dan polisi telah ...,lampu aman juga telah pasang di gudang di deka...


In [24]:
# One-row working example (positional row 101) of the AMR frame, copied so new
# columns do not touch `amr`.
instance_3_a= amr.iloc[101:102].copy()

In [25]:
# Extract the concept list of each AMR graph and store it as one
# space-separated string per row.
instance_3_a['concepts_amr1'] = instance_3_a['amr1'].apply(extract_concepts).apply(' '.join)
instance_3_a['concepts_amr2'] = instance_3_a['amr2'].apply(extract_concepts).apply(' '.join)

In [26]:
# Show the space-joined concept strings of the first row.
print(instance_3_a.iloc[0]['concepts_amr1'])
print(instance_3_a.iloc[0]['concepts_amr2'])

dan pasang-01 lampu keamanan juga menyapu-01 polisi lahan perangkap booby
pasang-01 orang memiliki-peran-org-91 polisi lampu keamanan gudang dekat pintu depan juga


In [100]:

import numpy as np
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import vstack, hstack

def extr_feat(df_train, df_test, factor=True, n_comp=100, inductive=True):
    """Build LSA pair features from TF-IDF vectors of the AMR concept strings.

    A TF-IDF vocabulary is fitted on the concept strings (training rows only
    when ``inductive`` is true, training and test rows otherwise). A truncated
    SVD is fitted on the stacked training vectors of both AMRs. For each pair
    the two projected vectors are combined into one feature row: their
    element-wise sum followed by their element-wise absolute difference.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Must provide the columns 'concepts_amr1' and 'concepts_amr2'. The
        frames are only read, never modified.
    factor : bool, default True
        If true, the number of SVD components is
        ``floor(n_comp * vocabulary size)``; otherwise ``n_comp`` is used
        as the component count directly.
    n_comp : int or float, default 100
        Component count, or vocabulary fraction when ``factor`` is true.
    inductive : bool, default True
        Whether the vocabulary is fitted on the training frame only.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Train and test feature frames with columns 'f0', 'f1', ... (sum
        columns first, then difference columns), indexed like the
        corresponding input frame.
    """
    if inductive:
        all_sent = np.concatenate([df_train['concepts_amr1'], df_train['concepts_amr2']])
    else:
        all_sent = np.concatenate([df_train['concepts_amr1'], df_train['concepts_amr2'],
                                    df_test['concepts_amr1'], df_test['concepts_amr2']])
    vec = TfidfVectorizer(token_pattern=r'\b\w[\w-]*\b')
    vec.fit(all_sent)

    X1_train = vec.transform(df_train['concepts_amr1'])
    X2_train = vec.transform(df_train['concepts_amr2'])
    # Rows 0..n-1 hold the first AMR of each pair, rows n..2n-1 the second.
    X_train = vstack((X1_train, X2_train))
    X1_test = vec.transform(df_test['concepts_amr1'])
    # Fixed: this previously transformed 'concepts_amr1' a second time, which
    # made every test pair look identical (all difference features zero).
    X2_test = vec.transform(df_test['concepts_amr2'])

    print("vocab size:" ,len(vec.vocabulary_))
    if (factor):
        n_components = math.floor(n_comp * len(vec.vocabulary_))
    else:
        n_components = n_comp
    lsa = TruncatedSVD(n_components=n_components, random_state=42)
    X_train = lsa.fit_transform(X_train)
    # Undo the stacking: first half is AMR 1, second half is AMR 2.
    X_train = np.vsplit(X_train, 2)
    sum_res = X_train[0] + X_train[1]
    diff_res = np.abs(X_train[0] - X_train[1])
    feat_1_train = np.column_stack((sum_res, diff_res))
    lsa_df_train = pd.DataFrame(feat_1_train.tolist(), columns=[f'f{i}' for i in range(feat_1_train.shape[1])], index=df_train.index)
    print(feat_1_train)

    X1_test = lsa.transform(X1_test)
    X2_test = lsa.transform(X2_test)
    sum_res = X1_test + X2_test
    diff_res = np.abs(X1_test - X2_test)
    feat_1_test = np.column_stack((sum_res, diff_res))
    lsa_df_test = pd.DataFrame(feat_1_test.tolist(), columns=[f'f{i}' for i in range(feat_1_test.shape[1])], index=df_test.index)

    return lsa_df_train, lsa_df_test


In [78]:
# factor=True: n_comp=0.5 is a fraction of the vocabulary size, not a
# component count.
train, test = extr_feat(instance_3_a, instance_3_a, factor=True, n_comp=0.5, inductive=True)


vocab size: 41
[[ 0.46316594 -0.70505257 -0.47469192 -0.25108916]
 [ 0.6940693   0.47880313  0.22445288 -0.48850411]]
[[ 0.6115788  -0.56750989  0.4727312   0.28360027]
 [ 0.66464144  0.51352621 -0.33858422  0.42415011]]


In [101]:
# factor=False: n_comp=8 is used directly as the number of LSA components.
train, test = extr_feat(instance_3_a, instance_3_a, factor=False, n_comp=8, inductive=True)


vocab size: 125
[[ 3.23017042e-01  6.88409879e-01 -9.80767675e-01 -1.11356075e+00
  -8.88529300e-02 -1.34762002e-01  2.67368222e-01  1.42054712e-01
   1.22844301e-01  9.18883274e-02  7.61860503e-03  6.08943718e-02
   1.70787784e-02  6.16997418e-03  9.32053389e-02  1.06340122e-01]
 [ 1.04952311e+00 -1.40112162e-01 -7.08967489e-02 -1.30320959e-01
   2.68625995e-01 -2.81825081e-01 -6.46647205e-01 -9.52837034e-01
   9.63392207e-02  7.38595214e-03  3.18781827e-02  1.84350970e-02
   8.04234272e-02  3.30210828e-02  7.43508950e-02  1.49751620e-01]
 [ 8.56483788e-01 -7.23097221e-02  5.26473529e-01  5.30682893e-02
  -1.12142490e-01 -4.95616983e-01  1.22133812e+00 -1.94090747e-01
   1.64244954e-01  9.91003221e-02  7.75966297e-02  1.37849438e-03
   1.60610290e-03  5.03545592e-02  1.10087454e-01  2.17652753e-02]
 [ 1.16022720e+00 -2.79136165e-01  1.69202572e-01 -3.83518286e-03
   4.44371597e-02 -4.25562444e-01 -4.68846620e-01  1.01874604e+00
   1.92632432e-01  2.50012869e-02  6.26005857e-02  3.9231

In [28]:
# Display the `train` feature frame.
train

Unnamed: 0,f0,f1,f2,f3
101,1.622212,-4.440892e-16,3.330669e-16,1.169799


# 3.B. PageRank-Weighted Term Counts + LSA on Merged AMR

In [8]:
def merge_two_amr(amr1, amr2):
    """Merge two AMR graphs into one list of triples, unifying equal concepts.

    Both AMR strings are decoded with penman. The variables of the second
    graph are renamed so they cannot clash with those of the first, its
    triples are added to the first graph, and every variable whose concept
    already belongs to an earlier variable is then replaced by that earlier
    variable.

    Returns a plain list of (source, role, target) triples, not a penman
    graph. Rewritten triples are appended at the end of the list without a
    check for an identical existing triple, so the result can contain
    duplicates.

    Side effect: triples are appended to the decoded first graph (a local
    object); the input strings are not modified.
    """
    g1 = penman.decode(amr1)
    g2 = penman.decode(amr2)
    # Rename the source variable of every g2 triple by replacing its first
    # character with 'y' (z3 -> y3).
    g2 = [('y' + triple[0][1:], triple[1], triple[2]) for triple in g2.triples]
    # Do the same for targets that look like variables ('z' + digits). The
    # renamed triple is appended and the old one removed, so such triples move
    # to the end of the list. Iterate over a copy because g2 is mutated.
    for triple in g2.copy():
        if (triple[2].startswith('z') and triple[2][1:].isdigit()):
            g2.append((triple[0], triple[1], 'y' + triple[2][1:]))
            g2.remove(triple)
    # Add the renamed g2 triples to g1, skipping exact duplicates.
    for triple in g2:
        if triple not in g1.triples:
            g1.triples.append(triple)
            
    # Map each concept to the first variable that carries it. `words` is built
    # as (variable, concept) pairs and then turned into {variable: concept}.
    words = []
    for triple in g1.instances():
        # ada ("exists"): True if this concept already has a canonical variable.
        ada = False
        for tup in words:
            if tup[1] == triple[2]:
                ada = True
        if not ada:
            words.append((triple[0],triple[2]))
    words = dict(words)
    # `graph` stays unchanged and is what the loops iterate over;
    # `graph_merged` is the copy that gets rewritten.
    graph = g1.triples.copy()
    graph_merged = graph.copy()

    for triple in graph:
        # A source variable missing from `words` carries a concept that some
        # earlier variable already owns.
        if triple[0] not in words:
            # Look up that variable's concept through its :instance triple.
            for inner in graph:
                if triple[0]==inner[0] and inner[1]==':instance':
                    # Find the canonical variable `v` of the same concept.
                    for v,i in words.items():
                        if i==inner[2]:
                            change = triple[0]
                            to = v  
                            # Replace `change` wherever it is a source ...
                            temp = graph_merged.copy()
                            for supinner in temp:
                                if supinner[0]==change:
                                    graph_merged.append((to, supinner[1], supinner[2]))
                                    graph_merged.remove(supinner)
                            # ... and wherever it is a target.
                            temp = graph_merged.copy()
                            for supinner in temp:    
                                if supinner[2]==change:
                                    graph_merged.append((supinner[0], supinner[1], to))
                                    graph_merged.remove(supinner)
    return graph_merged 

In [30]:
# Merge the two AMR graphs of every row into a single list of triples.
instance_3_a['merged_amr'] = instance_3_a.apply(lambda row: merge_two_amr(row['amr1'], row['amr2']), axis=1)


In [11]:
# Display the example frame, now including the 'merged_amr' column.
instance_3_a

Unnamed: 0,amr1,amr2,concepts_amr1,concepts_amr2,merged_amr
101,# ::id 101\n# ::annotator indoamrbart-mbart-ft...,# ::id 101\n# ::annotator indoamrbart-mbart-ft...,dan pasang-01 lampu keamanan juga menyapu-01 p...,pasang-01 orang memiliki-peran-org-91 polisi l...,"[(z0, :instance, dan), (z0, :op1, z1), (z1, :i..."
102,# ::id 102\n# ::annotator indoamrbart-mbart-ft...,# ::id 102\n# ::annotator indoamrbart-mbart-ft...,umum-01 orang nama hanya menggantikan-01 orang...,hitung-01 partai besar orang nama memiliki-per...,"[(z0, :instance, umum-01), (z0, :ARG0, z1), (z..."
103,# ::id 103\n# ::annotator indoamrbart-mbart-ft...,# ::id 103\n# ::annotator indoamrbart-mbart-ft...,katakan-01 dia mungkin-01 memperluas-01 serang...,katakan-01 orang nama mungkin-01 melebar-01 me...,"[(z0, :instance, katakan-01), (z0, :ARG0, z1),..."
104,# ::id 104\n# ::annotator indoamrbart-mbart-ft...,# ::id 104\n# ::annotator indoamrbart-mbart-ft...,luncurkan-01 orang nama orang berhasil-01 doku...,mungkin-01 ini dokumen apapun mendukung-01 pro...,"[(z0, :instance, luncurkan-01), (z0, :ARG0, z1..."
105,# ::id 105\n# ::annotator indoamrbart-mbart-ft...,# ::id 105\n# ::annotator indoamrbart-mbart-ft...,katakan-01 orang nama memiliki-peran-org-91 pe...,memilukan-01 komunitas seluruh universitas nam...,"[(z0, :instance, katakan-01), (z0, :ARG0, z1),..."
106,# ::id 106\n# ::annotator indoamrbart-mbart-ft...,# ::id 106\n# ::annotator indoamrbart-mbart-ft...,ujar-01 dia obligasi-01 aku bertemu-01 salah,obligasi-01 aku berada-01 bertemu-01 salah dan...,"[(z0, :instance, ujar-01), (z0, :ARG0, z1), (z..."
107,# ::id 107\n# ::annotator indoamrbart-mbart-ft...,# ::id 107\n# ::annotator indoamrbart-mbart-ft...,kirim-01 agen FBI selusin dan aman-01 bukti an...,kirim-01 orang nama memiliki-peran-org-91 orga...,"[(z0, :instance, kirim-01), (z0, :ARG0, z1), (..."
108,# ::id 108\n# ::annotator indoamrbart-mbart-ft...,# ::id 108\n# ::annotator indoamrbart-mbart-ft...,operasi-01 mereka operasi hanya kuantitas-seme...,kelangsungan-01 hidup kuantitas-sementara bula...,"[(z0, :instance, operasi-01), (z0, :ARG0, z1),..."


In [10]:
# Print the merged triples of the first row.
print(instance_3_a.iloc[0]['merged_amr'])

[('z0', ':instance', 'dan'), ('z0', ':op1', 'z1'), ('z1', ':instance', 'pasang-01'), ('z1', ':ARG1', 'z2'), ('z2', ':instance', 'lampu'), ('z2', ':mod', 'z3'), ('z3', ':instance', 'keamanan'), ('z1', ':time', 'z4'), ('z4', ':instance', 'juga'), ('z0', ':op2', 'z5'), ('z5', ':instance', 'menyapu-01'), ('z5', ':ARG0', 'z6'), ('z6', ':instance', 'polisi'), ('z5', ':ARG1', 'z7'), ('z7', ':instance', 'lahan'), ('z5', ':purpose', 'z8'), ('z8', ':instance', 'perangkap'), ('z8', ':mod', 'z9'), ('z9', ':instance', 'booby'), ('y1', ':instance', 'orang'), ('y2', ':instance', 'memiliki-peran-org-91'), ('y6', ':instance', 'gudang'), ('y7', ':instance', 'dekat'), ('y8', ':instance', 'pintu'), ('y9', ':instance', 'depan'), ('y2', ':ARG0', 'y1'), ('y6', ':location', 'y7'), ('y7', ':op1', 'y8'), ('y8', ':mod', 'y9'), ('z1', ':instance', 'pasang-01'), ('z1', ':ARG0', 'y1'), ('z1', ':location', 'y6'), ('z6', ':instance', 'polisi'), ('y2', ':ARG2', 'z6'), ('z2', ':instance', 'lampu'), ('z1', ':ARG1', 'z2'

In [27]:
# Replace the index with 0..n-1 and discard the old index values.
instance_3_a = instance_3_a.reset_index(drop=True)

In [21]:
# Drop a leftover 'index' column if one exists. reset_index(drop=True) does not
# create such a column, so a missing column must not raise (errors='ignore'
# also makes this cell safe to re-run).
instance_3_a = instance_3_a.drop(columns=['index'], errors='ignore')

In [32]:



import numpy as np
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import vstack, hstack, csr_matrix
import networkx as nx
import re

def calculate_pagerank(graph_merged):
    """Return the PageRank score of every node in a list of AMR triples.

    Each triple whose role is not ':instance' contributes a directed edge
    from its source to its target. The result is the dict produced by
    ``nx.pagerank`` with a damping factor of 0.85 and at most 1000
    iterations.
    """
    digraph = nx.DiGraph()
    digraph.add_edges_from(
        (source, target)
        for source, role, target in graph_merged
        if role != ':instance'
    )
    return nx.pagerank(digraph, alpha=0.85, max_iter=1000)

def _weight_counts_by_pagerank(X1, X2, merged_amrs, vocabulary):
    """Scale term counts by the PageRank of the merged-AMR nodes they belong to.

    ``X1`` and ``X2`` are the sparse count matrices of the first and second
    AMR of each pair; ``merged_amrs`` yields one merged triple list per pair,
    in the same row order; ``vocabulary`` maps a token to its column.

    For every node with a PageRank score, each triple that is the node's
    ':instance' triple, or whose target equals the node, contributes once:
    the triple's target is reduced to letters, digits and '-', lowercased,
    and, if it is in the vocabulary, that column of the pair's row is
    multiplied by the node's score in both matrices.

    Returns the two weighted matrices in CSR format; the inputs are not
    modified.
    """
    # Item assignment on CSR can change the sparsity structure and emit
    # SciPy's SparseEfficiencyWarning; LIL is the format meant for this.
    X1 = X1.astype(float).tolil()
    X2 = X2.astype(float).tolil()
    # Rows are addressed by position, not by DataFrame index label, so frames
    # with any index (e.g. a slice of a larger frame) are handled correctly.
    for row_pos, merged in enumerate(merged_amrs):
        pagerank = calculate_pagerank(merged)
        for node, score in pagerank.items():
            for triple in merged:
                if (triple[0] == node and triple[1] == ":instance") or triple[2] == node:
                    word = re.sub(r'[^A-Za-z0-9-]', '', triple[2]).lower()
                    column = vocabulary.get(word)
                    if column is not None:
                        # NOTE(review): a triple listed twice in `merged`
                        # applies the score twice -- confirm this is intended.
                        X1[row_pos, column] *= score
                        X2[row_pos, column] *= score
    return X1.tocsr(), X2.tocsr()


def extr_tf(df_train, df_test, factor=False, n_comp=100, inductive=True):
    """Build LSA pair features from PageRank-weighted AMR concept counts.

    Term counts of the concept strings are scaled by the PageRank scores of
    the corresponding nodes in each pair's merged AMR. A truncated SVD is
    fitted on the stacked weighted training vectors of both AMRs. For each
    pair the two projected vectors are combined into one feature row: their
    element-wise sum followed by their element-wise absolute difference.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Must provide the columns 'concepts_amr1', 'concepts_amr2' and
        'merged_amr'. The frames are only read, never modified, and may carry
        any index.
    factor : bool, default False
        If true, the number of SVD components is
        ``floor(n_comp * vocabulary size)``; otherwise ``n_comp`` is used
        as the component count directly.
    n_comp : int or float, default 100
        Component count, or vocabulary fraction when ``factor`` is true.
    inductive : bool, default True
        Whether the vocabulary is fitted on the training frame only.

    Returns
    -------
    (scipy.sparse.csr_matrix, scipy.sparse.csr_matrix)
        Train and test feature matrices with one row per pair and
        ``2 * n_components`` columns.
    """
    if inductive:
        all_sent = np.concatenate([df_train['concepts_amr1'], df_train['concepts_amr2']])
    else:
        all_sent = np.concatenate([df_train['concepts_amr1'], df_train['concepts_amr2'],
                                    df_test['concepts_amr1'], df_test['concepts_amr2']])
    vec = CountVectorizer(token_pattern=r'\b\w[\w-]*\b')
    vec.fit(all_sent)

    X1_train = vec.transform(df_train['concepts_amr1'])
    print(X1_train.toarray())
    X2_train = vec.transform(df_train['concepts_amr2'])
    print(X2_train.toarray())
    X1_train, X2_train = _weight_counts_by_pagerank(
        X1_train, X2_train, df_train['merged_amr'], vec.vocabulary_)

    print(X1_train.toarray())
    print(X2_train.toarray())

    X1_test = vec.transform(df_test['concepts_amr1'])
    X2_test = vec.transform(df_test['concepts_amr2'])
    X1_test, X2_test = _weight_counts_by_pagerank(
        X1_test, X2_test, df_test['merged_amr'], vec.vocabulary_)

    # Rows 0..n-1 hold the first AMR of each pair, rows n..2n-1 the second.
    X_train = vstack((X1_train, X2_train))
    if (factor):
        n_components = math.floor(n_comp * len(vec.vocabulary_))
    else:
        n_components = n_comp
    lsa = TruncatedSVD(n_components=n_components, random_state=42)
    X_train = lsa.fit_transform(X_train)

    # Undo the stacking: first half is AMR 1, second half is AMR 2.
    X_train = np.vsplit(X_train, 2)
    sum_res = X_train[0] + X_train[1]
    print(X_train[0])
    print(X_train[1])
    diff_res = np.abs(X_train[0] - X_train[1])
    feat_1_train = np.column_stack((sum_res, diff_res))
    print(feat_1_train)
    feat_1_train = csr_matrix(feat_1_train)

    X1_test = lsa.transform(X1_test)
    X2_test = lsa.transform(X2_test)
    sum_res = X1_test + X2_test
    diff_res = np.abs(X1_test - X2_test)
    feat_1_test = np.column_stack((sum_res, diff_res))
    feat_1_test = csr_matrix(feat_1_test)

    return feat_1_train, feat_1_test


In [33]:
# Run the PageRank-weighted features on the example frame with factor=False
# and n_comp=2, i.e. two LSA components; the returned matrices are discarded.
_, _ = extr_tf(instance_3_a, instance_3_a, False, 2, inductive=True)

[[1 1 0 0 0 1 1 1 1 0 1 0 1 1 0 1]]
[[0 0 1 1 1 1 1 0 1 1 0 1 1 0 1 1]]
[[0.07919248 0.03610859 0.         0.         0.         0.00221301
  0.00579037 0.05068739 0.00221301 0.         0.05145453 0.
  0.00264757 0.05068739 0.         0.0043604 ]]
[[0.         0.         0.07609448 0.12177809 0.04704269 0.00221301
  0.00579037 0.         0.00221301 0.03610859 0.         0.06238864
  0.00264757 0.         0.10078939 0.0043604 ]]
[[0.0005942  0.12421862]]
[[ 0.19559439 -0.00037736]]
[[0.19618859 0.12384125 0.19500019 0.12459598]]


  self._set_intXint(row, col, x.flat[0])
