# Video Data EM Notebook

In [1]:
import sys
import py_entitymatching as em
import pandas as pd
import os
import time

###################################################################
# Regeneration flags. Keep both False to reuse the CSV files already
# saved under ../DATA instead of overwriting them.
#   GENERATE_NEW_LABELED_DATA    - True: label the sample interactively and
#                                  overwrite ../DATA/G.csv; False: load it.
#   GENERATE_NEW_TEST_TRAIN_DATA - True: re-split G and overwrite
#                                  ../DATA/I.csv and ../DATA/J.csv;
#                                  False: load the saved splits.
###################################################################
GENERATE_NEW_LABELED_DATA = False
GENERATE_NEW_TEST_TRAIN_DATA = False



In [2]:
# Report the interpreter and library versions this notebook was run with
for label, version in (('python', sys.version),
                       ('pandas', pd.__version__),
                       ('magellan', em.__version__)):
    print(label + ' version: ' + version)

python version: 2.7.14 |Anaconda, Inc.| (default, Dec  7 2017, 11:07:58) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
pandas version: 0.22.0
magellan version: 0.3.0


Matching two tables typically consists of the following three steps:

**1. Reading the input tables**

**2. Blocking the input tables to get a candidate set**

**3. Matching the tuple pairs in the candidate set**

# Read input tables

In [3]:
# Get the paths
path_A = '../DATA/imdb3_neg_nan.csv'
path_B = '../DATA/thenumbers3_neg_nan.csv'
print(path_A)

../DATA/imdb3_neg_nan.csv


In [4]:
# Load csv files as dataframes and set the key attribute in the dataframe
A = em.read_csv_metadata(path_A, key='id')
B = em.read_csv_metadata(path_B, key='id')
A.head()

No handlers could be found for logger "py_entitymatching.io.parsers"


Unnamed: 0,id,title,year,mpaa,runtime,genres,director,stars,gross
0,0,Toy Story 2,1999,G,92 min,"Animation, Adventure, Comedy",John Lasseter,"Ash Brannon,Lee Unkrich,Tom Hanks,Tim Allen,Joan Cusack,Kelsey Grammer",$245.85M
1,1,The Outlaw Josey Wales,1976,PG,135 min,Western,Clint Eastwood,"Clint Eastwood,Sondra Locke,Chief Dan George,Bill McKinney",$31.80M
2,2,"Monsters, Inc.",2001,G,92 min,"Animation, Adventure, Comedy",Pete Docter,"David Silverman,Lee Unkrich,Billy Crystal,John Goodman,Mary Gibbs,Steve Buscemi",$289.92M
3,3,In the Heat of the Night,1967,Not Rated,110 min,"Crime, Drama, Mystery",Norman Jewison,"Sidney Poitier,Rod Steiger,Warren Oates,Lee Grant",$24.38M
4,4,Chungking Express,1994,PG-13,102 min,"Crime, Drama, Romance",Kar-Wai Wong,"Brigitte Lin,Takeshi Kaneshiro,Tony Chiu-Wai Leung,Faye Wong",$0.60M


In [5]:
B.head()

Unnamed: 0,id,title,year,mpaa,runtime,genres,director,stars,gross
0,0,Effects,2005,-1,-1,-1,Dusty Nelson,-1,-1
1,1,Ek Haseena Thi Ek Deewana Tha,2017,-1,105 minutes,Drama,Suneel Darshan,-1,"$149,491"
2,2,Ekk Albela,2016,-1,-1,Drama,Shekhar Sartandel,-1,"$1,907"
3,3,The Daisy Chain,2009,R,-1,-1,-1,-1,-1
4,4,Dali & I: The Surreal Story,2011,-1,-1,Drama,Andrew Niccol,Al Pacino,-1


### Check for null entries

In [6]:
B.isnull().sum()

id          0
title       0
year        0
mpaa        0
runtime     0
genres      0
director    0
stars       0
gross       0
dtype: int64

In [7]:
A.isnull().sum()

id          0
title       0
year        0
mpaa        0
runtime     0
genres      0
director    0
stars       0
gross       0
dtype: int64

### Check the data size

In [8]:
print('Number of tuples in A: ' + str(len(A)))
print('Number of tuples in B: ' + str(len(B)))
print('Number of tuples in A X B (i.e the cartesian product): ' + str(len(A)*len(B)))

Number of tuples in A: 4291
Number of tuples in B: 31006
Number of tuples in A X B (i.e the cartesian product): 133046746


In [9]:
# Display the keys of the input tables
em.get_key(A), em.get_key(B)

('id', 'id')

In [10]:
# Demonstration only: if the tables are large we can down-sample them like this.
# A1 and B1 are not used by the blocking cells below, which run on the full
# tables A and B.
# NOTE(review): 200 and 1 are presumably the sample size for B and the
# per-tuple sampling parameter for A -- confirm against the em.down_sample docs.
A1, B1 = em.down_sample(A, B, 200, 1, show_progress=False)
len(A1), len(B1)

# But for the purposes of this notebook, we will use the entire table A and B

(127, 200)

# Block tables to get candidate set

Before we do the matching, we would like to remove the obviously non-matching tuple pairs from the input tables to reduce computational complexity. This would reduce the number of tuple pairs considered for matching.
*py_entitymatching* provides four different blockers: (1) attribute equivalence, (2) overlap, (3) rule-based, and (4) black-box. We can use a mix and match of these blockers to form a blocking sequence applied to input tables.

For the matching problem at hand, we know that two movies whose titles share no words will not match. So we decide to apply blocking over titles:

In [11]:
# Initial Blocking plan

# A, B -- overlap blocker [title] --------------------|---> candidate set

In [12]:
# Columns carried over from each input table into the candidate set
output_attrs = ['title', 'year', 'mpaa', 'runtime', 'genres', 'director', 'stars', 'gross']

# Overlap blocker on 'title': a pair is kept only when the two titles
# share at least `overlap_size` tokens
ab = em.OverlapBlocker()
C1 = ab.block_tables(
    A, B, 'title', 'title',
    l_output_attrs=list(output_attrs),
    r_output_attrs=list(output_attrs),
    overlap_size=1
)

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:49


In [13]:
len(C1)

10956134

## Debug blocker output

The number of tuple pairs considered for matching is reduced to 10,956,134 (from 133,046,746, approx. 8%), but we would want to make sure that the blocker did not drop any potential matches. We could debug the blocker output in *py_entitymatching* as follows:

In [14]:
# Run the blocker debugger on C1 (comparing title and year) and time the call
title_year_corres = [('title', 'title'), ('year', 'year')]
startTime = time.time()
dbg = em.debug_blocker(C1, A, B, output_size=200, attr_corres=title_year_corres)
endTime = time.time()
print('Total time: {:.2f} seconds.'.format(endTime - startTime))

Total time: 48.54 seconds.


In [15]:
# Display first few tuple pairs from the debug_blocker's output
dbg.head(50)

Unnamed: 0,_id,ltable_id,rtable_id,ltable_title,ltable_year,rtable_title,rtable_year
0,0,2965,17445,Advise & Consent,1962,Ginger & Rosa,2013
1,1,2965,17842,Advise & Consent,1962,Harry & Son,1984
2,2,2965,17844,Advise & Consent,1962,Harry & Snowman,2016
3,3,2965,19318,Advise & Consent,1962,Lambert & Stamp,2015
4,4,2965,19600,Advise & Consent,1962,David & Layla,2007
5,5,2965,19796,Advise & Consent,1962,Hansel & Gretel,2002
6,6,2965,20161,Advise & Consent,1962,Q & A,1990
7,7,2965,23293,Advise & Consent,1962,Starsky & Hutch,2004
8,8,2965,23439,Advise & Consent,1962,Spring & Arnaud,2016
9,9,2965,25285,Advise & Consent,1962,Town & Country,2001


From the debug blocker's output we observe that the current blocker retains quite a lot of obvious mismatches. We would want to update the blocking sequence to drop a few more of these obvious mismatches.

For the considered dataset, we know that for two movies to match, their years of release must be the same. We can use an overlap blocker on the year attribute for this purpose. Because we apply it to the candidate set C1 produced by the title blocker, the result is the intersection of the two blockers' outputs, i.e. a consolidated candidate set.

In [16]:
# Updated blocking sequence
# A, B ------ overlap blocker [title] -----> C1--
#                                                     
# C1 ------ overlap blocker [year] --------> C2--

In [17]:
# Create a fresh overlap blocker for the second blocking stage
ob = em.OverlapBlocker()

# Filter the existing candidate set C1 (rather than re-blocking A and B)
# on the 'year' attribute; the surviving pairs form C2
C2 = ob.block_candset(C1, 'year', 'year' 
                    )
# Display the size of the reduced candidate set
len(C2)

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:03:46


134228

In [18]:
# Display first two rows from C2
C2.head(2)

Unnamed: 0,_id,ltable_id,rtable_id,ltable_title,ltable_year,ltable_mpaa,ltable_runtime,ltable_genres,ltable_director,ltable_stars,ltable_gross,rtable_title,rtable_year,rtable_mpaa,rtable_runtime,rtable_genres,rtable_director,rtable_stars,rtable_gross
34,34,45,3,The Hangover,2009,R,100 min,Comedy,Todd Phillips,"Zach Galifianakis,Bradley Cooper,Justin Bartha,Ed Helms",$277.32M,The Daisy Chain,2009,R,-1,-1,-1,-1,-1
144,144,4286,3,The Loved Ones,2009,R,84 min,"Horror, Thriller",Sean Byrne,"Xavier Samuel,Robin McLeavy,Victoria Thaine,Jessica McNamee",-1,The Daisy Chain,2009,R,-1,-1,-1,-1,-1


We similarly reduce the number of potential tuple pairs by also requiring the MPAA ratings of the two movies to match.

In [19]:
# Updated blocking sequence
# A, B ------ overlap blocker [title] -----> C1--
#                                                     
# C1 ------ overlap blocker [year] --------> C2--
#
# C2 ------ overlap blocker [mpaa] --------> C3--

In [20]:
# Create a fresh overlap blocker for the third blocking stage
ob = em.OverlapBlocker()

# Filter candidate set C2 on the 'mpaa' attribute; the surviving pairs form C3.
# NOTE(review): B.head() above shows many rows whose mpaa is the '-1'
# placeholder; such rows presumably cannot overlap with a real rating, so
# true matches with a missing rating may be dropped here -- verify.
C3 = ob.block_candset(C2, 'mpaa', 'mpaa' 
                    )
# Display the size of the reduced candidate set
len(C3)

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:03


27767

In [21]:
# Display first two rows from C3
C3.head(2)

Unnamed: 0,_id,ltable_id,rtable_id,ltable_title,ltable_year,ltable_mpaa,ltable_runtime,ltable_genres,ltable_director,ltable_stars,ltable_gross,rtable_title,rtable_year,rtable_mpaa,rtable_runtime,rtable_genres,rtable_director,rtable_stars,rtable_gross
34,34,45,3,The Hangover,2009,R,100 min,Comedy,Todd Phillips,"Zach Galifianakis,Bradley Cooper,Justin Bartha,Ed Helms",$277.32M,The Daisy Chain,2009,R,-1,-1,-1,-1,-1
144,144,4286,3,The Loved Ones,2009,R,84 min,"Horror, Thriller",Sean Byrne,"Xavier Samuel,Robin McLeavy,Victoria Thaine,Jessica McNamee",-1,The Daisy Chain,2009,R,-1,-1,-1,-1,-1


We observe that many of the candidate pairs retained by our first overlap blocker are there only because the word *the* appears in both movie titles. We wish to filter out such tuple pairs, so we add a custom black-box blocker to the pipeline.

In [22]:
# Updated blocking sequence
# A, B ------ overlap blocker [title] -----> C1--
#                                                     
# C1 ------ overlap blocker [year] --------> C2--
#
# C2 ------ overlap blocker [mpaa] --------> C3--
#
# C3 ------ black box [title match more than just 'the'] --------> C4--

In [23]:
# Create blocker to block tuples that only match based on 'the'.
def overlap_ignoring_words(ltuple, rtuple):
    """Black-box blocking rule: decide whether a tuple pair should be dropped.

    Both titles are lower-cased and split on whitespace, the ignored word
    'the' is discarded, and the remaining token sets are compared.

    Args:
        ltuple: record from the left table; must support ``ltuple['title']``.
        rtuple: record from the right table; must support ``rtuple['title']``.

    Returns:
        True (block the pair) when the titles share no token other than the
        ignored word; False (keep the pair) when at least one such token
        is shared.
    """
    ignored = {'the'}
    # str.split() with no argument already discards surrounding whitespace,
    # so the tokens need no further stripping.
    left = {tok for tok in ltuple['title'].lower().split() if tok not in ignored}
    right = {tok for tok in rtuple['title'].lower().split() if tok not in ignored}
    # Disjoint token sets -> nothing but ignored words in common -> block.
    return left.isdisjoint(right)
    
# Wrap overlap_ignoring_words in a black-box blocker and apply it to C3:
# pairs for which the function returns True are removed, the rest form C4.
bb = em.BlackBoxBlocker()
bb.set_black_box_function(overlap_ignoring_words)
C4 = bb.block_candset(C3)    
# Preview the first rows of the final candidate set
C4.head()
    

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:03


Unnamed: 0,_id,ltable_id,rtable_id,ltable_title,ltable_year,ltable_mpaa,ltable_runtime,ltable_genres,ltable_director,ltable_stars,ltable_gross,rtable_title,rtable_year,rtable_mpaa,rtable_runtime,rtable_genres,rtable_director,rtable_stars,rtable_gross
4146,4146,1757,11,Hamlet,1996,PG-13,242 min,Drama,Kenneth Branagh,"Kenneth Branagh,Julie Christie,Derek Jacobi,Kate Winslet",$4.41M,Hamlet,1996,PG-13,-1,Drama,-1,Kenneth Branagh,"$7,129,670"
6671,6671,1093,20,The Assassination of Jesse James by the Coward Robert Ford,2007,R,160 min,"Biography, Crime, Drama",Andrew Dominik,"Brad Pitt,Casey Affleck,Sam Shepard,Mary-Louise Parker",$3.90M,Home of the Brave,2007,R,106 minutes,Drama,Irwin Winkler,"Samuel L. Jackson,Jessica Biel,Brian Presley,Curtis ""50 Cent"" Jackson",-1
6891,6891,1723,20,Sweeney Todd: The Demon Barber of Fleet Street,2007,R,116 min,"Drama, Horror, Musical",Tim Burton,"Johnny Depp,Helena Bonham Carter,Alan Rickman,Timothy Spall",$52.90M,Home of the Brave,2007,R,106 minutes,Drama,Irwin Winkler,"Samuel L. Jackson,Jessica Biel,Brian Presley,Curtis ""50 Cent"" Jackson",-1
7619,7619,3795,20,30 Days of Night,2007,R,113 min,"Horror, Thriller",David Slade,"Josh Hartnett,Melissa George,Danny Huston,Ben Foster",$39.57M,Home of the Brave,2007,R,106 minutes,Drama,Irwin Winkler,"Samuel L. Jackson,Jessica Biel,Brian Presley,Curtis ""50 Cent"" Jackson",-1
7647,7647,3883,20,In the Valley of Elah,2007,R,121 min,"Crime, Drama, Mystery",Paul Haggis,"Tommy Lee Jones,Charlize Theron,Jonathan Tucker,Jason Patric",$6.78M,Home of the Brave,2007,R,106 minutes,Drama,Irwin Winkler,"Samuel L. Jackson,Jessica Biel,Brian Presley,Curtis ""50 Cent"" Jackson",-1


In [24]:
# Report how many tuple pairs survive the full blocking sequence. A bare
# `len(C4)` is not displayed here because it is not the cell's last
# expression, so print it explicitly.
print('Number of tuple pairs in C4: ' + str(len(C4)))
# Save all tuple pairs remaining after the blocking step
em.to_csv_metadata(C4, '../DATA/AllTuplePairs.csv')

True

We can proceed with the matching step now.


# Matching tuple pairs in the candidate set

In this step, we would want to match the tuple pairs in the candidate set. Specifically, we use a learning-based method for matching purposes.
This typically involves the following four steps:
1. Sampling and labeling the candidate set
2. Splitting the labeled data into development and evaluation set
3. Selecting the best learning based matcher using the development set
4. Evaluating the selected matcher using the evaluation set

## Sampling and labeling the candidate set

First, we randomly sample 600 tuple pairs for labeling purposes.

In [25]:
# Draw 600 tuple pairs from candidate set C4 for manual labeling.
# NOTE(review): no seed is passed here, so S presumably differs between runs;
# the labels used downstream come from ../DATA/G.csv unless
# GENERATE_NEW_LABELED_DATA is True.
S = em.sample_table(C4, 600)
S.head()

Unnamed: 0,_id,ltable_id,rtable_id,ltable_title,ltable_year,ltable_mpaa,ltable_runtime,ltable_genres,ltable_director,ltable_stars,ltable_gross,rtable_title,rtable_year,rtable_mpaa,rtable_runtime,rtable_genres,rtable_director,rtable_stars,rtable_gross
4146,4146,1757,11,Hamlet,1996,PG-13,242 min,Drama,Kenneth Branagh,"Kenneth Branagh,Julie Christie,Derek Jacobi,Kate Winslet",$4.41M,Hamlet,1996,PG-13,-1,Drama,-1,Kenneth Branagh,"$7,129,670"
7619,7619,3795,20,30 Days of Night,2007,R,113 min,"Horror, Thriller",David Slade,"Josh Hartnett,Melissa George,Danny Huston,Ben Foster",$39.57M,Home of the Brave,2007,R,106 minutes,Drama,Irwin Winkler,"Samuel L. Jackson,Jessica Biel,Brian Presley,Curtis ""50 Cent"" Jackson",-1
11238,11238,1686,26,A Very Long Engagement,2004,R,133 min,"Drama, Mystery, Romance",Jean-Pierre Jeunet,"Audrey Tautou,Gaspard Ulliel,Jodie Foster,Dominique Pinon",$6.17M,A Home at the End of the World,2004,R,-1,Drama,-1,-1,"$1,033,810"
34365,34365,46,68,Harry Potter and the Prisoner of Azkaban,2004,PG,142 min,"Adventure, Family, Fantasy",Alfonso Cuarón,"Daniel Radcliffe,Emma Watson,Rupert Grint,Richard Griffiths",$249.36M,Cowboys and Angels,2004,PG,105 minutes,Comedy,-1,-1,-1
81477,81477,3511,168,Kiss of the Dragon,2001,R,98 min,"Action, Crime, Drama",Chris Nahon,"Jet Li,Bridget Fonda,Tchéky Karyo,Max Ryan",$36.85M,The Business of Strangers,2001,R,-1,Drama,-1,-1,"$1,290,920"


In [26]:
if not GENERATE_NEW_LABELED_DATA:
    # Reuse the previously labeled sample, re-attaching the candidate-set
    # metadata (key, parent tables and foreign keys) while loading.
    G = em.read_csv_metadata('../DATA/G.csv',
                             key='_id',
                             ltable=A, rtable=B,
                             fk_ltable='ltable_id', fk_rtable='rtable_id')
else:
    # Label S interactively; the labels are stored in the 'gold' column.
    G = em.label_table(S, 'gold')
    # Persist immediately: labeling has to be done by hand in the UI and is
    # tedious to repeat.
    em.to_csv_metadata(G, '../DATA/G.csv')


## Splitting the labeled data into development and evaluation set

In this step, we split the labeled data into two sets: development (I) and evaluation (J). Specifically, the development set is used to come up with the best learning-based matcher, and the evaluation set is used to evaluate the selected matcher on unseen data.

In [27]:
if GENERATE_NEW_TEST_TRAIN_DATA:
    # Split G into development set (I, 70%) and evaluation set (J, 30%)
    IJ = em.split_train_test(G, train_proportion=0.7, random_state=5)
    I, J = IJ['train'], IJ['test']
    # Number of positive (gold == 1) pairs in each split
    for split in (I, J):
        print(len(split[split['gold'] == 1]))
    em.to_csv_metadata(I, '../DATA/I.csv')
    em.to_csv_metadata(J, '../DATA/J.csv')
else:
    # Reload the saved splits; both files share the same candidate-set metadata.
    candset_metadata = dict(key='_id',
                            ltable=A, rtable=B,
                            fk_ltable='ltable_id', fk_rtable='rtable_id')
    I = em.read_csv_metadata('../DATA/I.csv', **candset_metadata)
    J = em.read_csv_metadata('../DATA/J.csv', **candset_metadata)

## Selecting the best learning-based matcher 

Selecting the best learning-based matcher typically involves the following steps:

1. Creating a set of learning-based matchers
2. Creating features
3. Converting the development set into feature vectors
4. Selecting the best learning-based matcher using k-fold cross validation

### Creating a set of learning-based matchers

In [28]:
# Create a set of ML-matchers
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='NaiveBayes')

### Creating features

Next, we need to create a set of features for the development set. *py_entitymatching* provides a way to automatically generate features based on the attributes in the input tables.

In [29]:
# Automatically generate similarity features for the attribute pairs of A and B.
# NOTE(review): the generated list includes id_id_* features (see the feature
# names listed in the next cell); 'id' is each table's key, so these presumably
# carry no real matching signal -- consider dropping them before training.
feature_table = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)

In [30]:
# List the names of the features generated
feature_table['feature_name']

0                                 id_id_exm
1                                 id_id_anm
2                            id_id_lev_dist
3                             id_id_lev_sim
4               title_title_jac_qgm_3_qgm_3
5           title_title_cos_dlm_dc0_dlm_dc0
6           title_title_jac_dlm_dc0_dlm_dc0
7                           title_title_mel
8                      title_title_lev_dist
9                       title_title_lev_sim
10                          title_title_nmw
11                           title_title_sw
12                mpaa_mpaa_jac_qgm_3_qgm_3
13            mpaa_mpaa_cos_dlm_dc0_dlm_dc0
14            mpaa_mpaa_jac_dlm_dc0_dlm_dc0
15                            mpaa_mpaa_mel
16                       mpaa_mpaa_lev_dist
17                        mpaa_mpaa_lev_sim
18                            mpaa_mpaa_nmw
19                             mpaa_mpaa_sw
20          runtime_runtime_jac_qgm_3_qgm_3
21      runtime_runtime_cos_dlm_dc0_dlm_dc0
22      runtime_runtime_jac_dlm_

### Converting the development set to feature vectors

In [31]:
# Convert the development set I into feature vectors using feature_table;
# the 'gold' label column is carried along after the features.
H = em.extract_feature_vecs(I, 
                            feature_table=feature_table, 
                            attrs_after='gold',
                            show_progress=False)

In [32]:
# Display first few rows
H.head(3)

Unnamed: 0,_id,ltable_id,rtable_id,id_id_exm,id_id_anm,id_id_lev_dist,id_id_lev_sim,title_title_jac_qgm_3_qgm_3,title_title_cos_dlm_dc0_dlm_dc0,title_title_jac_dlm_dc0_dlm_dc0,...,director_director_lev_sim,director_director_nmw,director_director_sw,gross_gross_lev_dist,gross_gross_lev_sim,gross_gross_jar,gross_gross_jwn,gross_gross_exm,gross_gross_jac_qgm_3_qgm_3,gold
0,1655476,1546,4724,0,0.327265,4,0.0,0.111111,0.408248,0.25,...,0.25,2.0,3.0,7.0,0.0,0.0,0.0,0,0.0,0
1,6390810,4237,17373,0,0.243884,3,0.4,0.086207,0.338062,0.2,...,0.133333,-3.0,1.0,9.0,0.181818,0.419192,0.477273,0,0.05,0
2,8382574,1065,23505,0,0.04531,4,0.2,0.0,0.0,0.0,...,0.0,-10.0,0.0,6.0,0.142857,0.547619,0.547619,0,0.0,0


### Selecting the best matcher using cross-validation

Now, we select the best matcher using k-fold cross-validation. We use five fold cross validation and use 'F1 score' metric to select the best matcher.

In [33]:
# Select the best ML matcher with 5-fold cross-validation on H, ranked by F1.
# The key / foreign-key columns and the label are excluded from the features.
result = em.select_matcher([dt, rf, svm, ln, lg, nb], table=H, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold'],
        k=5,
        target_attr='gold', metric_to_select_matcher='f1', random_state=0)
result['cv_stats']  # per-matcher cross-validation averages on the development set



Unnamed: 0,Matcher,Average precision,Average recall,Average f1
0,DecisionTree,0.992857,0.96819,0.98008
1,RF,0.992857,0.984857,0.988776
2,SVM,0.989474,0.747095,0.848269
3,LinReg,0.992857,0.976857,0.984524
4,LogReg,0.993103,0.983333,0.987796
5,NaiveBayes,0.992857,0.976857,0.984524


The random forest model achieves the highest average F1 (0.989), narrowly ahead of logistic regression (0.988), so we select it.

##  Evaluating the matching output

Evaluating the matching outputs for the evaluation set typically involves the following four steps:
1. Converting the evaluation set to feature vectors
2. Training matcher using the feature vectors extracted from the development set
3. Predicting the evaluation set using the trained matcher
4. Evaluating the predicted matches

### Converting the evaluation set to  feature vectors

As before, we convert to the feature vectors (using the feature table and the evaluation set)

In [34]:
# Convert the evaluation set J into feature vectors using the same
# feature_table as the development set; 'gold' is carried along for scoring.
L = em.extract_feature_vecs(J, feature_table=feature_table,
                            attrs_after='gold', show_progress=False)

### Training the selected matcher

Now, we train the matcher using all of the feature vectors from the development set. The selected model is Random Forest.

In [35]:
# Train the selected matcher (random forest) on H, the feature vectors
# extracted from development set I; ids and the label are not used as features.
rf.fit(table=H, #the entire development set
       exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold'], 
       target_attr='gold')

### Predicting the matches

Next, we predict the matches for the evaluation set (using the feature vectors extracted from it).

In [36]:
# Predict on L (feature vectors of evaluation set J) with the trained random
# forest; the result carries the predictions in a 'predicted' column.
# Note: the name `predictions` is reassigned later by the all-models loop.
predictions = rf.predict(table=L,#The entire evaluation set
                         exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold'], 
              append=True, target_attr='predicted', inplace=False)

### Evaluating the predictions

Finally, we evaluate the accuracy of predicted outputs

In [37]:
# Compare the 'predicted' column against the 'gold' labels and print the
# precision / recall / F1 summary.
eval_result = em.eval_matches(predictions, 'gold', 'predicted')
em.print_eval_summary(eval_result)

Precision : 100.0% (46/46)
Recall : 100.0% (46/46)
F1 : 100.0%
False positives : 0 (out of 46 positive predictions)
False negatives : 0 (out of 134 negative predictions)


### Evaluating all models

Train all models

In [38]:
# Train every candidate matcher on H, the feature vectors extracted from the
# development set I. A single loop replaces six copy-pasted fit calls so the
# excluded columns and the target attribute cannot drift apart between models.
for matcher in [dt, rf, svm, lg, ln, nb]:
    matcher.fit(table=H,  # the entire development set
                exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold'],
                target_attr='gold')






Perform predictions for every model and output results

In [39]:
# Score every trained matcher on L, the evaluation-set feature vectors
models = [dt, rf, svm, ln, lg, nb]
modelNames = ['dt', 'rf', 'svm', 'ln', 'lg', 'nb']
# Key / foreign-key columns and the label are not features
non_feature_attrs = ['_id', 'ltable_id', 'rtable_id', 'gold']

for name, model in zip(modelNames, models):
    predictions = model.predict(table=L,  # the entire evaluation set
                                exclude_attrs=list(non_feature_attrs),
                                append=True, target_attr='predicted',
                                inplace=False)
    eval_result = em.eval_matches(predictions, 'gold', 'predicted')
    print(name)
    em.print_eval_summary(eval_result)
    print('\n')
    

dt
Precision : 95.65% (44/46)
Recall : 95.65% (44/46)
F1 : 95.65%
False positives : 2 (out of 46 positive predictions)
False negatives : 2 (out of 134 negative predictions)


rf
Precision : 100.0% (46/46)
Recall : 100.0% (46/46)
F1 : 100.0%
False positives : 0 (out of 46 positive predictions)
False negatives : 0 (out of 134 negative predictions)


svm
Precision : 100.0% (32/32)
Recall : 69.57% (32/46)
F1 : 82.05%
False positives : 0 (out of 32 positive predictions)
False negatives : 14 (out of 148 negative predictions)


ln
Precision : 100.0% (46/46)
Recall : 100.0% (46/46)
F1 : 100.0%
False positives : 0 (out of 46 positive predictions)
False negatives : 0 (out of 134 negative predictions)


lg
Precision : 100.0% (46/46)
Recall : 100.0% (46/46)
F1 : 100.0%
False positives : 0 (out of 46 positive predictions)
False negatives : 0 (out of 134 negative predictions)


nb
Precision : 100.0% (46/46)
Recall : 100.0% (46/46)
F1 : 100.0%
False positives : 0 (out of 46 positive predictions)
Fals