In [None]:
# Imports
import pandas as pd
import numpy as np
import spacy
import matplotlib.pyplot as plt
import seaborn as plt
import gensim
import nltk
import re
import string
import warnings

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from spacy.matcher import PhraseMatcher

warnings.filterwarnings('ignore')


In [None]:
# Loading the data
matched = pd.read_csv('matched_data.csv')
predicted_matches = pd.read_csv('predicted_matches.csv')
source1_reporting = pd.read_csv('source_1.csv')
source2_reporting = pd.read_csv('source_2.csv')

In [None]:
# Viewing of the reporting data
# Source1
source1_reporting.head(10)

Unnamed: 0,id,name
0,0,"Horses, asses, mules and hinnies; live, pure-b..."
1,1,"Horses; live, pure-bred breeding animals"
2,2,"Horses; live, other than pure-bred breeding an..."
3,4,"Horses, asses, mules and hinnies; live, other ..."
4,5,"Bovine animals; live, pure-bred breeding animals"
5,6,"Cattle; live, pure-bred breeding animals"
6,7,"Cattle; live, other than pure-bred breeding an..."
7,8,"Buffalo; live, pure-bred breeding animals"
8,9,"Buffalo; live, other than pure-bred breeding a..."
9,10,"Bovine animals; live, other than pure-bred bre..."


In [None]:
#Source2
source2_reporting.head(10)

Unnamed: 0,id,name
0,0,leveillula lactucae-serriolae
1,1,podosphaera aphanis
2,2,lathyrus czeczottianus
3,3,crocus biflorus subsp. caricus
4,4,hordeum brevisubulatum
5,5,vinca major subsp. major
6,6,geranium psilostemon
7,7,cantharellaceae
8,8,liatris spicata
9,9,potato pulp


## Data Transformation

In [None]:
# Removing Nan values
# source1.csv
source1_reporting = source1_reporting.dropna()  

# source2.csv
source2_reporting = source2_reporting.dropna()

In [None]:
# Convert the name column in source1 to lowercase
source1_reporting['name'] = source1_reporting['name'].astype(str).str.lower()
source1_reporting.head(10)

Unnamed: 0,id,name
0,0,"horses, asses, mules and hinnies; live, pure-b..."
1,1,"horses; live, pure-bred breeding animals"
2,2,"horses; live, other than pure-bred breeding an..."
3,4,"horses, asses, mules and hinnies; live, other ..."
4,5,"bovine animals; live, pure-bred breeding animals"
5,6,"cattle; live, pure-bred breeding animals"
6,7,"cattle; live, other than pure-bred breeding an..."
7,8,"buffalo; live, pure-bred breeding animals"
8,9,"buffalo; live, other than pure-bred breeding a..."
9,10,"bovine animals; live, other than pure-bred bre..."


In [None]:
# Merging source1_reporting and source2_reporting
reporting_data = pd.merge(source1_reporting, source2_reporting, on='id')
reporting_data.columns = ['id','source1','source2']
reporting_data

Unnamed: 0,id,source1,source2
0,0,"horses, asses, mules and hinnies; live, pure-b...",leveillula lactucae-serriolae
1,0,"wood in the rough, even peeled, or roughly squ...",leveillula lactucae-serriolae
2,1,"horses; live, pure-bred breeding animals",podosphaera aphanis
3,1,"wood in the rough, even peeled, or roughly squ...",podosphaera aphanis
4,2,"horses; live, other than pure-bred breeding an...",lathyrus czeczottianus
...,...,...,...
13149,13064,"data/graphic display tubes,black/white,scr<33,...",aceria erineus
13150,13065,"data/graphic display tubes,black/white,scr>=33...",pond construction
13151,13066,oth.cathode-ray tubes,erbium
13152,13067,other cathode-ray tubes,wetland restoration


In [None]:
# Dropping the duplicates from the id column
reporting_data.drop_duplicates(subset = ['id'], inplace=True, ignore_index=True)
reporting_data

Unnamed: 0,id,source1,source2
0,0,"horses, asses, mules and hinnies; live, pure-b...",leveillula lactucae-serriolae
1,1,"horses; live, pure-bred breeding animals",podosphaera aphanis
2,2,"horses; live, other than pure-bred breeding an...",lathyrus czeczottianus
3,4,"horses, asses, mules and hinnies; live, other ...",hordeum brevisubulatum
4,5,"bovine animals; live, pure-bred breeding animals",vinca major subsp. major
...,...,...,...
12208,13064,"data/graphic display tubes,black/white,scr<33,...",aceria erineus
12209,13065,"data/graphic display tubes,black/white,scr>=33...",pond construction
12210,13066,oth.cathode-ray tubes,erbium
12211,13067,other cathode-ray tubes,wetland restoration


In [None]:
# Dropping the id column
reporting_data.drop(['id'], axis=1, inplace=True)
reporting_data

Unnamed: 0,source1,source2
0,"horses, asses, mules and hinnies; live, pure-b...",leveillula lactucae-serriolae
1,"horses; live, pure-bred breeding animals",podosphaera aphanis
2,"horses; live, other than pure-bred breeding an...",lathyrus czeczottianus
3,"horses, asses, mules and hinnies; live, other ...",hordeum brevisubulatum
4,"bovine animals; live, pure-bred breeding animals",vinca major subsp. major
...,...,...
12208,"data/graphic display tubes,black/white,scr<33,...",aceria erineus
12209,"data/graphic display tubes,black/white,scr>=33...",pond construction
12210,oth.cathode-ray tubes,erbium
12211,other cathode-ray tubes,wetland restoration


In [None]:
# Check the shape of the data
reporting_data.shape

(12213, 2)

## Data Cleaning

In [None]:
# Removing the special characters and symbols from the source1 column
reporting_data['source1'] = reporting_data['source1'].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x))
reporting_data


Unnamed: 0,source1,source2
0,horses asses mules and hinnies live purebred b...,leveillula lactucae-serriolae
1,horses live purebred breeding animals,podosphaera aphanis
2,horses live other than purebred breeding animals,lathyrus czeczottianus
3,horses asses mules and hinnies live other than...,hordeum brevisubulatum
4,bovine animals live purebred breeding animals,vinca major subsp. major
...,...,...
12208,datagraphic display tubesblackwhitescr3356cm14,aceria erineus
12209,datagraphic display tubesblackwhitescr3356cm14,pond construction
12210,othcathoderay tubes,erbium
12211,other cathoderay tubes,wetland restoration


In [None]:
# Removing digits from the source1 column
reporting_data['source1'] = reporting_data['source1'].apply(lambda x: re.sub('\d', '', x))
reporting_data

Unnamed: 0,source1,source2
0,horses asses mules and hinnies live purebred b...,leveillula lactucae-serriolae
1,horses live purebred breeding animals,podosphaera aphanis
2,horses live other than purebred breeding animals,lathyrus czeczottianus
3,horses asses mules and hinnies live other than...,hordeum brevisubulatum
4,bovine animals live purebred breeding animals,vinca major subsp. major
...,...,...
12208,datagraphic display tubesblackwhitescrcm,aceria erineus
12209,datagraphic display tubesblackwhitescrcm,pond construction
12210,othcathoderay tubes,erbium
12211,other cathoderay tubes,wetland restoration


In [None]:
# Tokenizing
nltk.download('punkt')

reporting_data['source1'] = reporting_data['source1'].apply(lambda x: nltk.word_tokenize(x))
#reporting_data['source2'] = reporting_data['source2'].apply(lambda x: nltk.word_tokenize(x))
reporting_data

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,source1,source2
0,"[horses, asses, mules, and, hinnies, live, pur...",leveillula lactucae-serriolae
1,"[horses, live, purebred, breeding, animals]",podosphaera aphanis
2,"[horses, live, other, than, purebred, breeding...",lathyrus czeczottianus
3,"[horses, asses, mules, and, hinnies, live, oth...",hordeum brevisubulatum
4,"[bovine, animals, live, purebred, breeding, an...",vinca major subsp. major
...,...,...
12208,"[datagraphic, display, tubesblackwhitescrcm]",aceria erineus
12209,"[datagraphic, display, tubesblackwhitescrcm]",pond construction
12210,"[othcathoderay, tubes]",erbium
12211,"[other, cathoderay, tubes]",wetland restoration


In [None]:
# Removing stop words
nltk.download('stopwords')

def remove_stopwords_func(text):
  t = [token for token in text if token not in stopwords.words('english')]
  text = ' '.join(t)
  return text

# Appling the function to the dataframe
reporting_data['source1'] = reporting_data['source1'].apply(remove_stopwords_func)
#reporting_data['source2'] = reporting_data['source2'].apply(remove_stopwords_func)
reporting_data

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,source1,source2
0,horses asses mules hinnies live purebred breed...,leveillula lactucae-serriolae
1,horses live purebred breeding animals,podosphaera aphanis
2,horses live purebred breeding animals,lathyrus czeczottianus
3,horses asses mules hinnies live purebred breed...,hordeum brevisubulatum
4,bovine animals live purebred breeding animals,vinca major subsp. major
...,...,...
12208,datagraphic display tubesblackwhitescrcm,aceria erineus
12209,datagraphic display tubesblackwhitescrcm,pond construction
12210,othcathoderay tubes,erbium
12211,cathoderay tubes,wetland restoration


In [None]:
# Stemminng
stemmer = SnowballStemmer(language='english')
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

reporting_data['source1'] = reporting_data['source1'].astype(str).apply(stem_words)
#reporting_data['source2'] = reporting_data['source2'].astype(str).apply(stem_words)
reporting_data

Unnamed: 0,source1,source2
0,hors ass mule hinni live purebr breed anim,leveillula lactucae-serriolae
1,hors live purebr breed anim,podosphaera aphanis
2,hors live purebr breed anim,lathyrus czeczottianus
3,hors ass mule hinni live purebr breed anim,hordeum brevisubulatum
4,bovin anim live purebr breed anim,vinca major subsp. major
...,...,...
12208,datagraph display tubesblackwhitescrcm,aceria erineus
12209,datagraph display tubesblackwhitescrcm,pond construction
12210,othcathoderay tube,erbium
12211,cathoderay tube,wetland restoration


In [None]:
# Lemmatizing
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

reporting_data['source1'] = reporting_data['source1'].apply(lambda x:lemmatize_words(x))
#reporting_data['source2'] = reporting_data['source2'].apply(lambda x:lemmatize_words(x))
reporting_data

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,source1,source2
0,hors as mule hinni live purebr breed anim,leveillula lactucae-serriolae
1,hors live purebr breed anim,podosphaera aphanis
2,hors live purebr breed anim,lathyrus czeczottianus
3,hors as mule hinni live purebr breed anim,hordeum brevisubulatum
4,bovin anim live purebr breed anim,vinca major subsp. major
...,...,...
12208,datagraph display tubesblackwhitescrcm,aceria erineus
12209,datagraph display tubesblackwhitescrcm,pond construction
12210,othcathoderay tube,erbium
12211,cathoderay tube,wetland restoration


## Matching the data

In [None]:
nlp = spacy.load('en_core_web_lg')
matcher = PhraseMatcher(nlp.vocab,attr='LOWER')
query_doc = [nlp(text) for text in reporting_data['source2']]
matcher.add('QueryList:', query_doc)

In [None]:
# Converting list to string format
list_to_string = ' '.join([str(i) for i in reporting_data['source1']])
doc = nlp(list_to_string)
matched_doc = matcher(doc)
matched_df = pd.DataFrame(matched_doc)
matched_df

Unnamed: 0,0,1,2
0,4663659442224967720,82,83
1,4663659442224967720,120,121
2,4663659442224967720,131,132
3,4663659442224967720,139,140
4,4663659442224967720,166,167
...,...,...,...
1793,4663659442224967720,53098,53099
1794,4663659442224967720,53102,53103
1795,4663659442224967720,53108,53109
1796,4663659442224967720,53191,53192


In [None]:
matched_id, start, end = matched_doc[60]
print(nlp.vocab.strings[matched_id], doc[start:end])

QueryList: fish


In [None]:
doc = nlp(list_to_string)

for matched_id, start, end in matcher(doc):
    string_id = nlp.vocab.strings[matched_id]
    span = doc[start:end]

print(matched_id, string_id, start, end)

4663659442224967720 QueryList: 53480 53481


In [None]:
def getSimilarity(a, b):
    tokens = nlp(a + " " + b)

    print(tokens[0].text, "|",tokens[1].text, "|", tokens[0].similarity(tokens[1]))

    return tokens[0].similarity(tokens[1])


In [None]:
reporting_data['similar_matches'] = reporting_data.apply(lambda x: getSimilarity(x['source1'], x['source2']), axis=1)
reporting_data

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
brief | panti | -0.06908832
nightgown | etcof | 0.0
nightgown | etcof | 0.0
nightgown | etcof | 0.0
robe | etcof | 0.0
robe | etcof | 0.0
robe | etcof | 0.0
tshirt | etcof | 0.0
tshirt | etcof | 0.0
nonwoven | aramid | 0.515197
falso | tecido | 0.0
falso | tecido | 0.0
falso | tecido | 0.0
nonwoven | synthartif | 0.0
fabric | warp | 0.4364937
fabric | warp | 0.4364937
fabric | warp | 0.4364937
fabric | warp | 0.4364937
fabric | warp | 0.4364937
fabric | warp | 0.4364937
fabric | warp | 0.4364937
fabric | warp | 0.4364937
fabric | warp | 0.4364937
fabric | warp | 0.4364937
fabric | warp | 0.4364937
fabric | warp | 0.4364937
fabric | warp | 0.4364937
cotton | yarn | 0.55605286
cotton | yarn | 0.55605286
carboy | bottl | 0.0
carboy | bottl | 0.0
glass | envelop | 0.15683521
glass | bulb | 0.40346017
glass | envelop | 0.15683521
roll | ironsteel | 0.0
roll | ironsteel | 0.0
roll | ironsteel | 0.0
roll | ironsteel | 0.0
roll |

Unnamed: 0,source1,source2,similar_matches
0,hors as mule hinni live purebr breed anim,leveillula lactucae-serriolae,0.083303
1,hors live purebr breed anim,podosphaera aphanis,0.142983
2,hors live purebr breed anim,lathyrus czeczottianus,0.142983
3,hors as mule hinni live purebr breed anim,hordeum brevisubulatum,0.083303
4,bovin anim live purebr breed anim,vinca major subsp. major,0.000000
...,...,...,...
12208,datagraph display tubesblackwhitescrcm,aceria erineus,0.000000
12209,datagraph display tubesblackwhitescrcm,pond construction,0.000000
12210,othcathoderay tube,erbium,0.000000
12211,cathoderay tube,wetland restoration,0.000000


In [None]:
reporting_data.head(5)

Unnamed: 0,source1,source2,similar_matches
0,hors as mule hinni live purebr breed anim,leveillula lactucae-serriolae,0.083303
1,hors live purebr breed anim,podosphaera aphanis,0.142983
2,hors live purebr breed anim,lathyrus czeczottianus,0.142983
3,hors as mule hinni live purebr breed anim,hordeum brevisubulatum,0.083303
4,bovin anim live purebr breed anim,vinca major subsp. major,0.0


In [None]:
all_values = []
for column in reporting_data:
  column_values = reporting_data['similar_matches'].tolist()
  all_values += column_values

df_matches = pd.DataFrame(all_values)
print(df_matches)

              0
0      0.083303
1      0.142983
2      0.142983
3      0.083303
4      0.000000
...         ...
36634  0.000000
36635  0.000000
36636  0.000000
36637  0.000000
36638  0.000000

[36639 rows x 1 columns]


In [None]:
df_matched = reporting_data.filter(['similar_matches'], axis =1)
df_matched

Unnamed: 0,similar_matches
0,0.083303
1,0.142983
2,0.142983
3,0.083303
4,0.000000
...,...
12208,0.000000
12209,0.000000
12210,0.000000
12211,0.000000


In [None]:
df_matched.to_csv('final_matched_df.csv')