# Supermind Data Science Assignment  : Crypto Detection 

### Name : Devang Papinwar
### Email : papinwardevang@gmail.com
### Contact No : 7420039018

### Importing Libraries

In [352]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import string
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import contractions
from math import sqrt, pow, exp
from sklearn.feature_extraction.text import CountVectorizer
sns.set()
import spacy
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()# Visualise inside a notebook
import en_core_web_md
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel

### Preparing Dataset

In [353]:
definition = pd.read_csv('term_def.csv')
definition

Unnamed: 0,terms,definition1,definition2
0,51% attack,A hypothetical situation where more than half ...,
1,51% attack protection,A protection mechanism implemented by several ...,
2,AFK,Away From Keyboard; used on social media platf...,
3,Airdrop,An event where a blockchain project distribute...,
4,Altcoin,Any cryptocurrency that is an alternative to B...,
...,...,...,...
155,Vyper,A Python-like programming language for the Eth...,
156,Wallet (Cold),A wallet disconnected from the internet.,
157,Wallet (Hot),A wallet connected to the Internet.,
158,Wallet (Multisignature),A wallet that requires multiple digital signat...,


In [354]:
data = pd.DataFrame()
data['sentence'] = definition['definition1']
data

Unnamed: 0,sentence
0,A hypothetical situation where more than half ...
1,A protection mechanism implemented by several ...
2,Away From Keyboard; used on social media platf...
3,An event where a blockchain project distribute...
4,Any cryptocurrency that is an alternative to B...
...,...
155,A Python-like programming language for the Eth...
156,A wallet disconnected from the internet.
157,A wallet connected to the Internet.
158,A wallet that requires multiple digital signat...


### Preprocess the data

In [355]:
def text_preprocessing(text):
    # Removing contents of tags and all for further text processing
    data['sentence'].replace(regex=True,inplace=True, to_replace= r'<.+?>', value=r' ')
    
    # Removing the \t charecter from the descriptions field
    data['sentence'] = data['sentence'].str.replace(r'\n', ' ')

    # Removing links from all for further text processing
    data['sentence'].replace(regex=True,inplace=True, to_replace= r'(https?:\/\/)(\s)*(www\.)?(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*', value=r' ')
    
    # Removing rows with empty columns 
    data.dropna(subset=['sentence'],inplace=True)
    
    return data

In [356]:
data = text_preprocessing(data)
data

  data['sentence'] = data['sentence'].str.replace(r'\n', ' ')


Unnamed: 0,sentence
0,A hypothetical situation where more than half ...
1,A protection mechanism implemented by several ...
2,Away From Keyboard; used on social media platf...
3,An event where a blockchain project distribute...
4,Any cryptocurrency that is an alternative to B...
...,...
155,A Python-like programming language for the Eth...
156,A wallet disconnected from the internet.
157,A wallet connected to the Internet.
158,A wallet that requires multiple digital signat...


### Preprocess the Data removing stopwords , tokenization , lowering charecters

In [357]:
# spacy model:
nlp = en_core_web_md.load()

# Tags to remove from the text
removal= ['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE', 'NUM', 'SYM']
tokens = []

for summary in nlp.pipe(data['sentence']):
   proj_tok = [token.lemma_.lower() for token in summary if token.pos_ not in removal and not token.is_stop and token.is_alpha]
   tokens.append(proj_tok)
    
tokens

[['hypothetical',
  'situation',
  'half',
  'computing',
  'power',
  'blockchain',
  'network',
  'control',
  'person',
  'group',
  'allow',
  'dictate',
  'transaction',
  'verify',
  'allow',
  'prevent',
  'user',
  'complete',
  'confirm',
  'transaction',
  'cause',
  'havoc',
  'system',
  'double',
  'spend',
  'coin'],
 ['protection',
  'mechanism',
  'implement',
  'cryptocurrencie',
  'require',
  'total',
  'hashing',
  'power',
  'work',
  'entity',
  'difficult',
  'attacker',
  'need',
  'resource',
  'time',
  'threshold',
  'have',
  'additional',
  'safeguard',
  'feature',
  'agree',
  'transaction',
  'send',
  'make',
  'unable',
  'double',
  'spend',
  'notice',
  'change',
  'chain'],
 ['keyboard',
  'social',
  'medium',
  'platform',
  'like',
  'twitter',
  'user',
  'share',
  'trading',
  'activity',
  'want',
  'receive',
  'message',
  'log',
  'account',
  'thing',
  'afks',
  'trade',
  'extended',
  'period',
  'time',
  'active',
  'feed'],
 ['even

### Create a new column for tokens

In [358]:
data["tokens"] = tokens
data

Unnamed: 0,sentence,tokens
0,A hypothetical situation where more than half ...,"[hypothetical, situation, half, computing, pow..."
1,A protection mechanism implemented by several ...,"[protection, mechanism, implement, cryptocurre..."
2,Away From Keyboard; used on social media platf...,"[keyboard, social, medium, platform, like, twi..."
3,An event where a blockchain project distribute...,"[event, blockchain, project, distribute, free,..."
4,Any cryptocurrency that is an alternative to B...,"[cryptocurrency, alternative, bitcoin]"
...,...,...
155,A Python-like programming language for the Eth...,"[python, like, programming, language, ethereum..."
156,A wallet disconnected from the internet.,"[wallet, disconnect, internet]"
157,A wallet connected to the Internet.,"[wallet, connect, internet]"
158,A wallet that requires multiple digital signat...,"[wallet, require, multiple, digital, signature..."


### Create a Dictionary from the Bag of Words

In [359]:
dictionary = Dictionary(data['tokens'])
print(dictionary.token2id)

{'allow': 0, 'blockchain': 1, 'cause': 2, 'coin': 3, 'complete': 4, 'computing': 5, 'confirm': 6, 'control': 7, 'dictate': 8, 'double': 9, 'group': 10, 'half': 11, 'havoc': 12, 'hypothetical': 13, 'network': 14, 'person': 15, 'power': 16, 'prevent': 17, 'situation': 18, 'spend': 19, 'system': 20, 'transaction': 21, 'user': 22, 'verify': 23, 'additional': 24, 'agree': 25, 'attacker': 26, 'chain': 27, 'change': 28, 'cryptocurrencie': 29, 'difficult': 30, 'entity': 31, 'feature': 32, 'hashing': 33, 'have': 34, 'implement': 35, 'make': 36, 'mechanism': 37, 'need': 38, 'notice': 39, 'protection': 40, 'require': 41, 'resource': 42, 'safeguard': 43, 'send': 44, 'threshold': 45, 'time': 46, 'total': 47, 'unable': 48, 'work': 49, 'account': 50, 'active': 51, 'activity': 52, 'afks': 53, 'extended': 54, 'feed': 55, 'keyboard': 56, 'like': 57, 'log': 58, 'medium': 59, 'message': 60, 'period': 61, 'platform': 62, 'receive': 63, 'share': 64, 'social': 65, 'thing': 66, 'trade': 67, 'trading': 68, 'tw

In [360]:
# Generate a corpus from the BagofWords Dictionary
corpus = [dictionary.doc2bow(doc) for doc in data['tokens']]
corpus

[[(0, 2),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 2),
  (22, 1),
  (23, 1)],
 [(9, 1),
  (16, 1),
  (19, 1),
  (21, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1)],
 [(22, 1),
  (46, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1)],
 [(1, 1), (3, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1)],
 [(77, 1), (78, 1), (79, 1)],
 [(57, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  (

### Create a Model for topic modeling ie. Latent Dirichlet Allocation

In [361]:
# Train the unsupervised machine learning model on the data
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=50, num_topics=10, workers = 4, passes=10)

In [362]:
# Print the topics modeled from the data
lda_model.print_topics(0)

[(3,
  '0.035*"blockchain" + 0.030*"miner" + 0.021*"burn" + 0.020*"private" + 0.020*"currency" + 0.019*"key" + 0.017*"block" + 0.014*"store" + 0.014*"cryptocurrency" + 0.014*"datum"'),
 (1,
  '0.020*"blockchain" + 0.015*"bitcoin" + 0.013*"block" + 0.011*"exchange" + 0.011*"system" + 0.011*"hash" + 0.011*"process" + 0.010*"use" + 0.009*"network" + 0.009*"application"'),
 (4,
  '0.032*"transaction" + 0.025*"network" + 0.021*"blockchain" + 0.015*"allow" + 0.014*"ledger" + 0.013*"verify" + 0.013*"control" + 0.010*"information" + 0.010*"process" + 0.010*"computing"'),
 (7,
  '0.031*"token" + 0.024*"coin" + 0.024*"transaction" + 0.019*"blockchain" + 0.012*"mining" + 0.011*"wallet" + 0.011*"miner" + 0.010*"require" + 0.009*"block" + 0.009*"purpose"'),
 (2,
  '0.017*"go" + 0.013*"price" + 0.012*"system" + 0.012*"cryptocurrency" + 0.012*"asset" + 0.012*"particular" + 0.012*"strategy" + 0.012*"long" + 0.012*"short" + 0.007*"coin"'),
 (8,
  '0.017*"like" + 0.015*"trade" + 0.012*"system" + 0.012*"

In [363]:
# Visualising the topics modeled from the data
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(lda_display)

  default_term_info = default_term_info.sort_values(


### Create a Bag of Words from the Latent Dirichlet Allocation's modeled topics

In [364]:
# For each topic modeled show the high probability words
for idx, topic in lda_model.print_topics(0):
   print('Topic: {} \nWords: {}'.format(idx, topic))

# Generate the number of sub topics for main topics generated
topics = lda_model.print_topics(idx, 20)
topics

Topic: 5 
Words: 0.034*"price" + 0.030*"buy" + 0.025*"coin" + 0.024*"investor" + 0.023*"sell" + 0.019*"market" + 0.016*"asset" + 0.015*"blockchain" + 0.013*"like" + 0.013*"transaction"
Topic: 9 
Words: 0.022*"transaction" + 0.019*"smart" + 0.017*"contract" + 0.015*"support" + 0.014*"function" + 0.013*"wallet" + 0.011*"standard" + 0.011*"define" + 0.011*"token" + 0.011*"crypto"
Topic: 6 
Words: 0.017*"computer" + 0.016*"system" + 0.014*"item" + 0.014*"node" + 0.013*"like" + 0.012*"example" + 0.011*"fungible" + 0.011*"share" + 0.009*"blockchain" + 0.009*"user"
Topic: 7 
Words: 0.031*"token" + 0.024*"coin" + 0.024*"transaction" + 0.019*"blockchain" + 0.012*"mining" + 0.011*"wallet" + 0.011*"miner" + 0.010*"require" + 0.009*"block" + 0.009*"purpose"
Topic: 8 
Words: 0.017*"like" + 0.015*"trade" + 0.012*"system" + 0.012*"create" + 0.011*"value" + 0.010*"sell" + 0.010*"hold" + 0.010*"token" + 0.010*"ethereum" + 0.009*"buy"
Topic: 1 
Words: 0.020*"blockchain" + 0.015*"bitcoin" + 0.013*"block"

[(8,
  '0.017*"like" + 0.015*"trade" + 0.012*"system" + 0.012*"create" + 0.011*"value" + 0.010*"sell" + 0.010*"hold" + 0.010*"token" + 0.010*"ethereum" + 0.009*"buy" + 0.009*"network" + 0.009*"price" + 0.009*"cryptocurrency" + 0.009*"product" + 0.009*"process" + 0.009*"different" + 0.009*"multiple" + 0.009*"copy" + 0.009*"datum" + 0.009*"node"'),
 (0,
  '0.018*"blockchain" + 0.018*"network" + 0.014*"transaction" + 0.014*"user" + 0.011*"cryptocurrency" + 0.011*"activity" + 0.011*"allow" + 0.011*"chain" + 0.011*"term" + 0.011*"crypto" + 0.011*"slang" + 0.011*"accessible" + 0.008*"asset" + 0.007*"block" + 0.007*"product" + 0.007*"service" + 0.007*"include" + 0.007*"hold" + 0.007*"investor" + 0.007*"wallet"'),
 (1,
  '0.020*"blockchain" + 0.015*"bitcoin" + 0.013*"block" + 0.011*"exchange" + 0.011*"system" + 0.011*"hash" + 0.011*"process" + 0.010*"use" + 0.009*"network" + 0.009*"application" + 0.009*"node" + 0.009*"identity" + 0.009*"cryptographic" + 0.007*"coin" + 0.007*"poa" + 0.007*"labe

### Clean the Bag of Words

In [365]:
topics1 = []
for idx , topic in topics:
    topics1.append(topic.split('+'))
topics1

[['0.017*"like" ',
  ' 0.015*"trade" ',
  ' 0.012*"system" ',
  ' 0.012*"create" ',
  ' 0.011*"value" ',
  ' 0.010*"sell" ',
  ' 0.010*"hold" ',
  ' 0.010*"token" ',
  ' 0.010*"ethereum" ',
  ' 0.009*"buy" ',
  ' 0.009*"network" ',
  ' 0.009*"price" ',
  ' 0.009*"cryptocurrency" ',
  ' 0.009*"product" ',
  ' 0.009*"process" ',
  ' 0.009*"different" ',
  ' 0.009*"multiple" ',
  ' 0.009*"copy" ',
  ' 0.009*"datum" ',
  ' 0.009*"node"'],
 ['0.018*"blockchain" ',
  ' 0.018*"network" ',
  ' 0.014*"transaction" ',
  ' 0.014*"user" ',
  ' 0.011*"cryptocurrency" ',
  ' 0.011*"activity" ',
  ' 0.011*"allow" ',
  ' 0.011*"chain" ',
  ' 0.011*"term" ',
  ' 0.011*"crypto" ',
  ' 0.011*"slang" ',
  ' 0.011*"accessible" ',
  ' 0.008*"asset" ',
  ' 0.007*"block" ',
  ' 0.007*"product" ',
  ' 0.007*"service" ',
  ' 0.007*"include" ',
  ' 0.007*"hold" ',
  ' 0.007*"investor" ',
  ' 0.007*"wallet"'],
 ['0.020*"blockchain" ',
  ' 0.015*"bitcoin" ',
  ' 0.013*"block" ',
  ' 0.011*"exchange" ',
  ' 0.011*"

In [366]:
topics2 = []
for tapic in topics1:
    for tap in tapic:
        topics2.append(tap.split('*'))
topics2

[['0.017', '"like" '],
 [' 0.015', '"trade" '],
 [' 0.012', '"system" '],
 [' 0.012', '"create" '],
 [' 0.011', '"value" '],
 [' 0.010', '"sell" '],
 [' 0.010', '"hold" '],
 [' 0.010', '"token" '],
 [' 0.010', '"ethereum" '],
 [' 0.009', '"buy" '],
 [' 0.009', '"network" '],
 [' 0.009', '"price" '],
 [' 0.009', '"cryptocurrency" '],
 [' 0.009', '"product" '],
 [' 0.009', '"process" '],
 [' 0.009', '"different" '],
 [' 0.009', '"multiple" '],
 [' 0.009', '"copy" '],
 [' 0.009', '"datum" '],
 [' 0.009', '"node"'],
 ['0.018', '"blockchain" '],
 [' 0.018', '"network" '],
 [' 0.014', '"transaction" '],
 [' 0.014', '"user" '],
 [' 0.011', '"cryptocurrency" '],
 [' 0.011', '"activity" '],
 [' 0.011', '"allow" '],
 [' 0.011', '"chain" '],
 [' 0.011', '"term" '],
 [' 0.011', '"crypto" '],
 [' 0.011', '"slang" '],
 [' 0.011', '"accessible" '],
 [' 0.008', '"asset" '],
 [' 0.007', '"block" '],
 [' 0.007', '"product" '],
 [' 0.007', '"service" '],
 [' 0.007', '"include" '],
 [' 0.007', '"hold" '],

In [367]:
tops = []

for tip in topics2:   
    tops.append(tip[1][1:-2])
tops

['like',
 'trade',
 'system',
 'create',
 'value',
 'sell',
 'hold',
 'token',
 'ethereum',
 'buy',
 'network',
 'price',
 'cryptocurrency',
 'product',
 'process',
 'different',
 'multiple',
 'copy',
 'datum',
 'nod',
 'blockchain',
 'network',
 'transaction',
 'user',
 'cryptocurrency',
 'activity',
 'allow',
 'chain',
 'term',
 'crypto',
 'slang',
 'accessible',
 'asset',
 'block',
 'product',
 'service',
 'include',
 'hold',
 'investor',
 'walle',
 'blockchain',
 'bitcoin',
 'block',
 'exchange',
 'system',
 'hash',
 'process',
 'use',
 'network',
 'application',
 'node',
 'identity',
 'cryptographic',
 'coin',
 'poa',
 'label',
 'receive',
 'form',
 'project',
 'decisio',
 'go',
 'price',
 'system',
 'cryptocurrency',
 'asset',
 'particular',
 'strategy',
 'long',
 'short',
 'coin',
 'exchange',
 'project',
 'organization',
 'initial',
 'offering',
 'supply',
 'decentralized',
 'drive',
 'large',
 'deman',
 'transaction',
 'smart',
 'contract',
 'support',
 'function',
 'wallet',


### Final Bag of Words

In [378]:
tops

['like',
 'trade',
 'system',
 'create',
 'value',
 'sell',
 'hold',
 'token',
 'ethereum',
 'buy',
 'network',
 'price',
 'cryptocurrency',
 'product',
 'process',
 'different',
 'multiple',
 'copy',
 'datum',
 'nod',
 'blockchain',
 'network',
 'transaction',
 'user',
 'cryptocurrency',
 'activity',
 'allow',
 'chain',
 'term',
 'crypto',
 'slang',
 'accessible',
 'asset',
 'block',
 'product',
 'service',
 'include',
 'hold',
 'investor',
 'walle',
 'blockchain',
 'bitcoin',
 'block',
 'exchange',
 'system',
 'hash',
 'process',
 'use',
 'network',
 'application',
 'node',
 'identity',
 'cryptographic',
 'coin',
 'poa',
 'label',
 'receive',
 'form',
 'project',
 'decisio',
 'go',
 'price',
 'system',
 'cryptocurrency',
 'asset',
 'particular',
 'strategy',
 'long',
 'short',
 'coin',
 'exchange',
 'project',
 'organization',
 'initial',
 'offering',
 'supply',
 'decentralized',
 'drive',
 'large',
 'deman',
 'transaction',
 'smart',
 'contract',
 'support',
 'function',
 'wallet',


### User Defined function for getting cosine similarity score and jaccards similarity score a custom similarity and accuracy score

In [369]:
def cosine(rvector , c , l1 , l2):
    for i in range(len(rvector)):
        c+= l1[i]*l2[i]
    # if c is 0 it means that the there isnt any similarity and we return it as 0 as we could division by zero error
    if int(c) == 0:
        return 0
    return c / float((sum(l1)*sum(l2))**0.5)

In [370]:
def jaccard(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

In [371]:
# Custom function where we check how many words are actually similar between both the current sentence and BagOfWords
def similarity_score(sent , tops):
    sims = 0
    for s in sent:
        for t in tops:
            if s == t:
                sims += 1
    return float(format(sims / len(tops), '.7f'))   # 

In [372]:
def accuracy_score():
    accuracy = 0
    for idx in range(len(testData)):
        if testData['labels'][idx] == testData['predict'][idx]:
            accuracy += 1
    accuracy
    return accuracy / len(testData)

### Generate a Test dataset from multiple datasets

In [373]:
# Generate various datasets to test against
crypto = pd.read_csv('crypto_results.csv')

In [346]:
# We know the crypto dataset contains sentences related to blockchain , crypto and web3
data6 = pd.DataFrame()
data6['sentence'] = crypto['sentence'][:500]
data6

Unnamed: 0,sentence
0,welcome to this site was founded in may 2013 ...
1,here at we work very hard to ensure that all ...
2,each of our coin data pages has a graph that s...
3,we receive updated prices directly from many ...
4,related links new to crypto learn how to buy b...
...,...
495,the review will look into whether regulation o...
496,the key to the long term economic plan is cem...
497,osbornes follows hmrcs decision in march to s...
498,unlike currencies like the pound or dollar vi...


In [294]:
# Create a Final Test Data
testData = pd.DataFrame()
testData = testData.append(data6 , ignore_index = True)
testData['labels'] = "Similar"
testData

Unnamed: 0,sentence,labels
0,welcome to this site was founded in may 2013 ...,Similar
1,here at we work very hard to ensure that all ...,Similar
2,each of our coin data pages has a graph that s...,Similar
3,we receive updated prices directly from many ...,Similar
4,related links new to crypto learn how to buy b...,Similar
...,...,...
495,the review will look into whether regulation o...,Similar
496,the key to the long term economic plan is cem...,Similar
497,osbornes follows hmrcs decision in march to s...,Similar
498,unlike currencies like the pound or dollar vi...,Similar


In [295]:
# Provide a default for the predict column
testData['predict'] = '.'

### Label the predict column based on the similarity score

In [285]:
for idx in range(len(testData)):
    line = testData['sentence'][idx]
    sent = (word_tokenize(line))

    sent = [w for w in sent if not w.lower() in stopwords.words('english')]

    # sw contains the list of stopwords
    l1 =[];l2 =[]

    # remove stop words from the string
    X_set = {w for w in sent} 
    Y_set = {w for w in tops}

    # form a set containing keywords of both strings 
    rvector = X_set.union(Y_set) 
    for w in rvector:
        if w in X_set: l1.append(1) 
        else: l1.append(0)
        if w in Y_set: l2.append(1)
        else: l2.append(0)
            
    c = 0
    
    threshold = 0.001
    similarity = cosine(rvector , c , l1 , l2)
    
    if similarity >= threshold:
        testData['predict'][idx] = 'Similar'
    else:
        testData['predict'][idx] = 'Disimilar'

### Count the number of accurate predictions

In [291]:
# Using Cosine formula
print("Cosine similarity : " , accuracy_score())

Cosine similarity :  0.876


In [287]:
# Create a Final Test Data
testData = pd.DataFrame()
testData = testData.append(data6 , ignore_index = True)
testData['labels'] = "Similar"
testData

Unnamed: 0,sentence,labels
0,welcome to this site was founded in may 2013 ...,Similar
1,here at we work very hard to ensure that all ...,Similar
2,each of our coin data pages has a graph that s...,Similar
3,we receive updated prices directly from many ...,Similar
4,related links new to crypto learn how to buy b...,Similar
...,...,...
495,the review will look into whether regulation o...,Similar
496,the key to the long term economic plan is cem...,Similar
497,osbornes follows hmrcs decision in march to s...,Similar
498,unlike currencies like the pound or dollar vi...,Similar


In [321]:
# Provide a default for the predict column
testData['predict'] = '.'

In [322]:
for idx in range(len(testData)):
    line = testData['sentence'][idx]
    sent = (word_tokenize(line))

    sent = [w for w in sent if not w.lower() in stopwords.words('english')]

    # sw contains the list of stopwords
    l1 =[];l2 =[]

    # remove stop words from the string
    X_set = {w for w in sent} 
    Y_set = {w for w in tops}

    # form a set containing keywords of both strings 
    rvector = X_set.union(Y_set) 
    for w in rvector:
        if w in X_set: l1.append(1) 
        else: l1.append(0)
        if w in Y_set: l2.append(1)
        else: l2.append(0)
            
    c = 0
    
    threshold = 0.0001
    similarity = jaccard(sent , tops)
#    similarity = cosine(rvector , c , l1 , l2)
#    print(similarity , threshold)
    
    if similarity >= threshold:
        testData['predict'][idx] = 'Similar'
    else:
        testData['predict'][idx] = 'Disimilar'

In [323]:
# Using Jaccards formula
print("Jaccards similarity : " , accuracy_score())

Jaccards similarity :  0.79


In [374]:
# Provide a default for the predict column
testData['predict'] = '.'

In [375]:
for idx in range(len(testData)):
    line = testData['sentence'][idx]
    sent = (word_tokenize(line))

    sent = [w for w in sent if not w.lower() in stopwords.words('english')]

    # sw contains the list of stopwords
    l1 =[];l2 =[]

    # remove stop words from the string
    X_set = {w for w in sent} 
    Y_set = {w for w in tops}

    # form a set containing keywords of both strings 
    rvector = X_set.union(Y_set) 
    for w in rvector:
        if w in X_set: l1.append(1) 
        else: l1.append(0)
        if w in Y_set: l2.append(1)
        else: l2.append(0)
            
    c = 0
    
    threshold = 0.00001
#     similarity = similarity_score(sent , tops)
    similarity = jaccard(sent , tops)
#    similarity = cosine(rvector , c , l1 , l2)
#    print(similarity , threshold)
    
    if similarity >= threshold:
        testData['predict'][idx] = 'Similar'
    else:
        testData['predict'][idx] = 'Disimilar'

In [376]:
# Using Custom formula
print("Custom word similarity : " , accuracy_score())

Custom word similarity :  0.844
