# Address Matching with Cosine Similarity

In [1]:
#import libraries
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import string
import re
import nltk
from nltk import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer #cv not used in this analysis
from sklearn.metrics.pairwise import cosine_similarity, csr_matrix
from scipy import sparse
import numpy_indexed as npi
from scipy.sparse import coo_matrix

In [2]:
#read in data
df = pd.read_csv('Data/addresses.csv')
df

Unnamed: 0,address,city,state,zip
0,112 W FOSTER AVE,STATE COLLEGE,PA,16801
1,412 South Trenton Street,Ruston,LA,71270
2,3580 PIEDMONT RD NE STE 207,ATLANTA,GA,30305
3,695 Henderson Dr,Cartersville,GA,30120
4,18221 Torrence Ave Suite 1b,Lansing,IL,60438
...,...,...,...,...
99558,303 MARKET DR,EMPORIA,VA,23847
99559,2685 Armstrong Rd,Wooster,OH,44691
99560,5513 Chamblee Dunwoody Rd,Dunwoody,GA,30338
99561,32 Fruit St,Boston,MA,2115


In [3]:
#inspect nulls/data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99563 entries, 0 to 99562
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   address  99563 non-null  object
 1   city     99563 non-null  object
 2   state    99563 non-null  object
 3   zip      99563 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 3.0+ MB


In [4]:
punct = set(string.punctuation)

#define function to clean text
def clean_text(text):
    # remove punctuations and convert characters to lower case
    text_nopunct = "".join([char.lower() for char in text if char not in punct]) 
    # substitute multiple whitespace with single whitespace
    # Also, removes leading and trailing whitespaces
    text_no_doublespace = re.sub('\s+', ' ', text_nopunct).strip()
    return text_no_doublespace

#columns to clean
cols = ['address', 'city', 'state']

#clean text
for col in cols:
    df[col] = df[col].apply(clean_text)

#concat entire address string for comparison
cols.append('zip')
df['full_concat'] = df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1).str.lower()
df

Unnamed: 0,address,city,state,zip,full_concat
0,112 w foster ave,state college,pa,16801,112 w foster ave state college pa 16801
1,412 south trenton street,ruston,la,71270,412 south trenton street ruston la 71270
2,3580 piedmont rd ne ste 207,atlanta,ga,30305,3580 piedmont rd ne ste 207 atlanta ga 30305
3,695 henderson dr,cartersville,ga,30120,695 henderson dr cartersville ga 30120
4,18221 torrence ave suite 1b,lansing,il,60438,18221 torrence ave suite 1b lansing il 60438
...,...,...,...,...,...
99558,303 market dr,emporia,va,23847,303 market dr emporia va 23847
99559,2685 armstrong rd,wooster,oh,44691,2685 armstrong rd wooster oh 44691
99560,5513 chamblee dunwoody rd,dunwoody,ga,30338,5513 chamblee dunwoody rd dunwoody ga 30338
99561,32 fruit st,boston,ma,2115,32 fruit st boston ma 2115


In [5]:
#this step is only needed if a vectorizer requires tokenization. this column is not used for this analysis
df['tokenized'] = df['full_concat'].apply(nltk.word_tokenize)
df

Unnamed: 0,address,city,state,zip,full_concat,tokenized
0,112 w foster ave,state college,pa,16801,112 w foster ave state college pa 16801,"[112, w, foster, ave, state, college, pa, 16801]"
1,412 south trenton street,ruston,la,71270,412 south trenton street ruston la 71270,"[412, south, trenton, street, ruston, la, 71270]"
2,3580 piedmont rd ne ste 207,atlanta,ga,30305,3580 piedmont rd ne ste 207 atlanta ga 30305,"[3580, piedmont, rd, ne, ste, 207, atlanta, ga..."
3,695 henderson dr,cartersville,ga,30120,695 henderson dr cartersville ga 30120,"[695, henderson, dr, cartersville, ga, 30120]"
4,18221 torrence ave suite 1b,lansing,il,60438,18221 torrence ave suite 1b lansing il 60438,"[18221, torrence, ave, suite, 1b, lansing, il,..."
...,...,...,...,...,...,...
99558,303 market dr,emporia,va,23847,303 market dr emporia va 23847,"[303, market, dr, emporia, va, 23847]"
99559,2685 armstrong rd,wooster,oh,44691,2685 armstrong rd wooster oh 44691,"[2685, armstrong, rd, wooster, oh, 44691]"
99560,5513 chamblee dunwoody rd,dunwoody,ga,30338,5513 chamblee dunwoody rd dunwoody ga 30338,"[5513, chamblee, dunwoody, rd, dunwoody, ga, 3..."
99561,32 fruit st,boston,ma,2115,32 fruit st boston ma 2115,"[32, fruit, st, boston, ma, 2115]"


In [6]:
#remove duplicates
df.drop_duplicates(subset=['full_concat'], inplace=True, ignore_index=True)
df

Unnamed: 0,address,city,state,zip,full_concat,tokenized
0,112 w foster ave,state college,pa,16801,112 w foster ave state college pa 16801,"[112, w, foster, ave, state, college, pa, 16801]"
1,412 south trenton street,ruston,la,71270,412 south trenton street ruston la 71270,"[412, south, trenton, street, ruston, la, 71270]"
2,3580 piedmont rd ne ste 207,atlanta,ga,30305,3580 piedmont rd ne ste 207 atlanta ga 30305,"[3580, piedmont, rd, ne, ste, 207, atlanta, ga..."
3,695 henderson dr,cartersville,ga,30120,695 henderson dr cartersville ga 30120,"[695, henderson, dr, cartersville, ga, 30120]"
4,18221 torrence ave suite 1b,lansing,il,60438,18221 torrence ave suite 1b lansing il 60438,"[18221, torrence, ave, suite, 1b, lansing, il,..."
...,...,...,...,...,...,...
95498,303 market dr,emporia,va,23847,303 market dr emporia va 23847,"[303, market, dr, emporia, va, 23847]"
95499,2685 armstrong rd,wooster,oh,44691,2685 armstrong rd wooster oh 44691,"[2685, armstrong, rd, wooster, oh, 44691]"
95500,5513 chamblee dunwoody rd,dunwoody,ga,30338,5513 chamblee dunwoody rd dunwoody ga 30338,"[5513, chamblee, dunwoody, rd, dunwoody, ga, 3..."
95501,32 fruit st,boston,ma,2115,32 fruit st boston ma 2115,"[32, fruit, st, boston, ma, 2115]"


In [7]:
#define ngrams = 3
def ngrams(string, n=3):
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

#limit DF to 10,000 rows (RAM limitation)
df = df.iloc[0:10_000]
tv = TfidfVectorizer(min_df=1, analyzer=ngrams)
tfidf_matrix = tv.fit_transform(df['full_concat'])
print(tfidf_matrix.shape)

(10000, 7145)


In [8]:
#convert tfidf matrix into a sparse matrix
A_sparse = sparse.csr_matrix(tfidf_matrix)

#calculate cosine similarity
similarities_sparse = cosine_similarity(A_sparse,dense_output=False)

In [9]:
#preview results in df
df_temp = pd.DataFrame.sparse.from_spmatrix(similarities_sparse)
df_temp.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,1.0,0.034764,0.034214,0.029541,0.043288,0.058533,0.013731,0.037343,0.0,0.014235,...,0.05031,0.0,0.0,0.0,0.073283,0.0,0.037592,0.0,0.0,0.020634
1,0.034764,1.0,0.028087,0.016039,0.036426,0.004378,0.0,0.098022,0.0,0.0,...,0.012628,0.08492,0.0,0.0,0.019544,0.009761,0.0,0.0,0.0,0.010233
2,0.034214,0.028087,1.0,0.054055,0.028168,0.061522,0.013789,0.022311,0.0,0.012063,...,0.0,0.014021,0.0,0.0,0.02283,0.012744,0.016438,0.101189,0.0,0.01336
3,0.029541,0.016039,0.054055,1.0,0.0,0.038883,0.0,0.010806,0.0,0.0,...,0.0,0.024321,0.0,0.0,0.0,0.01419,0.0,0.0,0.010329,0.0
4,0.043288,0.036426,0.028168,0.0,1.0,0.007107,0.0,0.035176,0.0,0.03535,...,0.024016,0.0,0.0,0.0,0.019204,0.008077,0.020743,0.0,0.0,0.008467
5,0.058533,0.004378,0.061522,0.038883,0.007107,1.0,0.0,0.009946,0.0,0.062308,...,0.0,0.075218,0.040174,0.0,0.144942,0.0,0.0,0.018191,0.04127,0.0
6,0.013731,0.0,0.013789,0.0,0.0,0.0,1.0,0.0,0.140727,0.013911,...,0.0,0.0,0.058126,0.0,0.012268,0.014697,0.134258,0.017648,0.0,0.015407
7,0.037343,0.098022,0.022311,0.010806,0.035176,0.009946,0.0,1.0,0.0,0.0,...,0.019298,0.0,0.0,0.04815,0.027612,0.065606,0.052095,0.0,0.012502,0.008835
8,0.0,0.0,0.0,0.0,0.0,0.0,0.140727,0.0,1.0,0.0,...,0.0,0.0,0.056322,0.0,0.0,0.0,0.094435,0.0,0.0,0.049571
9,0.014235,0.0,0.012063,0.0,0.03535,0.062308,0.013911,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.060794,0.0362,0.0,0.015439,0.0,0.013478


In [10]:
similarities_sparse.setdiag(0)
df_temp = pd.DataFrame.sparse.from_spmatrix(similarities_sparse)
df_temp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,0.000000,0.034764,0.034214,0.029541,0.043288,0.058533,0.013731,0.037343,0.000000,0.014235,...,0.050310,0.000000,0.000000,0.000000,0.073283,0.000000,0.037592,0.000000,0.000000,0.020634
1,0.034764,0.000000,0.028087,0.016039,0.036426,0.004378,0.000000,0.098022,0.000000,0.000000,...,0.012628,0.084920,0.000000,0.000000,0.019544,0.009761,0.000000,0.000000,0.000000,0.010233
2,0.034214,0.028087,0.000000,0.054055,0.028168,0.061522,0.013789,0.022311,0.000000,0.012063,...,0.000000,0.014021,0.000000,0.000000,0.022830,0.012744,0.016438,0.101189,0.000000,0.013360
3,0.029541,0.016039,0.054055,0.000000,0.000000,0.038883,0.000000,0.010806,0.000000,0.000000,...,0.000000,0.024321,0.000000,0.000000,0.000000,0.014190,0.000000,0.000000,0.010329,0.000000
4,0.043288,0.036426,0.028168,0.000000,0.000000,0.007107,0.000000,0.035176,0.000000,0.035350,...,0.024016,0.000000,0.000000,0.000000,0.019204,0.008077,0.020743,0.000000,0.000000,0.008467
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.000000,0.009761,0.012744,0.014190,0.008077,0.000000,0.014697,0.065606,0.000000,0.036200,...,0.000000,0.000000,0.013537,0.029847,0.019659,0.000000,0.000000,0.091338,0.000000,0.038544
9996,0.037592,0.000000,0.016438,0.000000,0.020743,0.000000,0.134258,0.052095,0.094435,0.000000,...,0.081889,0.000000,0.025216,0.049675,0.000000,0.000000,0.000000,0.000000,0.031974,0.000000
9997,0.000000,0.000000,0.101189,0.000000,0.000000,0.018191,0.017648,0.000000,0.000000,0.015439,...,0.063685,0.020608,0.000000,0.018480,0.000000,0.091338,0.000000,0.000000,0.000000,0.017099
9998,0.000000,0.000000,0.000000,0.010329,0.000000,0.041270,0.000000,0.012502,0.000000,0.000000,...,0.000000,0.014574,0.000000,0.013849,0.000000,0.000000,0.031974,0.000000,0.000000,0.000000


In [11]:
#let's put our  cosine similarity matrix into a coordinate matrix to access index values to isolate matching addresses
similarities_sparse = coo_matrix(similarities_sparse)
col, argmax = npi.group_by(similarities_sparse.col).argmax(similarities_sparse.data)
row = similarities_sparse.row[argmax]

In [12]:
row, col

(array([3805, 7854, 8935, ..., 6830, 7318,  453], dtype=int32),
 array([   0,    1,    2, ..., 9997, 9998, 9999], dtype=int32))

In [13]:
highest_similarity_score_list = df_temp.lookup(row, col)
corresponding_address_list = [df['full_concat'].iloc[row[x]] for x in range(len(df))]

In [14]:
df['highest_cosine_similarity'] = highest_similarity_score_list
df['original_address'] = df['full_concat']
df['comparison_address'] = corresponding_address_list
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['highest_cosine_similarity'] = highest_similarity_score_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['original_address'] = df['full_concat']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['comparison_address'] = corresponding_address_list


Unnamed: 0,address,city,state,zip,full_concat,tokenized,highest_cosine_similarity,original_address,comparison_address
0,112 w foster ave,state college,pa,16801,112 w foster ave state college pa 16801,"[112, w, foster, ave, state, college, pa, 16801]",0.334644,112 w foster ave state college pa 16801,2 college ave mountville pa 17554
1,412 south trenton street,ruston,la,71270,412 south trenton street ruston la 71270,"[412, south, trenton, street, ruston, la, 71270]",0.452594,412 south trenton street ruston la 71270,3481 hwy 33 ruston la 71270
2,3580 piedmont rd ne ste 207,atlanta,ga,30305,3580 piedmont rd ne ste 207 atlanta ga 30305,"[3580, piedmont, rd, ne, ste, 207, atlanta, ga...",0.753691,3580 piedmont rd ne ste 207 atlanta ga 30305,3525 piedmont rd ne atlanta ga 30305
3,695 henderson dr,cartersville,ga,30120,695 henderson dr cartersville ga 30120,"[695, henderson, dr, cartersville, ga, 30120]",0.582179,695 henderson dr cartersville ga 30120,9 medical dr cartersville ga 30121
4,18221 torrence ave suite 1b,lansing,il,60438,18221 torrence ave suite 1b lansing il 60438,"[18221, torrence, ave, suite, 1b, lansing, il,...",0.633644,18221 torrence ave suite 1b lansing il 60438,17653 torrence ave lansing il 60438
...,...,...,...,...,...,...,...,...,...
9995,2141 cass lake rd,keego harbor,mi,48320,2141 cass lake rd keego harbor mi 48320,"[2141, cass, lake, rd, keego, harbor, mi, 48320]",0.652933,2141 cass lake rd keego harbor mi 48320,3435 orchard lake rd ste a keego harbor mi 48320
9996,163 w high ave,new philadelphia,oh,44663,163 w high ave new philadelphia oh 44663,"[163, w, high, ave, new, philadelphia, oh, 44663]",0.867525,163 w high ave new philadelphia oh 44663,2300 e high ave new philadelphia oh 44663
9997,33030 van born rd,wayne,mi,48184,33030 van born rd wayne mi 48184,"[33030, van, born, rd, wayne, mi, 48184]",0.576951,33030 van born rd wayne mi 48184,3106 s wayne rd wayne mi 48184
9998,3940 oakwood hills pkwy,eau claire,wi,54701,3940 oakwood hills pkwy eau claire wi 54701,"[3940, oakwood, hills, pkwy, eau, claire, wi, ...",0.775483,3940 oakwood hills pkwy eau claire wi 54701,3814 oakwood hills parkway eau claire wi 54701


In [15]:
df_threshold = df[df['highest_cosine_similarity'] > .95]
df_threshold

Unnamed: 0,address,city,state,zip,full_concat,tokenized,highest_cosine_similarity,original_address,comparison_address
33,21 montvale ave,stoneham,ma,2180,21 montvale ave stoneham ma 2180,"[21, montvale, ave, stoneham, ma, 2180]",0.985545,21 montvale ave stoneham ma 2180,1 montvale ave stoneham ma 2180
179,1820 franklin st,toronto,oh,43964,1820 franklin st toronto oh 43964,"[1820, franklin, st, toronto, oh, 43964]",0.953593,1820 franklin st toronto oh 43964,1820 franklin street toronto oh 43964
603,1229 garrisonville rd ste 101,stafford,va,22556,1229 garrisonville rd ste 101 stafford va 22556,"[1229, garrisonville, rd, ste, 101, stafford, ...",0.950056,1229 garrisonville rd ste 101 stafford va 22556,1229 garrisonville rd stafford va 22556
680,9069 w olive,peoria,az,85345,9069 w olive peoria az 85345,"[9069, w, olive, peoria, az, 85345]",0.975804,9069 w olive peoria az 85345,9069 w olive ave peoria az 85345
681,1453 east bert kouns industrial loop,shreveport,la,71105,1453 east bert kouns industrial loop shrevepor...,"[1453, east, bert, kouns, industrial, loop, sh...",0.962132,1453 east bert kouns industrial loop shrevepor...,1453 e bert kouns industrial loop shreveport l...
...,...,...,...,...,...,...,...,...,...
9642,123 hospital dr suite 2004,watertown,wi,53098,123 hospital dr suite 2004 watertown wi 53098,"[123, hospital, dr, suite, 2004, watertown, wi...",0.950648,123 hospital dr suite 2004 watertown wi 53098,123 hospital drive suite 2004 watertown wi 53098
9658,1190 prairie st,prairie du sac,wi,53578,1190 prairie st prairie du sac wi 53578,"[1190, prairie, st, prairie, du, sac, wi, 53578]",0.972526,1190 prairie st prairie du sac wi 53578,1190 prairie street prairie du sac wi 53578
9759,600 n koeller st,oshkosh,wi,54902,600 n koeller st oshkosh wi 54902,"[600, n, koeller, st, oshkosh, wi, 54902]",0.960221,600 n koeller st oshkosh wi 54902,480 n koeller st oshkosh wi 54902
9785,20 bristol rd,damariscotta,me,4543,20 bristol rd damariscotta me 4543,"[20, bristol, rd, damariscotta, me, 4543]",0.980317,20 bristol rd damariscotta me 4543,10 bristol rd damariscotta me 4543


In [16]:
df_threshold.sort_values(by=['highest_cosine_similarity'])

Unnamed: 0,address,city,state,zip,full_concat,tokenized,highest_cosine_similarity,original_address,comparison_address
603,1229 garrisonville rd ste 101,stafford,va,22556,1229 garrisonville rd ste 101 stafford va 22556,"[1229, garrisonville, rd, ste, 101, stafford, ...",0.950056,1229 garrisonville rd ste 101 stafford va 22556,1229 garrisonville rd stafford va 22556
7077,1229 garrisonville rd,stafford,va,22556,1229 garrisonville rd stafford va 22556,"[1229, garrisonville, rd, stafford, va, 22556]",0.950056,1229 garrisonville rd stafford va 22556,1229 garrisonville rd ste 101 stafford va 22556
9642,123 hospital dr suite 2004,watertown,wi,53098,123 hospital dr suite 2004 watertown wi 53098,"[123, hospital, dr, suite, 2004, watertown, wi...",0.950648,123 hospital dr suite 2004 watertown wi 53098,123 hospital drive suite 2004 watertown wi 53098
938,123 hospital drive suite 2004,watertown,wi,53098,123 hospital drive suite 2004 watertown wi 53098,"[123, hospital, drive, suite, 2004, watertown,...",0.950648,123 hospital drive suite 2004 watertown wi 53098,123 hospital dr suite 2004 watertown wi 53098
4978,10101 w greenfield ave,west allis,wi,53214,10101 w greenfield ave west allis wi 53214,"[10101, w, greenfield, ave, west, allis, wi, 5...",0.950698,10101 w greenfield ave west allis wi 53214,6101 w greenfield ave west allis wi 53214
...,...,...,...,...,...,...,...,...,...
2372,2800 route 130 n,cinnaminson,nj,8077,2800 route 130 n cinnaminson nj 8077,"[2800, route, 130, n, cinnaminson, nj, 8077]",0.979745,2800 route 130 n cinnaminson nj 8077,2800 route 130 n cinnaminson nj 80773
931,10 bristol rd,damariscotta,me,4543,10 bristol rd damariscotta me 4543,"[10, bristol, rd, damariscotta, me, 4543]",0.980317,10 bristol rd damariscotta me 4543,20 bristol rd damariscotta me 4543
9785,20 bristol rd,damariscotta,me,4543,20 bristol rd damariscotta me 4543,"[20, bristol, rd, damariscotta, me, 4543]",0.980317,20 bristol rd damariscotta me 4543,10 bristol rd damariscotta me 4543
8069,1 montvale ave,stoneham,ma,2180,1 montvale ave stoneham ma 2180,"[1, montvale, ave, stoneham, ma, 2180]",0.985545,1 montvale ave stoneham ma 2180,21 montvale ave stoneham ma 2180


In [17]:
df_threshold.sort_values(by=['highest_cosine_similarity'])

Unnamed: 0,address,city,state,zip,full_concat,tokenized,highest_cosine_similarity,original_address,comparison_address
603,1229 garrisonville rd ste 101,stafford,va,22556,1229 garrisonville rd ste 101 stafford va 22556,"[1229, garrisonville, rd, ste, 101, stafford, ...",0.950056,1229 garrisonville rd ste 101 stafford va 22556,1229 garrisonville rd stafford va 22556
7077,1229 garrisonville rd,stafford,va,22556,1229 garrisonville rd stafford va 22556,"[1229, garrisonville, rd, stafford, va, 22556]",0.950056,1229 garrisonville rd stafford va 22556,1229 garrisonville rd ste 101 stafford va 22556
9642,123 hospital dr suite 2004,watertown,wi,53098,123 hospital dr suite 2004 watertown wi 53098,"[123, hospital, dr, suite, 2004, watertown, wi...",0.950648,123 hospital dr suite 2004 watertown wi 53098,123 hospital drive suite 2004 watertown wi 53098
938,123 hospital drive suite 2004,watertown,wi,53098,123 hospital drive suite 2004 watertown wi 53098,"[123, hospital, drive, suite, 2004, watertown,...",0.950648,123 hospital drive suite 2004 watertown wi 53098,123 hospital dr suite 2004 watertown wi 53098
4978,10101 w greenfield ave,west allis,wi,53214,10101 w greenfield ave west allis wi 53214,"[10101, w, greenfield, ave, west, allis, wi, 5...",0.950698,10101 w greenfield ave west allis wi 53214,6101 w greenfield ave west allis wi 53214
...,...,...,...,...,...,...,...,...,...
2372,2800 route 130 n,cinnaminson,nj,8077,2800 route 130 n cinnaminson nj 8077,"[2800, route, 130, n, cinnaminson, nj, 8077]",0.979745,2800 route 130 n cinnaminson nj 8077,2800 route 130 n cinnaminson nj 80773
931,10 bristol rd,damariscotta,me,4543,10 bristol rd damariscotta me 4543,"[10, bristol, rd, damariscotta, me, 4543]",0.980317,10 bristol rd damariscotta me 4543,20 bristol rd damariscotta me 4543
9785,20 bristol rd,damariscotta,me,4543,20 bristol rd damariscotta me 4543,"[20, bristol, rd, damariscotta, me, 4543]",0.980317,20 bristol rd damariscotta me 4543,10 bristol rd damariscotta me 4543
8069,1 montvale ave,stoneham,ma,2180,1 montvale ave stoneham ma 2180,"[1, montvale, ave, stoneham, ma, 2180]",0.985545,1 montvale ave stoneham ma 2180,21 montvale ave stoneham ma 2180


In [18]:
# pd.set_option('display.max_columns')
pd.set_option('display.max_rows', 100)
x = df_threshold.drop(['full_concat', 'tokenized', 'original_address', 'comparison_address'], axis=1).sort_values(
    by=['highest_cosine_similarity'])
x.tail(10)

Unnamed: 0,address,city,state,zip,highest_cosine_similarity
5395,1100 sw saint lucie west blvd,port st lucie,fl,34986,0.975452
1125,1100 sw saint lucie west blvd,port saint lucie,fl,34986,0.975452
680,9069 w olive,peoria,az,85345,0.975804
4071,9069 w olive ave,peoria,az,85345,0.975804
755,2800 route 130 n,cinnaminson,nj,80773,0.979745
2372,2800 route 130 n,cinnaminson,nj,8077,0.979745
931,10 bristol rd,damariscotta,me,4543,0.980317
9785,20 bristol rd,damariscotta,me,4543,0.980317
8069,1 montvale ave,stoneham,ma,2180,0.985545
33,21 montvale ave,stoneham,ma,2180,0.985545


In [19]:
# cv = CountVectorizer()
# cv_matrix = cv.fit_transform(df['full_concat'])
# print(cv_matrix.shape)

In [20]:
# pip install sparse_dot_topn

In [21]:
# from scipy.sparse import csr_matrix
# import sparse_dot_topn.sparse_dot_topn as ct

# def awesome_cossim_top(A, B, ntop, lower_bound=0):
#     # force A and B as a CSR matrix.
#     # If they have already been CSR, there is no overhead
#     A = A.tocsr()
#     B = B.tocsr()
#     M, _ = A.shape
#     _, N = B.shape
 
#     idx_dtype = np.int32
 
#     nnz_max = M*ntop
 
#     indptr = np.zeros(M+1, dtype=idx_dtype)
#     indices = np.zeros(nnz_max, dtype=idx_dtype)
#     data = np.zeros(nnz_max, dtype=A.dtype)
    
# ct.sparse_dot_topn(
#         M, N, np.asarray(A.indptr, dtype=idx_dtype),
#         np.asarray(A.indices, dtype=idx_dtype),
#         A.data,
#         np.asarray(B.indptr, dtype=idx_dtype),
#         np.asarray(B.indices, dtype=idx_dtype),
#         B.data,
#         ntop,
#         lower_bound,
#         indptr, indices, data)

# return csr_matrix((data,indices,indptr),shape=(M,N))