In [1]:
import pandas as pd
import numpy as np
import re
import collections
import time
import recordlinkage
import jellyfish

In [4]:
# load openhouse
openhouse = pd.read_csv('open-houses-montreal.csv',encoding='utf-8')

# keep openhouse with muncipality id
openhouse = openhouse[~openhouse.muncipality_id.isna()]

# keep a copy of the original data
house_original = openhouse.copy()

# keep only the useful columns
openhouse = openhouse[['muncipality_id','category','listing_price','sales_type','address','longitude',
                       'latitude','year_built','retrieved_at']]

# force convert muncipality_id to be integer
openhouse.muncipality_id = openhouse.muncipality_id.astype('int')

# split the address
openhouse.address = openhouse.address.apply(lambda x: [i.replace(' - ','-').rstrip(' ') for i in x.split(', ')])

# generate a list of muncipality with their id

In [5]:
muncipality_name = dict()

for muncipality_id in np.unique(openhouse.muncipality_id):
    
    if len(openhouse[openhouse.muncipality_id==muncipality_id]) > 20:
    
        L = [j for i in openhouse[openhouse.muncipality_id==muncipality_id].address for j in i]

        c = collections.Counter(L)

        muncipality_name[muncipality_id] = c.most_common()[0][0]

# remove neighbourhood and muncipality name from address, if they exists

In [7]:
neighbourhood = [np.nan] * len(openhouse)
start = time.time()
for i in range(len(openhouse)):
        
    try:
        openhouse.iloc[i,4].remove(muncipality_name[openhouse.muncipality_id.iloc[i]])
    except:
        pass
    
    for j in openhouse.iloc[i,4]:
        if 'Neighbourhood' in j:
            neighbourhood[i] = j
            openhouse.iloc[i,4].remove(j)

openhouse['neighbourhood'] = neighbourhood

In [8]:
openhouse

Unnamed: 0,muncipality_id,category,listing_price,sales_type,address,longitude,latitude,year_built,retrieved_at,neighbourhood
0,830,House,1199000.0,sale,"[7, Place Raymond]",-73.892341,45.443783,1999.0,2018-07-25,Neighbourhood North West
1,815,Condo,174900.0,sale,"[6801, boulevard des Roseraies, apt. 303]",-73.555285,45.597026,1990.0,2018-07-25,
2,843,Condo,749000.0,sale,"[1210, boulevard De Maisonneuve Ouest, apt. 21A]",-73.575657,45.499809,2006.0,2018-07-25,Neighbourhood Golden Square Mile
3,836,Condo,289000.0,sale,"[120, Place Donnacona, apt. 302]",-73.809583,45.486850,2004.0,2018-07-25,Neighbourhood Central
4,845,Triplex,498000.0,sale,"[2555-2559, Rue Lyall]",-73.530962,45.583558,1984.0,2018-07-25,Neighbourhood Mercier
5,838,Condo,925000.0,sale,"[6107, Avenue Somerled]",-73.633904,45.472869,1937.0,2018-07-25,Neighbourhood Notre-Dame-de-Grâce
6,820,Condo,239000.0,sale,"[1910, boulevard Guy-Bouchard]",-73.609844,45.444632,1997.0,2018-07-25,
7,845,Condo,189000.0,sale,"[2560, Avenue Bennett, apt. 3]",-73.547969,45.557645,1992.0,2018-07-25,Neighbourhood Hochelaga-Maisonneuve
8,840,House,359000.0,sale,"[12364, Rue Olivier]",-73.732941,45.526124,1964.0,2018-07-25,Neighbourhood Bois de Saraguay
9,836,House,869000.0,sale,"[229, Rue Mirabel]",-73.842373,45.483073,1999.0,2018-07-25,Neighbourhood Central


In [9]:
len_address = openhouse.address.apply(lambda x: len(x))
collections.Counter(len_address)

Counter({1: 159, 2: 18693, 3: 16032, 4: 19, 5: 2})

# address lenth >3 are mostly un-popular munipality

In [11]:
for i in openhouse.address[len_address>3].index:
    print openhouse.loc[i].muncipality_id 
    print openhouse.loc[i].address

279
[u'2663', u'Chemin Sainte-Foy', u'apt. 108', u'Sainte-Foy/Sillery/Cap-Rouge (Qu\xe9bec)']
115
[u'1013', u'Rue du Parc', u'Rimouski', u'Quartier Pointe-au-P\xe8re']
276
[u'1195', u'Avenue Royale', u'Beauport (Qu\xe9bec)', u'Quartier Vieux-Bourg']
1264
[u'2021', u'Rue Saint-Jean-Baptiste', u'Jonqui\xe8re (Saguenay)', u'Quartier Jonqui\xe8re']
804
[u'587', u'Rue Guillemette', u'app. 86', u'Fabreville (Laval)', u'Quartier Est']
281
[u'15161518', u"Rue de l'Esplanade", u'La Haute-Saint-Charles (Qu\xe9bec)', u'Quartier Val-B\xe9lair']
1133
[u'1', u'Chemin du Lac-Gueguen', u"Val-d'Or", u'Quartier Ext\xe9rieur Est']
1085
[u'358A360', u'Rue Pauly', u'Rouyn-Noranda', u'Quartier Noranda-Nord']
463
[u'183185', u'Avenue Godefroy', u'B\xe9cancour', u'Quartier Saint-Gr\xe9goire']
843
[u'207', u'Rue de la Commune Ouest', u'apt. RDC', u'2,3']
276
[u'127', u'Rue Latouche', u'Beauport (Qu\xe9bec)', u'Quartier Chutes-Montmorency']
1013
[u'994', u'boulevard Maloney Est', u'Gatineau (Gatineau)', u'Quart

# address lenth == 1 are mostly lot, land, business or without listing price

In [12]:
short_add = openhouse[len_address==1]

In [13]:
short_add[(short_add.category!='Lot') & (short_add.category!='Land') & (short_add.listing_price>1)]

Unnamed: 0,muncipality_id,category,listing_price,sales_type,address,longitude,latitude,year_built,retrieved_at,neighbourhood
4745,842,Business,155000.0,sale,[Rue Saint-Denis],-73.569246,45.51752,,2018-06-07,Neighbourhood Le Plateau-Mont-Royal
25781,844,Business,150000.0,sale,[Rue Beaubien Est],-73.588341,45.552852,,2018-01-23,Neighbourhood Rosemont North
25932,846,Income properties,1550000.0,sale,[Rue Notre-Dame Est],-73.496554,45.683795,1958.0,2018-01-23,Neighbourhood Pointe-aux-Trembles
33420,838,Business,2400000.0,sale,[Rue Dalou],-73.625196,45.482885,,2018-01-22,Neighbourhood Côte-des-Neiges
33533,840,Business,750000.0,sale,[boulevard Gouin Ouest],-73.723304,45.529636,,2018-01-22,Neighbourhood Cartierville
34103,817,Business,365000.0,sale,[boulevard Henri-Bourassa Est],-73.648396,45.587735,,2018-01-22,


# address lenth == 3 and without apt. or suite are mostly un-popular munipality

In [14]:
openhouse[(len_address==3)][openhouse.address[(len_address==3)].apply(lambda x: len(set(['apt.','suite'])
                                                               .intersection(set(x[2].split(' '))))==0)]

Unnamed: 0,muncipality_id,category,listing_price,sales_type,address,longitude,latitude,year_built,retrieved_at,neighbourhood
2156,289,House,,sale,"[254, Rue de l'Acadie, Les Chutes-de-la-Chaudi...",-71.31103,46.70716,1978.0,2018-07-03,Neighbourhood Saint-Nicolas
2158,118,House,,sale,"[8, 6e Avenue, Saint-Fabien]",-68.868167,48.293496,0.0,2018-07-03,
2159,593,Lot,,sale,"[33, Carré George-Adams, Bromont]",-72.782772,45.276457,,2018-07-03,
2160,747,House,,sale,"[3775, Rue La Durantaye, Saint-Hubert (Longueu...",-73.358086,45.454009,2009.0,2018-07-03,Neighbourhood Le Boisé de Saint-Hubert
2161,799,House,,sale,"[2791, Rue Chauvette, Mascouche]",-73.606795,45.754472,1975.0,2018-07-03,Neighbourhood City
2162,281,House,,sale,"[1000, Rue du Cabestan, La Haute-Saint-Charles...",-71.426416,46.845477,2009.0,2018-07-03,Neighbourhood Val-Bélair
2163,928,House,,sale,"[2129, Terrasse Jourdain, Sainte-Sophie]",-73.8902,45.82122,1978.0,2018-07-03,
2164,948,House,,sale,"[2105, Chemin de la Baie-Noire, Wentworth-Nord]",-74.442791,45.760502,1954.0,2018-07-03,
2165,907,House,,sale,"[1240, Montée Noire, Sainte-Justine-de-Newton]",-74.399402,45.379873,2010.0,2018-07-03,
2166,673,House,,sale,"[4400, Chemin du Lac, Saint-Gabriel-de-Brandon]",-73.340222,46.262654,0.0,2018-07-03,


# process address lenth==3 records with apt. or suite. no.

In [10]:
openhouse_3 = openhouse[(len_address==3)][openhouse.address[(len_address==3)].apply(lambda x: len(set(['apt.','suite'])
                                                               .intersection(set(x[2].split(' '))))>0)]

In [11]:
openhouse_3['apt'] = openhouse_3.address.apply(lambda x: x[2].replace('apt.','').replace('suite','').replace(' ',''))

In [12]:
openhouse_3['No.'] = openhouse_3.address.apply(lambda x: x[0])

openhouse_3['street'] = openhouse_3.address.apply(lambda x: x[1])

In [13]:
openhouse_3

Unnamed: 0,muncipality_id,category,listing_price,sales_type,address,longitude,latitude,year_built,retrieved_at,neighbourhood,apt,No.,street
1,815,Condo,174900.0,sale,"[6801, boulevard des Roseraies, apt. 303]",-73.555285,45.597026,1990.0,2018-07-25,,303,6801,boulevard des Roseraies
2,843,Condo,749000.0,sale,"[1210, boulevard De Maisonneuve Ouest, apt. 21A]",-73.575657,45.499809,2006.0,2018-07-25,Neighbourhood Golden Square Mile,21A,1210,boulevard De Maisonneuve Ouest
3,836,Condo,289000.0,sale,"[120, Place Donnacona, apt. 302]",-73.809583,45.486850,2004.0,2018-07-25,Neighbourhood Central,302,120,Place Donnacona
7,845,Condo,189000.0,sale,"[2560, Avenue Bennett, apt. 3]",-73.547969,45.557645,1992.0,2018-07-25,Neighbourhood Hochelaga-Maisonneuve,3,2560,Avenue Bennett
10,822,Condo,350000.0,sale,"[5740, Avenue Rembrandt, apt. 602]",-73.659603,45.476015,1980.0,2018-07-25,,602,5740,Avenue Rembrandt
13,827,Condo,339000.0,sale,"[3175, Avenue Ernest-Hemingway, apt. 307]",-73.719273,45.506360,2007.0,2018-07-25,Neighbourhood New Saint-Laurent/Bois-Franc,307,3175,Avenue Ernest-Hemingway
14,843,Condo,1325000.0,sale,"[1455, Rue Sherbrooke Ouest, apt. 1601]",-73.580818,45.497798,1966.0,2018-07-25,Neighbourhood Golden Square Mile,1601,1455,Rue Sherbrooke Ouest
16,840,Condo,279000.0,sale,"[1475, Place de Louvain, apt. 6]",-73.643332,45.562717,1986.0,2018-07-25,Neighbourhood Ahuntsic West,6,1475,Place de Louvain
24,815,Condo,229500.0,sale,"[7200, Avenue M-B-Jodoin, apt. 807]",-73.583221,45.606265,1987.0,2018-07-25,,807,7200,Avenue M-B-Jodoin
25,843,Condo,1100000.0,sale,"[801, Rue de la Commune Est, apt. 804]",-73.549404,45.512143,2007.0,2018-07-25,Neighbourhood Old Montréal,804,801,Rue de la Commune Est


In [14]:
start = time.time()

pc = recordlinkage.BlockIndex(on=['muncipality_id','category','sales_type'])

pairs = pc.index(openhouse_3)

print 'Index takes:',time.time()-start

Index takes: 7.62789797783


In [20]:
len(pairs)

7356551

In [120]:
start = time.time()

pc1 = recordlinkage.BlockIndex(on=['muncipality_id'])

pairs = pc1.index(openhouse_3)

print 'Index takes:',time.time()-start

print len(pairs)

Index takes: 16.842028141
16202869


In [121]:
start = time.time()

pc2 = recordlinkage.BlockIndex(on=['muncipality_id','category'])

pairs = pc2.index(openhouse_3)

print 'Index takes:',time.time()-start

print len(pairs)

Index takes: 11.3599848747
7371599


In [137]:
compare_house = recordlinkage.Compare()

compare_house.string('street','street', method='jarowinkler', threshold=0.95, label='street')

compare_house.exact('apt','apt',label='apt')

compare_house.exact('No.','No.',label='No.')

<Compare>

In [15]:
openhouse_3

Unnamed: 0,muncipality_id,category,listing_price,sales_type,address,longitude,latitude,year_built,retrieved_at,neighbourhood,apt,No.,street
1,815,Condo,174900.0,sale,"[6801, boulevard des Roseraies, apt. 303]",-73.555285,45.597026,1990.0,2018-07-25,,303,6801,boulevard des Roseraies
2,843,Condo,749000.0,sale,"[1210, boulevard De Maisonneuve Ouest, apt. 21A]",-73.575657,45.499809,2006.0,2018-07-25,Neighbourhood Golden Square Mile,21A,1210,boulevard De Maisonneuve Ouest
3,836,Condo,289000.0,sale,"[120, Place Donnacona, apt. 302]",-73.809583,45.486850,2004.0,2018-07-25,Neighbourhood Central,302,120,Place Donnacona
7,845,Condo,189000.0,sale,"[2560, Avenue Bennett, apt. 3]",-73.547969,45.557645,1992.0,2018-07-25,Neighbourhood Hochelaga-Maisonneuve,3,2560,Avenue Bennett
10,822,Condo,350000.0,sale,"[5740, Avenue Rembrandt, apt. 602]",-73.659603,45.476015,1980.0,2018-07-25,,602,5740,Avenue Rembrandt
13,827,Condo,339000.0,sale,"[3175, Avenue Ernest-Hemingway, apt. 307]",-73.719273,45.506360,2007.0,2018-07-25,Neighbourhood New Saint-Laurent/Bois-Franc,307,3175,Avenue Ernest-Hemingway
14,843,Condo,1325000.0,sale,"[1455, Rue Sherbrooke Ouest, apt. 1601]",-73.580818,45.497798,1966.0,2018-07-25,Neighbourhood Golden Square Mile,1601,1455,Rue Sherbrooke Ouest
16,840,Condo,279000.0,sale,"[1475, Place de Louvain, apt. 6]",-73.643332,45.562717,1986.0,2018-07-25,Neighbourhood Ahuntsic West,6,1475,Place de Louvain
24,815,Condo,229500.0,sale,"[7200, Avenue M-B-Jodoin, apt. 807]",-73.583221,45.606265,1987.0,2018-07-25,,807,7200,Avenue M-B-Jodoin
25,843,Condo,1100000.0,sale,"[801, Rue de la Commune Est, apt. 804]",-73.549404,45.512143,2007.0,2018-07-25,Neighbourhood Old Montréal,804,801,Rue de la Commune Est


In [140]:
start = time.time()

features = compare_house.compute(pairs, openhouse_3, openhouse_3)

print 'Take',time.time()-start

Take 95.2610638142


In [141]:
indexs = features[features.sum(axis=1)>2]

In [142]:
indexs

Unnamed: 0,Unnamed: 1,street,apt,No.
362,5766,1.0,1,1
2287,14593,1.0,1,1
7652,8373,1.0,1,1
917,15794,1.0,1,1
221,11832,1.0,1,1
7872,14962,1.0,1,1
4400,4974,1.0,1,1
13039,15420,1.0,1,1
224,6207,1.0,1,1
473,7345,1.0,1,1


In [143]:
i = 4400
openhouse_3.loc[[i]+list(indexs.loc[i].index)]

Unnamed: 0,muncipality_id,category,listing_price,sales_type,address,longitude,latitude,year_built,retrieved_at,neighbourhood,apt,No.,street
4400,843,Condo,625000.0,sale,"[1000, Rue de la Commune Est, apt. 518]",-73.549257,45.512909,2004.0,2018-04-23,Neighbourhood Old Montréal,518,1000,Rue de la Commune Est
4974,843,Condo / Apartment,0.0,rent,"[1000, Rue de la Commune Est, apt. 518]",-73.549257,45.512909,2004.0,2018-04-14,Neighbourhood Old Montréal,518,1000,Rue de la Commune Est


# process address lenth==2 records

In [19]:
openhouse_2 = openhouse[(len_address==2)]

In [20]:
openhouse_2.loc[:,'street'] = openhouse_2.address.apply(lambda x:x[1])

openhouse_2.loc[:,'No.'] = openhouse_2.address.apply(lambda x:x[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)


In [149]:
start = time.time()

pc3 = recordlinkage.BlockIndex(on=['muncipality_id','category','sales_type'])

pairs = pc3.index(openhouse_2)

print 'Index takes:',time.time()-start

print len(pairs)

Index takes: 2.46398806572
1892300


In [151]:
compare_house = recordlinkage.Compare()

compare_house.string('street','street', method='jarowinkler', threshold=0.95, label='street')

compare_house.exact('No.','No.',label='No.')

<Compare>

In [154]:
start = time.time()

features = compare_house.compute(pairs, openhouse_2, openhouse_2)

print 'Take',time.time()-start

Take 10.1757278442


In [156]:
indexs = features[features.sum(axis=1)>1]

In [157]:
indexs

Unnamed: 0,Unnamed: 1,street,No.
419,5076,1.0,1
4429,17210,1.0,1
3327,5181,1.0,1
10552,10626,1.0,1
2671,18064,1.0,1
4685,11853,1.0,1
9203,15424,1.0,1
7304,18214,1.0,1
2655,17171,1.0,1
8672,14448,1.0,1


# process property assessment

In [2]:
property_assess_full = pd.read_csv('montreal-property-assessment.csv',encoding='utf-8')

property_assess_full = property_assess_full.drop_duplicates(['full_address','neighbourhood_unit_number','taxable_value_of_building'])

In [48]:
property_assess = property_assess_full[['full_address','borough','neighbourhood_unit_number','taxable_value_of_building','retrieved_at']]

property_assess.loc[:, 'full_address'] = property_assess.full_address.apply(lambda x:x.replace(' - ','-'))

In [49]:
map_muncipality_code = dict()
for i in collections.Counter(property_assess.borough).keys()[1:]:
    
    print i
    
    dis = pd.Series()
    for j in muncipality_name:
        dis.set_value(j, 
                      jellyfish.jaro_winkler(i.replace('Arrondissement de ','').replace("Arrondissement d'",'')
                                             .replace("Arrondissement du ",''),muncipality_name[j]))
        
    print np.argmax(dis),dis.max(),'\n'
    
    map_muncipality_code[i] = np.argmax(dis)

Arrondissement de LaSalle
820 0.8777777777777778 

Arrondissement de Lachine
823 0.8777777777777778 

Arrondissement de Ville-Marie
843 0.9 

Arrondissement de Côte-des-Neiges - Notre-Dame-de-Grâce
838 0.8842202450898103 

Arrondissement de Pierrefonds - Roxboro
835 0.8838095238095239 

Arrondissement de Saint-Léonard
816 0.9083333333333333 

Arrondissement d'Outremont
825 0.89 

Arrondissement du Plateau-Mont-Royal
842 0.8171296296296297 

Arrondissement de Montréal-Nord
817 0.9083333333333333 

Arrondissement de Rivière-des-Prairies - Pointe-aux-Trembles
846 0.8973389355742297 

Arrondissement de Verdun
819 0.8375 

Arrondissement de Rosemont - La Petite-Patrie
844 0.868074074074074 

Arrondissement de Saint-Laurent


  # Remove the CWD from sys.path while we load stuff.


827 0.9083333333333333 

Arrondissement d'Anjou
815 0.8625 

Arrondissement d'Ahuntsic-Cartierville
840 0.93125 

Arrondissement du Sud-Ouest
839 0.6859903381642511 

Arrondissement de Villeray - Saint-Michel - Parc-Extension
841 0.8496504559270517 

Arrondissement de L'Île-Bizard - Sainte-Geneviève
837 0.876924360400445 

Arrondissement de Mercier - Hochelaga-Maisonneuve
845 0.8849308755760369 



In [54]:
#property_assess['muncipality_id'] = 
muncipality_id = [np.nan] * len(property_assess)
for i in range(len(property_assess)):
    try:
        muncipality_id[i] = map_muncipality_code[property_assess.iloc[i,1]]
    except:
        pass

In [55]:
property_assess.loc[:,'muncipality_id'] = muncipality_id

In [56]:
property_assess

Unnamed: 0,full_address,borough,neighbourhood_unit_number,taxable_value_of_building,retrieved_at,muncipality_id
0,6937-6951 Avenue Mousseau,Arrondissement d'Anjou,709.0,315200.0,2018-07-11,815.0
1,"Boulevard Henri-Bourassa Est, LOT ARR.",Arrondissement de Rivière-des-Prairies - Point...,1937.0,483700.0,2018-07-11,846.0
2,9229 Rue Pierre-Bonne,Arrondissement de Rivière-des-Prairies - Point...,1928.0,5500000.0,2018-07-11,846.0
3,4330-4336 Rue de Verdun,Arrondissement de Verdun,1088.0,402500.0,2018-07-11,819.0
4,438-440 Rue Saint-Pierre,Arrondissement de Ville-Marie,2042.0,2945700.0,2018-07-11,843.0
5,"Rue Jean-Talon Est, LOT ARR",Arrondissement de Saint-Léonard,1276.0,901000.0,2018-07-11,816.0
6,901 Rue Maheu,Arrondissement de Saint-Laurent,1354.0,606500.0,2018-07-11,827.0
7,2575 Rue Remembrance,Arrondissement de Lachine,3651.0,613000.0,2018-07-11,823.0
8,8201 Place Marien,,1750.0,4800000.0,2018-07-11,
9,7052 Croissant de la Berge,Arrondissement d'Anjou,719.0,570600.0,2018-07-11,815.0


In [57]:
property_assess.loc[:,'full_address'] = property_assess.full_address.apply(lambda x: x.replace('LOT','').replace('ARR.','').rstrip(', '))

In [58]:
full_address_withcomma = property_assess[property_assess.full_address.apply(lambda x: ',' in x)]

In [59]:
collections.Counter(full_address_withcomma['full_address'].apply(lambda x: len(x.split(','))))

Counter({2: 129078, 3: 2250, 4: 35, 5: 16, 6: 1, 8: 1})

In [60]:
full_address_withcomma[full_address_withcomma['full_address'].apply(lambda x: len(x.split(','))==8)]

Unnamed: 0,full_address,borough,neighbourhood_unit_number,taxable_value_of_building,retrieved_at,muncipality_id
144495,"1625 Avenue Lincoln, Suite R23-R29, R23,R24,R2...",Arrondissement de Ville-Marie,2270.0,719300.0,2018-07-11,843.0


In [36]:
full_address_withcomma[full_address_withcomma['full_address'].apply(lambda x: len(x.split(','))==6)]

Unnamed: 0,full_address,borough,neighbourhood_unit_number,taxable_value_of_building,retrieved_at,muncipality_id
61944,"60 Rue De Brésoles, Suite 424, 180-412,3,512,3",Arrondissement de Ville-Marie,2125.0,768500.0,2018-07-11,843.0


In [37]:
full_address_withcomma[full_address_withcomma['full_address'].apply(lambda x: len(x.split(','))==4)]

Unnamed: 0,full_address,borough,neighbourhood_unit_number,taxable_value_of_building,retrieved_at,muncipality_id
61930,"60 Rue De Brésoles, Suite 220 - 223, CAD 180-2...",Arrondissement de Ville-Marie,2125.0,603000.0,2018-07-11,843.0
61932,"60 Rue De Brésoles, Suite 410, CAD 180-402,502",Arrondissement de Ville-Marie,2125.0,730500.0,2018-07-11,843.0
61936,"60 Rue De Brésoles, Suite 415, CAD 180-405,505",Arrondissement de Ville-Marie,2125.0,352300.0,2018-07-11,843.0
61937,"60 Rue De Brésoles, Suite 416, CAD 180-406,506",Arrondissement de Ville-Marie,2125.0,362000.0,2018-07-11,843.0
61938,"60 Rue De Brésoles, Suite 417, CAD 180-407,507",Arrondissement de Ville-Marie,2125.0,322500.0,2018-07-11,843.0
61941,"60 Rue De Brésoles, Suite 419, CAD 180-409,509",Arrondissement de Ville-Marie,2125.0,371000.0,2018-07-11,843.0
61945,"60 Rue De Brésoles, Suite 425, CAD 180-414,514",Arrondissement de Ville-Marie,2125.0,624000.0,2018-07-11,843.0
61947,"60 Rue De Brésoles, Suite 427, CAD 180-416,516",Arrondissement de Ville-Marie,2125.0,1085200.0,2018-07-11,843.0
61964,"65 Rue Saint-Paul Ouest, Suite 603, CAD 182-60...",Arrondissement de Ville-Marie,2119.0,530200.0,2018-07-11,843.0
61965,"65 Rue Saint-Paul Ouest, Suite 604, CAD 182-60...",Arrondissement de Ville-Marie,2119.0,451900.0,2018-07-11,843.0


In [38]:
full_address_withcomma[full_address_withcomma['full_address'].apply(lambda x: len(x.split(','))==3)]

Unnamed: 0,full_address,borough,neighbourhood_unit_number,taxable_value_of_building,retrieved_at,muncipality_id
291,"Chemin Saint-François, , PARTIE DU 1523050",,75.0,0.0,2018-07-11,
1155,"Rue William-Chapman, PARC, GUILLAUME-BRUNEAU",Arrondissement de Saint-Laurent,1356.0,0.0,2018-07-11,827.0
3252,"4250 Rue Saint-Ambroise, Suite 504, ...",Arrondissement du Sud-Ouest,4051.0,500400.0,2018-07-11,839.0
4077,"3603 Rue Saint-Denis, Suite 302, 1387-302",Arrondissement du Plateau-Mont-Royal,2574.0,256100.0,2018-07-11,842.0
4710,"294 Rue du Square-Saint-Louis, Suite 9, 1162-501",Arrondissement du Plateau-Mont-Royal,2492.0,1564200.0,2018-07-11,842.0
8713,"6851 Boulevard des Roseraies, Suite 303, 1",Arrondissement d'Anjou,751.0,231400.0,2018-07-11,815.0
8716,"6851 Boulevard des Roseraies, Suite 406, 1",Arrondissement d'Anjou,751.0,251000.0,2018-07-11,815.0
8717,"6851 Boulevard des Roseraies, Suite 401, 1",Arrondissement d'Anjou,751.0,219900.0,2018-07-11,815.0
8718,"6851 Boulevard des Roseraies, Suite 403, 1",Arrondissement d'Anjou,751.0,232700.0,2018-07-11,815.0
8721,"6851 Boulevard des Roseraies, Suite 506, 1",Arrondissement d'Anjou,751.0,252700.0,2018-07-11,815.0


# full_address with one 1 comma

In [61]:
full_address2 = full_address_withcomma[full_address_withcomma['full_address'].apply(lambda x: len(x.split(','))==2)]

In [62]:
rx = '[' + re.escape(''.join(['(',')','#'])) + ']'
#re.sub(rx,'',x)

In [63]:
full_address2.loc[:,'street_no.']  = full_address2['full_address'].apply(lambda x: x.split(',')[0])
full_address2.loc[:,'apt']  = full_address2['full_address'].apply(lambda x: ' '.join([i for i in re.sub(rx,' ',x.split(',')[1])
                                                        .split(' ') if len(i)>0 and i[0].isnumeric()]))

In [64]:
full_address2

Unnamed: 0,full_address,borough,neighbourhood_unit_number,taxable_value_of_building,retrieved_at,muncipality_id,street_no.,apt
5,"Rue Jean-Talon Est, ARR",Arrondissement de Saint-Léonard,1276.0,901000.0,2018-07-11,816.0,Rue Jean-Talon Est,
12,"555 Rue de la Commune Ouest, Suite 409",Arrondissement de Ville-Marie,2150.0,891200.0,2018-07-11,843.0,555 Rue de la Commune Ouest,409
13,"1 Rue McGill, Suite 1101",Arrondissement de Ville-Marie,1999.0,1405900.0,2018-07-11,843.0,1 Rue McGill,1101
25,"2642 Rue Aylwin, Suite 8",Arrondissement de Mercier - Hochelaga-Maisonneuve,2827.0,162300.0,2018-07-11,845.0,2642 Rue Aylwin,8
27,"470 Rue Saint-Alexis, Suite 102",Arrondissement de Ville-Marie,1997.0,163900.0,2018-07-11,843.0,470 Rue Saint-Alexis,102
29,"268 Avenue Lanthier, STAT. EXT. #8",,6161.0,10500.0,2018-07-11,,268 Avenue Lanthier,8
36,"1re Avenue, RANGEMENT",Arrondissement de Rosemont - La Petite-Patrie,3160.0,28300.0,2018-07-11,844.0,1re Avenue,
43,"1420 Rue Sherbrooke Ouest, Suite 803",Arrondissement de Ville-Marie,6260.0,2180000.0,2018-07-11,843.0,1420 Rue Sherbrooke Ouest,803
64,"Boulevard LaSalle, ARR",Arrondissement de Verdun,1099.0,0.0,2018-07-11,819.0,Boulevard LaSalle,
74,"5417-5431 Rue Garnier, STAT.",Arrondissement du Plateau-Mont-Royal,2684.0,18300.0,2018-07-11,842.0,5417-5431 Rue Garnier,


In [65]:
full_address2.loc[full_address2['apt'].apply(lambda x: len(x)==0),'apt'] = np.nan

In [66]:
def get_number(x):
    words = x.split(' ')
    if words[0].replace('-','').isnumeric():
        return words[0],' '.join(words[1:])
    else:
        return np.nan,' '.join(words)

In [67]:
temp = full_address2['street_no.'].apply(lambda x: get_number(x))

In [68]:
full_address2.loc[:,'no.'] = temp.apply(lambda x: x[0])
full_address2.loc[:,'street'] = temp.apply(lambda x: x[1])

In [69]:
full_address2

Unnamed: 0,full_address,borough,neighbourhood_unit_number,taxable_value_of_building,retrieved_at,muncipality_id,street_no.,apt,no.,street
5,"Rue Jean-Talon Est, ARR",Arrondissement de Saint-Léonard,1276.0,901000.0,2018-07-11,816.0,Rue Jean-Talon Est,,,Rue Jean-Talon Est
12,"555 Rue de la Commune Ouest, Suite 409",Arrondissement de Ville-Marie,2150.0,891200.0,2018-07-11,843.0,555 Rue de la Commune Ouest,409,555,Rue de la Commune Ouest
13,"1 Rue McGill, Suite 1101",Arrondissement de Ville-Marie,1999.0,1405900.0,2018-07-11,843.0,1 Rue McGill,1101,1,Rue McGill
25,"2642 Rue Aylwin, Suite 8",Arrondissement de Mercier - Hochelaga-Maisonneuve,2827.0,162300.0,2018-07-11,845.0,2642 Rue Aylwin,8,2642,Rue Aylwin
27,"470 Rue Saint-Alexis, Suite 102",Arrondissement de Ville-Marie,1997.0,163900.0,2018-07-11,843.0,470 Rue Saint-Alexis,102,470,Rue Saint-Alexis
29,"268 Avenue Lanthier, STAT. EXT. #8",,6161.0,10500.0,2018-07-11,,268 Avenue Lanthier,8,268,Avenue Lanthier
36,"1re Avenue, RANGEMENT",Arrondissement de Rosemont - La Petite-Patrie,3160.0,28300.0,2018-07-11,844.0,1re Avenue,,,1re Avenue
43,"1420 Rue Sherbrooke Ouest, Suite 803",Arrondissement de Ville-Marie,6260.0,2180000.0,2018-07-11,843.0,1420 Rue Sherbrooke Ouest,803,1420,Rue Sherbrooke Ouest
64,"Boulevard LaSalle, ARR",Arrondissement de Verdun,1099.0,0.0,2018-07-11,819.0,Boulevard LaSalle,,,Boulevard LaSalle
74,"5417-5431 Rue Garnier, STAT.",Arrondissement du Plateau-Mont-Royal,2684.0,18300.0,2018-07-11,842.0,5417-5431 Rue Garnier,,5417-5431,Rue Garnier


# process addresses with no comma 

In [71]:
full_address_nocomma = property_assess[property_assess['full_address'].apply(lambda x: ',' not in x)]

In [81]:
temp = full_address_nocomma['full_address'].apply(lambda x: get_number(x))

In [82]:
full_address_nocomma.loc[:,'no.'] = temp.apply(lambda x: x[0])
full_address_nocomma.loc[:,'street'] = temp.apply(lambda x: x[1])

In [83]:
full_address_nocomma.loc[:,'apt'] = np.nan

In [84]:
full_address_nocomma

Unnamed: 0,full_address,borough,neighbourhood_unit_number,taxable_value_of_building,retrieved_at,muncipality_id,no.,street,apt
0,6937-6951 Avenue Mousseau,Arrondissement d'Anjou,709.0,315200.0,2018-07-11,815.0,6937-6951,Avenue Mousseau,
1,Boulevard Henri-Bourassa Est,Arrondissement de Rivière-des-Prairies - Point...,1937.0,483700.0,2018-07-11,846.0,,Boulevard Henri-Bourassa Est,
2,9229 Rue Pierre-Bonne,Arrondissement de Rivière-des-Prairies - Point...,1928.0,5500000.0,2018-07-11,846.0,9229,Rue Pierre-Bonne,
3,4330-4336 Rue de Verdun,Arrondissement de Verdun,1088.0,402500.0,2018-07-11,819.0,4330-4336,Rue de Verdun,
4,438-440 Rue Saint-Pierre,Arrondissement de Ville-Marie,2042.0,2945700.0,2018-07-11,843.0,438-440,Rue Saint-Pierre,
6,901 Rue Maheu,Arrondissement de Saint-Laurent,1354.0,606500.0,2018-07-11,827.0,901,Rue Maheu,
7,2575 Rue Remembrance,Arrondissement de Lachine,3651.0,613000.0,2018-07-11,823.0,2575,Rue Remembrance,
8,8201 Place Marien,,1750.0,4800000.0,2018-07-11,,8201,Place Marien,
9,7052 Croissant de la Berge,Arrondissement d'Anjou,719.0,570600.0,2018-07-11,815.0,7052,Croissant de la Berge,
10,2841 Rue Lake,,3744.0,471100.0,2018-07-11,,2841,Rue Lake,


# concadinate two

In [85]:
prop_main = pd.concat([full_address2,full_address_nocomma]).sort_index()
len(prop_main)

484494

In [161]:
prop_main['street'] = prop_main['street'].apply(lambda x:x.replace('LOT','').replace('ARR.','').rstrip(', '))

In [86]:
prop_main

Unnamed: 0,apt,borough,full_address,muncipality_id,neighbourhood_unit_number,no.,retrieved_at,street,street_no.,taxable_value_of_building
0,,Arrondissement d'Anjou,6937-6951 Avenue Mousseau,815.0,709.0,6937-6951,2018-07-11,Avenue Mousseau,,315200.0
1,,Arrondissement de Rivière-des-Prairies - Point...,Boulevard Henri-Bourassa Est,846.0,1937.0,,2018-07-11,Boulevard Henri-Bourassa Est,,483700.0
2,,Arrondissement de Rivière-des-Prairies - Point...,9229 Rue Pierre-Bonne,846.0,1928.0,9229,2018-07-11,Rue Pierre-Bonne,,5500000.0
3,,Arrondissement de Verdun,4330-4336 Rue de Verdun,819.0,1088.0,4330-4336,2018-07-11,Rue de Verdun,,402500.0
4,,Arrondissement de Ville-Marie,438-440 Rue Saint-Pierre,843.0,2042.0,438-440,2018-07-11,Rue Saint-Pierre,,2945700.0
5,,Arrondissement de Saint-Léonard,"Rue Jean-Talon Est, ARR",816.0,1276.0,,2018-07-11,Rue Jean-Talon Est,Rue Jean-Talon Est,901000.0
6,,Arrondissement de Saint-Laurent,901 Rue Maheu,827.0,1354.0,901,2018-07-11,Rue Maheu,,606500.0
7,,Arrondissement de Lachine,2575 Rue Remembrance,823.0,3651.0,2575,2018-07-11,Rue Remembrance,,613000.0
8,,,8201 Place Marien,,1750.0,8201,2018-07-11,Place Marien,,4800000.0
9,,Arrondissement d'Anjou,7052 Croissant de la Berge,815.0,719.0,7052,2018-07-11,Croissant de la Berge,,570600.0


In [171]:
house_main = pd.concat([openhouse_2, openhouse_3]).sort_index()

In [172]:
np.unique(house_main.muncipality_id)

array([814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826,
       827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839,
       840, 841, 842, 843, 844, 845, 846, 943])

In [173]:
prop_main_withID = prop_main[~prop_main['muncipality_id'].isna()]

len(prop_main_withID)

409692

In [93]:
start = time.time()

pc = recordlinkage.BlockIndex(on=['muncipality_id'])

pairs = pc.index(house_main,prop_main_withID[:50000])

print 'Index takes:',time.time()-start

print len(pairs)

Index takes: 93.7948889732
91380092


In [95]:
type(pairs)

pandas.core.indexes.multi.MultiIndex

In [174]:
L = [j for i in prop_main.street.apply(lambda x:x.split(' ')) for j in i]

c = collections.Counter(L)

prop_main.loc[:,'street_ind'] = prop_main.street.apply(lambda x: x.split(' ')
                                                       [np.argsort([c[i] for i in x.split(' ')])[0]])

prop_main_withID = prop_main[~prop_main['muncipality_id'].isna()]

In [175]:
L = [j for i in house_main.street.apply(lambda x:x.split(' ')) for j in i]

c = collections.Counter(L)

house_main.loc[:,'street_ind'] = house_main.street.apply(lambda x: x.split(' ')
                                                         [np.argsort([c[i] for i in x.split(' ')])[0]])

#muncipality_name[muncipality_id] = c.most_common()[0][0]

In [206]:
start = time.time()

pc = recordlinkage.SortedNeighbourhoodIndex(on='street_ind')

pairs = pc.index(house_main,prop_main)

print 'Index takes:',time.time()-start

print len(pairs)

Index takes: 38.2076702118
31994934


In [221]:
start = time.time()

pc = recordlinkage.SortedNeighbourhoodIndex(on='street_ind')

pairs2 = pc.index(house_main,prop_main_withID)

print 'Index takes:',time.time()-start

print len(pairs2)

Index takes: 53.7226970196
29785674


In [177]:
compare_house = recordlinkage.Compare()

compare_house.string('street','street', method='jarowinkler', threshold=0.95, label='street')

compare_house.exact('apt','apt',label='apt')

compare_house.exact('No.','No.',label='No.')

<Compare>

In [207]:
prop_main.rename(columns={'no.':'No.'},inplace=True)

In [214]:
house_main.to_csv('house.csv',encoding='utf-8')

In [215]:
prop_main.to_csv('prop.csv',encoding='utf-8')

In [216]:
house_main = pd.read_csv('house.csv',encoding='utf-8')

prop_main = pd.read_csv('prop.csv',encoding='utf-8')

In [217]:
prop_main

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,apt,borough,full_address,muncipality_id,neighbourhood_unit_number,No.,retrieved_at,street,street_no.,taxable_value_of_building,street_ind
0,0,0,,Arrondissement d'Anjou,6937-6951 Avenue Mousseau,815.0,709.0,6937-6951,2018-07-11,Avenue Mousseau,,315200.0,Mousseau
1,1,1,,Arrondissement de Rivière-des-Prairies - Point...,Boulevard Henri-Bourassa Est,846.0,1937.0,,2018-07-11,Boulevard Henri-Bourassa Est,,483700.0,Henri-Bourassa
2,2,2,,Arrondissement de Rivière-des-Prairies - Point...,9229 Rue Pierre-Bonne,846.0,1928.0,9229,2018-07-11,Rue Pierre-Bonne,,5500000.0,Pierre-Bonne
3,3,3,,Arrondissement de Verdun,4330-4336 Rue de Verdun,819.0,1088.0,4330-4336,2018-07-11,Rue de Verdun,,402500.0,Verdun
4,4,4,,Arrondissement de Ville-Marie,438-440 Rue Saint-Pierre,843.0,2042.0,438-440,2018-07-11,Rue Saint-Pierre,,2945700.0,Saint-Pierre
5,5,5,,Arrondissement de Saint-Léonard,"Rue Jean-Talon Est, ARR",816.0,1276.0,,2018-07-11,Rue Jean-Talon Est,Rue Jean-Talon Est,901000.0,Jean-Talon
6,6,6,,Arrondissement de Saint-Laurent,901 Rue Maheu,827.0,1354.0,901,2018-07-11,Rue Maheu,,606500.0,Maheu
7,7,7,,Arrondissement de Lachine,2575 Rue Remembrance,823.0,3651.0,2575,2018-07-11,Rue Remembrance,,613000.0,Remembrance
8,8,8,,,8201 Place Marien,,1750.0,8201,2018-07-11,Place Marien,,4800000.0,Marien
9,9,9,,Arrondissement d'Anjou,7052 Croissant de la Berge,815.0,719.0,7052,2018-07-11,Croissant de la Berge,,570600.0,Berge


In [222]:
start = time.time()

features = compare_house.compute(pairs2, house_main.fillna(value=-1), prop_main_withID.fillna(value=-1))

print 'Take',time.time()-start

Take 1148.7806201


In [223]:
indexs = features[features.sum(axis=1)>2]

In [None]:
index

In [200]:
np.unique(indexs.index.get_level_values(0))

array([    1,     2,     4, ..., 34646, 34653, 34655])

In [202]:
house_main.loc[0]

No.                                      7
address                 [7, Place Raymond]
apt                                    NaN
category                             House
latitude                           45.4438
listing_price                    1.199e+06
longitude                         -73.8923
muncipality_id                         830
neighbourhood     Neighbourhood North West
retrieved_at                    2018-07-25
sales_type                            sale
street                       Place Raymond
year_built                            1999
street_ind                         Raymond
Name: 0, dtype: object

In [204]:
property_assess[property_assess.full_address == '7 Place Raymond']

Unnamed: 0,full_address,borough,neighbourhood_unit_number,taxable_value_of_building,retrieved_at,muncipality_id
323305,7 Place Raymond,,207.0,732400.0,2018-07-12,


In [220]:
prop_main_withID = prop_main[~prop_main['muncipality_id'].isna()]