## Finding Fake/Duplicate Records

Commonly known as Data Dedupication & Record Linkage

In [1]:
# Download and unzip data
# !wget http://data.insideairbnb.com/united-kingdom/england/london/2023-03-14/data/listings.csv.gz -P ../data/
# !gunzip -kf ../data/listings.csv.gz

In [2]:
# !pip install nltk

In [1]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import random

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/eessnin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/eessnin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from datetime import datetime, date
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
import re
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer # or LancasterStemmer, RegexpStemmer, SnowballStemmer

default_stemmer = PorterStemmer()
default_stopwords = stopwords.words('english') # or any other list of your choice

%matplotlib inline

In [3]:
df = pd.read_csv('../data/listings.csv')

In [4]:
df.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,714569379355913481,https://www.airbnb.com/rooms/714569379355913481,20230314070633,2023-03-14,city scrape,Lovely private bedroom in Muswell Hill.,Take a break and unwind at this peaceful oasis.,,https://a0.muscache.com/pictures/miso/Hosting-...,39009854,...,,,,,f,1,0,1,0,
1,808038970516277767,https://www.airbnb.com/rooms/808038970516277767,20230314070633,2023-03-14,city scrape,Studio Flat Franklin London,Brand New Modern Kitchen<br />Close to Excelle...,,https://a0.muscache.com/pictures/miso/Hosting-...,495977998,...,,,,,t,7,7,0,0,
2,822557738577472503,https://www.airbnb.com/rooms/822557738577472503,20230314070633,2023-03-14,city scrape,PropertyPlug - 2Bed Flat in Edgware SmartTV WiFi,Enjoy easy access to everything from this perf...,,https://a0.muscache.com/pictures/d77957d5-695a...,325629338,...,,,,,t,4,4,0,0,
3,3518856,https://www.airbnb.com/rooms/3518856,20230314070633,2023-03-14,city scrape,Wimbledon Double Bedroom Ensuite,A welcoming and stylish 2 bedroom 2 bathroom f...,,https://a0.muscache.com/pictures/23a18442-fc1d...,187811,...,5.0,3.67,3.67,,f,2,0,2,0,0.05
4,4876550,https://www.airbnb.com/rooms/4876550,20230314070633,2023-03-14,city scrape,Stunning Apartment 2 minutes walk to Tube Station,Luxury Modern Apartment in modern development ...,,https://a0.muscache.com/pictures/miso/Hosting-...,25087384,...,,,,,f,1,1,0,0,


In [5]:
# Since this is a data deduplication problem, we're interested in data uploaded by the host, let's print list of columns to find out those columns
df.shape

(75241, 75)

In [6]:
# Fillna
df['name'].fillna('', inplace=True)
df['description'].fillna('', inplace=True)

In [7]:
# tfidf calculation
text_content = df['name'] + ' ' + df['description']

#remove HTML tags
text_content = text_content.str.replace(r'<[^<>]*>', '', regex=True)

# Create vectorizer
vector = TfidfVectorizer(max_df=0.3,         # drop words that occur in more than X percent of documents
                             #min_df=8,      # only use words that appear at least X times
                             stop_words='english', # remove stop words
                             lowercase=True, # Convert everything to lower case 
                             use_idf=True,   # Use idf
                             norm=u'l2',     # Normalization
                             smooth_idf=True # Prevents divide-by-zero errors
                            )
tfidf = vector.fit_transform(text_content)

In [12]:
# Request function : search the top_n articles from a request ( request = string)
def search(tfidf_matrix,model,request, top_n = 5):
    request_transform = model.transform([request])
    similarity = np.dot(request_transform,np.transpose(tfidf_matrix))
    x = np.array(similarity.toarray()[0])
    indices=np.argsort(x)[-5:][::-1]
    return indices, x[indices]

# Find similar : get the top_n articles similar to an article 
def find_similar(tfidf_matrix, index, top_n = 5):
    cosine_similarities = linear_kernel(tfidf_matrix[index:index+1], tfidf_matrix).flatten()
    related_docs_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index]
    return [index for index in related_docs_indices][0:top_n]    

# Print the result
def print_result(ind, request_content, indices, scores, X, min_score = 0.5):
    print('\nSearch : ' + str(ind) + ', ' + request_content)
    print('\nBest Results :')
    no_match = True
    for i in range(len(indices)):
        if (scores[i] > min_score) and (ind != indices[i]):
            no_match = False
            print('\nid = {0:5d} - {1} -> {2}'.format(indices[i], scores[i], X.loc[indices[i]]))
    if no_match:
        print('None')

In [17]:
# Minimum similarity score to qualify record as a match
MIN_MATCH_THRESHOLD = 0.6
# Number of tests to conduct 
NUM_RANDOM_TESTS = 15
test_indices = random.sample(range(len(text_content)), NUM_RANDOM_TESTS)
for ind in test_indices:
    print("---------------------------------------")
    request = text_content[ind]
    result, scores = search(tfidf,vector, request, top_n = 5)
    print_result(ind, request, result, scores, text_content, min_score=MIN_MATCH_THRESHOLD)
    print("---------------------------------------")
    

---------------------------------------

Search : 13829, Central London, Big Room W/ Garden Modern, cozy house in central London, in green estate with trees and parks close by.Massive spacious room. Patio doors opens to large garden space which is rarely used.Great location 5/10 minute bus or tube to Westminster/SohoCity or ClaphamThe spaceThe room being rented is the room overlooking the garden as shown.  The room is on the ground floor where the other 3 bedrooms are upstairs.You will be right next to a downstairs toilet which is handy as the second toilet and shower room is upstairs.Guest accessYou can use kitchen and the bath/shower room and toilet upstairs. There is a toilet downstairs next to the room you'll be staying in.

Best Results :
None
---------------------------------------
---------------------------------------

Search : 1188, Cosy Room W/ Home Music Studio Setup This is a private room for artists looking for a cosy space w/facilities to hold small music sessions and re


Search : 12974, Beautiful and clean room in Hackney Beautiful bedroom available in a terraced house located on a quiet street in the best spot of Hackney just in between London Fields and De Beauvoir. Wooden floors, new bed and lots of light.The spaceThe bedroom has a large south-facing window which ensures plenty of day light. It has just been refurbished with wooden floor. Furniture are brand new (bed, mattress, bedside table, desk and a large armchair). It comes with a private bathroom  just next door (where you can enjoy a hot tub)!Other things to notePlease let us know if you have any question or if you need any info.  we will be happy to help and share with you some tips on bars/restaurant and the local area.

Best Results :
None
---------------------------------------
---------------------------------------

Search : 32269, A cosy double bedroom (Olympic Park and Westfield) Looking for a comfortable place to stay not far from Central London? Then your search ends here. We offer