In [1]:
import pandas as pd

df = pd.read_csv('unique_shops_list.csv')

df

Unnamed: 0,Shop Names
0,#RAVE KARAOKE
1,#THEBACKYARDBAKERSHQ
2,$KY MONEY CHANGER
3,% ARABICA
4,& OTHER STORIES
...,...
4926,旺爐 CITY HOT POT
4927,牛一嘴 NÚODLE
4928,補.沈希SHEN XI IMPERIAL SOUP
4929,霸王茶姬 CHAGEE


In [2]:
import difflib
from collections import defaultdict

def find_similar_names_from_df(df, column_name, threshold=0.8):
    """
    Find similar shop names within a DataFrame based on a similarity threshold.

    Args:
    - df (DataFrame): The DataFrame containing shop names.
    - column_name (str): The name of the column in the DataFrame that contains the shop names.
    - threshold (float): The similarity threshold for considering names as similar.

    Returns:
    - dict: A dictionary where keys are shop names and values are lists of similar names.
    """
    shop_names = df[column_name].unique()  # Extract unique shop names from the specified column
    similar_names = defaultdict(list)
    
    for name1 in shop_names:
        for name2 in shop_names:
            if name1 != name2:
                similarity = difflib.SequenceMatcher(None, name1, name2).ratio()
                if similarity > threshold:
                    similar_names[name1].append(name2)
    return similar_names

In [3]:
similar_shop_names = find_similar_names_from_df(df, 'Shop Names', threshold=0.8)

# Printing similar shop names
for name, similarities in similar_shop_names.items():
    print(f"{name} is similar to: {similarities}")

#THEBACKYARDBAKERSHQ is similar to: ['THE BACKYARD BAKERS']
$KY MONEY CHANGER is similar to: ['FAUZY MONEY CHANGER']
12HRS MONEY CHANGER is similar to: ['SINO MONEY CHANGER']
133 BEAUTY SALON is similar to: ['TAY BEAUTY SALON']
21 MEDICAL AESTHETICS CLINIC is similar to: ['APAX MEDICAL & AESTHETICS CLINIC', 'EDWIN LIM MEDICAL AESTHETIC CLINIC', 'V MEDICAL AESTHETICS & LASER CLINIC', 'VALE MEDICAL AESTHETICS CLINIC']
2NE HAIR STUDIO is similar to: ['DE HAIR STUDIO', 'ELAN HAIR STUDIO', 'IMAGE HAIR STUDIO']
3T MOBILE STORY is similar to: ['3TMOBILESTORY']
3TMOBILESTORY is similar to: ['3T MOBILE STORY']
55 FASHION is similar to: ['E FASHION']
7 ELEVEN is similar to: ['7-ELEVEN']
7-ELEVEN is similar to: ['7 ELEVEN']
8 TREASURES is similar to: ['8TREASURES']
8TREASURES is similar to: ['8 TREASURES']
@AT TEA is similar to: ['AT TEA']
A.M AESTHETICS is similar to: ['MODE AESTHETICS', 'MOZA AESTHETICS']
A.R. MONEY EXCHANGE is similar to: ['A.R.J. MONEY CHANGER']
A.R.J. MONEY CHANGER is simila

In [6]:
similar_shop_df = pd.DataFrame(similar_shop_names.items(), columns=['Shop Name', 'Similar Names'])
similar_shop_df


Unnamed: 0,Shop Name,Similar Names
0,#THEBACKYARDBAKERSHQ,[THE BACKYARD BAKERS]
1,$KY MONEY CHANGER,[FAUZY MONEY CHANGER]
2,12HRS MONEY CHANGER,[SINO MONEY CHANGER]
3,133 BEAUTY SALON,[TAY BEAUTY SALON]
4,21 MEDICAL AESTHETICS CLINIC,"[APAX MEDICAL & AESTHETICS CLINIC, EDWIN LIM M..."
...,...,...
653,ZARA,[OZARA]
654,ZEN BEAUTY,"[EBEAUTY, EX BEAUTY, VENUS BEAUTY]"
655,ZHANG LIANG MALA TANG,[TANG TANG MALATANG]
656,ZR FASHION,"[E FASHION, MARY FASHION]"


In [7]:
similar_shop_df.to_csv("similar_shop_data.csv", index=False)


# Let's try converting it into a CSV

In [None]:
import json

# Read the data from the file
with open('merged_data_restructured.json', 'r') as f:
    all_malls_data = json.load(f)



In [None]:
df = pd.DataFrame(all_malls_data)
df

Unnamed: 0,Clarke Quay Central,Holland Village Shopping Mall,Square 2,Greenwich V,HillV2,Icon Village,Junction 10,Far East Square,Katong V,Lucky Chinatown,...,Ngee Ann City,Orchard Central,Bugis+,The Star Vista,Funan,Far East Plaza,Queensway Shopping Centre,Katong SC,leisure park,100 AM
name,Clarke Quay Central,Holland Village Shopping Mall,Square 2,Greenwich V,HillV2,Icon Village,Junction 10,Far East Square,Katong V,Lucky Chinatown,...,Ngee Ann City,Orchard Central,Bugis+,The Star Vista,Funan,Far East Plaza,Queensway Shopping Centre,Katong SC,leisure park,100 AM
shops,"[1ST EYE CARE, 3DSENSE MEDIA SCHOOL, A.M AESTH...","[ALLSMILES DENTAL CARE (COMING SOON), ART WORK...","[20DB DIGISOUND, 7-ELEVEN, AILISA WELLNESS, AL...","[7-ELEVEN, ANYTIME FITNESS, AWFULLY CHOCOLATE,...","[7-ELEVEN, ANGLICAN SENIOR CENTRE, ANYTIME FIT...","[123 ZÔ, 7-ELEVEN, AERIAL ARTS COLLECTIVE, ALI...","[A KITCHEN, APAX MEDICAL & AESTHETICS CLINIC, ...","[#THEBACKYARDBAKERSHQ, 88 HONG KONG ROAST MEAT...","[ART VILLAGE GALLERY & STUDIO, ART ZONE, BAO Z...","[AI JEWELLERY, ALIVE TATTOO STUDIO, CHANG BAI ...",...,"[Watsons, Guardian, Spectacle Hut, KOI The, MO...","[Uniqlo, Genki Sushi, Q&M Dental Surgery, Gadg...","[Starbucks, KOI The, LiHO Tea, OWNDAYS, 7-Elev...","[Watsons, Guardian, KOI The, LiHO Tea, Subway,...","[Watsons, Guardian, LAC, Old Chang Kee, Ya Kun...","[Watsons, Old Chang Kee, Ya Kun Kaya Toast, Li...","[LiHO Tea, McDonald's, Levi's, Glimpse, Under ...","[Standard Photo, Singapore Pools, Teo Heng KTV...","[Athletic United Taekwondo, Song Yue Taiwan Cu...","[Beans.Factory SG, BK Aesthetics Clinic, BK Be..."


In [None]:
df_transposed = df.transpose()
df_transposed


Unnamed: 0,name,shops
Clarke Quay Central,Clarke Quay Central,"[1ST EYE CARE, 3DSENSE MEDIA SCHOOL, A.M AESTH..."
Holland Village Shopping Mall,Holland Village Shopping Mall,"[ALLSMILES DENTAL CARE (COMING SOON), ART WORK..."
Square 2,Square 2,"[20DB DIGISOUND, 7-ELEVEN, AILISA WELLNESS, AL..."
Greenwich V,Greenwich V,"[7-ELEVEN, ANYTIME FITNESS, AWFULLY CHOCOLATE,..."
HillV2,HillV2,"[7-ELEVEN, ANGLICAN SENIOR CENTRE, ANYTIME FIT..."
...,...,...
Far East Plaza,Far East Plaza,"[Watsons, Old Chang Kee, Ya Kun Kaya Toast, Li..."
Queensway Shopping Centre,Queensway Shopping Centre,"[LiHO Tea, McDonald's, Levi's, Glimpse, Under ..."
Katong SC,Katong SC,"[Standard Photo, Singapore Pools, Teo Heng KTV..."
leisure park,leisure park,"[Athletic United Taekwondo, Song Yue Taiwan Cu..."


In [None]:
df_transposed.to_csv("all_malls_data.csv", index=False)

# ChatGPT asked me to used NLP, so here I am. What I wwant to do is a final check that the shop names are matching and there aren't shop names I did not account for

In [10]:
import json
from fuzzywuzzy import process
from collections import defaultdict

def find_similar_names(json_file_path):
    with open(json_file_path, 'r') as file:
        data = json.load(file)

    # Flatten the list of all shop names
    all_shop_names = [shop for mall in data.values() for shop in mall['shops']]

    # Dictionary to hold shops and their similar matches
    similar_shops = defaultdict(list)

    # Set a threshold for considering names as similar
    similarity_threshold = 80

    # Compare each shop name against all others to find similarities
    for shop in all_shop_names:
        matches = process.extract(shop, all_shop_names, limit=None)
        for match in matches:
            matched_name, similarity_score = match[0], match[1]
            # If similarity is above the threshold and not the same name, consider them similar
            if similarity_score >= similarity_threshold and matched_name != shop:
                similar_shops[shop].append(matched_name)

    # Filter out shops with no similar matches
    similar_shops = {shop: matches for shop, matches in similar_shops.items() if matches}
    
    return similar_shops

In [11]:
# Replace 'your_json_file_path.json' with the path to your JSON file
similar_shops = find_similar_names('final_updated_malls_data.json')

# Print or process the similar shops as needed
print(similar_shops)


{'1ST EYE CARE': ['ONEDAY HEAD CARE & WELLNESS', 'ALLSMILES DENTAL CARE', "NUH CHILDREN'S URGENT CARE CLINIC @ BUKIT PANJANG", 'O.H EYE', 'FAMILY CARE PHYSIOTHERAPY CLINIC', 'CLARITI - HEARING CARE PROFESSIONALS', 'SHINAGAWA EYE CENTRE', 'PURETE FACE & BODY CARE', '1ST PRIZE HOME DIY', 'EYE CLINIC AND OPTOMETRY CENTRE', 'EUPHORIA BEAUTY CARE', 'ONCE CARE REFLEXOLOGY & WELLNESS'], '3DSENSE MEDIA SCHOOL': ['JAN & ELLY ENGLISH LANGUAGE SCHOOL', 'BERRIES WORLD OF LEARNING SCHOOL', 'BERRIES WORLD OF LEARNING SCHOOL', 'MY ART SCHOOL', 'SOPHIA INTERNATIONAL BEAUTY SCHOOL'], 'A.M AESTHETICS': ['AURA MEDICAL AESTHETICS', 'FUR A VETRESKA PET STORE', 'A KITCHEN', 'APAX MEDICAL & AESTHETICS CLINIC', 'ELITE SKIN AESTHETICS', 'M&G AESTHETICS BODY SCULPTING WELLNESS', 'Q & M', 'NEUGLOW THE AESTHETICS DOCTORS', 'V MEDICAL AESTHETICS & LASER CLINIC', '21 MEDICAL AESTHETICS CLINIC', 'CITI BEAUTY AESTHETICS', 'M.G.M FOREIGN EXCHANGE', 'J & A BEAUTY NAIL PALOUR', 'M & Y', 'TONI INTERNATIONAL COLLEGE OF AE

In [13]:
len(similar_shops)

3979

In [14]:
df_similar_shops = pd.DataFrame(similar_shops.items(), columns=['Shop Name', 'Similar Names'])

df_similar_shops

Unnamed: 0,Shop Name,Similar Names
0,1ST EYE CARE,"[ONEDAY HEAD CARE & WELLNESS, ALLSMILES DENTAL..."
1,3DSENSE MEDIA SCHOOL,"[JAN & ELLY ENGLISH LANGUAGE SCHOOL, BERRIES W..."
2,A.M AESTHETICS,"[AURA MEDICAL AESTHETICS, FUR A VETRESKA PET S..."
3,ADVENTURE 21,"[21 MEDICAL AESTHETICS CLINIC, CLUB 21, YACHT ..."
4,ANGELS & CO. OSSTEM DENTAL IMPLANT CENTRE,"[NEWLIFE DENTAL PRACTICE, ALLSMILES DENTAL CAR..."
...,...,...
3974,THE SPACE KOREAN HAIR SALON,"[SALO, AUBE BEAUTY SALON, E!GHT KOREAN BBQ, EL..."
3975,THE XIANG PAVILION – AUTHENTIC HUNAN CUISINE,"[THE HENG'S JEWELLERY, THE JOY OF TOYS, THE PO..."
3976,URA HOTOTOGISU,"[OTO, OTO, TOG, TOG, OTO, TOG, OTO, TOG, TOG, ..."
3977,XIN HUA TCM THERAPY,"[KJ THERAPY, BAOZHITANG CHINESE MEDICINE AND T..."


In [15]:
df_similar_shops.to_csv("similar_shops_data.csv", index=False)
