In [197]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [198]:
df = pd.read_csv('appartments.csv').drop(22)

In [199]:
df.head()

Unnamed: 0,PropertyName,PropertySubName,NearbyLocations,LocationAdvantages,Link,PriceDetails,TopFacilities
0,Smartworld One DXP,"2, 3, 4 BHK Apartment in Sector 113, Gurgaon","['Bajghera Road', 'Palam Vihar Halt', 'DPSG Pa...","{'Bajghera Road': '800 Meter', 'Palam Vihar Ha...",https://www.99acres.com/smartworld-one-dxp-sec...,"{'2 BHK': {'building_type': 'Apartment', 'area...","['Swimming Pool', 'Salon', 'Restaurant', 'Spa'..."
1,M3M Crown,"3, 4 BHK Apartment in Sector 111, Gurgaon","['DPSG Palam Vihar Gurugram', 'The NorthCap Un...","{'DPSG Palam Vihar Gurugram': '1.4 Km', 'The N...",https://www.99acres.com/m3m-crown-sector-111-g...,"{'3 BHK': {'building_type': 'Apartment', 'area...","['Bowling Alley', 'Mini Theatre', 'Manicured G..."
2,Adani Brahma Samsara Vilasa,"Land, 3, 4 BHK Independent Floor in Sector 63,...","['AIPL Business Club Sector 62', 'Heritage Xpe...","{'AIPL Business Club Sector 62': '2.7 Km', 'He...",https://www.99acres.com/adani-brahma-samsara-v...,{'3 BHK': {'building_type': 'Independent Floor...,"['Terrace Garden', 'Gazebo', 'Fountain', 'Amph..."
3,Sobha City,"2, 3, 4 BHK Apartment in Sector 108, Gurgaon","['The Shikshiyan School', 'WTC Plaza', 'Luxus ...","{'The Shikshiyan School': '2.9 KM', 'WTC Plaza...",https://www.99acres.com/sobha-city-sector-108-...,"{'2 BHK': {'building_type': 'Apartment', 'area...","['Swimming Pool', 'Volley Ball Court', 'Aerobi..."
4,Signature Global City 93,"2, 3 BHK Independent Floor in Sector 93 Gurgaon","['Pranavananda Int. School', 'DLF Site central...","{'Pranavananda Int. School': '450 m', 'DLF Sit...",https://www.99acres.com/signature-global-city-...,{'2 BHK': {'building_type': 'Independent Floor...,"['Mini Theatre', 'Doctor on Call', 'Concierge ..."


In [200]:
df.shape

(246, 7)

In [201]:
df['PropertySubName'][0]

'2, 3, 4 BHK Apartment in Sector 113, Gurgaon'

In [202]:
df['NearbyLocations'][0]

"['Bajghera Road', 'Palam Vihar Halt', 'DPSG Palam Vihar', 'Park Hospital', 'Gurgaon Railway Station']"

In [203]:
df['LocationAdvantages'][0] # most use ful, we can use  this data for recommender

"{'Bajghera Road': '800 Meter', 'Palam Vihar Halt': '2.5 KM', 'DPSG Palam Vihar': '3.1 KM', 'Park Hospital': '3.1 KM', 'Gurgaon Railway Station': '4.9 KM', 'The NorthCap University': '5.4 KM', 'Dwarka Expy': '1.2 KM', 'Hyatt Place Gurgaon Udyog Vihar': '7.7 KM', 'Dwarka Sector 21, Metro Station': '7.2 KM', 'Pacific D21 Mall': '7.4 KM', 'Indira Gandhi International Airport': '14.7 KM', 'Hamoni Golf Camp': '6.2 KM', 'Fun N Food Waterpark': '8.8 KM', 'Accenture DDC5': '9 KM'}"

In [204]:
df['PriceDetails'][0] # use ful for recommender

"{'2 BHK': {'building_type': 'Apartment', 'area_type': 'Carpet Area', 'area': '1,370 sq.ft.', 'price-range': '₹ 2 - 2.4 Cr'}, '3 BHK': {'building_type': 'Apartment', 'area_type': 'Carpet Area', 'area': '1,850 - 2,050 sq.ft.', 'price-range': '₹ 2.25 - 3.59 Cr'}, '4 BHK': {'building_type': 'Apartment', 'area_type': 'Carpet Area', 'area': '2,600 sq.ft.', 'price-range': '₹ 3.24 - 4.56 Cr'}}"

In [205]:
df['TopFacilities'][0] # useful

"['Swimming Pool', 'Salon', 'Restaurant', 'Spa', 'Cafeteria', 'Sun Deck', '24x7 Security', 'Club House', 'Gated Community']"

In [206]:
def extract_to_list(data):
    return re.findall(r"'(.*?)'",data)

df['TopFacilities'] = df['TopFacilities'].apply(extract_to_list)

In [207]:
df[['LocationAdvantages','PriceDetails','TopFacilities']]

Unnamed: 0,LocationAdvantages,PriceDetails,TopFacilities
0,"{'Bajghera Road': '800 Meter', 'Palam Vihar Ha...","{'2 BHK': {'building_type': 'Apartment', 'area...","[Swimming Pool, Salon, Restaurant, Spa, Cafete..."
1,"{'DPSG Palam Vihar Gurugram': '1.4 Km', 'The N...","{'3 BHK': {'building_type': 'Apartment', 'area...","[Bowling Alley, Mini Theatre, Manicured Garden..."
2,"{'AIPL Business Club Sector 62': '2.7 Km', 'He...",{'3 BHK': {'building_type': 'Independent Floor...,"[Terrace Garden, Gazebo, Fountain, Amphitheatr..."
3,"{'The Shikshiyan School': '2.9 KM', 'WTC Plaza...","{'2 BHK': {'building_type': 'Apartment', 'area...","[Swimming Pool, Volley Ball Court, Aerobics Ce..."
4,"{'Pranavananda Int. School': '450 m', 'DLF Sit...",{'2 BHK': {'building_type': 'Independent Floor...,"[Mini Theatre, Doctor on Call, Concierge Servi..."
...,...,...,...
242,"{'Sector 42-43 Metro Station': '1.8 Km', 'Para...","{'2 BHK': {'building_type': 'Apartment', 'area...","[Swimming Pool, Medical Centre, Laundry, Salon..."
243,{'Aarvy Healthcare Super Speciality': '1.8 KM'...,"{'1 BHK': {'building_type': 'Apartment', 'area...","[Shopping Centre, Community Hall, 24x7 Securit..."
244,"{'Dwarka Expressway': '1.2 Km', 'S N Internati...","{'2 BHK': {'building_type': 'Apartment', 'area...","[Bus Shelter, Swimming Pool, Business Lounge, ..."
245,"{'Suncity School': '0.2 Km', 'Gurugram Road': ...","{'2 BHK': {'building_type': 'Apartment', 'area...","[Swimming Pool, Card Room, Piped Gas, Pool Tab..."


In [208]:
df['FacilitiesStr'] = df['TopFacilities'].apply(' '.join)

In [209]:
df['FacilitiesStr'][0]

'Swimming Pool Salon Restaurant Spa Cafeteria Sun Deck 24x7 Security Club House Gated Community'

In [210]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english',ngram_range=(1,2))

In [211]:
tfidf_vectorizer

In [212]:
tfidf_vector = tfidf_vectorizer.fit_transform(df['FacilitiesStr'])

In [213]:
tfidf_vector.toarray()[0]

array([0.        , 0.        , 0.        , 0.18809342, 0.18809342,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [214]:
cosine_sim1 = cosine_similarity(tfidf_vector, tfidf_vector)

In [215]:
cosine_sim1.shape #  first target accomplish

(246, 246)

In [216]:
# df[['PropertyName','PriceDetails']]['PriceDetails'][0]

In [217]:
idx = df.index[df['PropertyName'] == 'Ace Palm Floors'].tolist()[0]

In [218]:
sim_scores = list(enumerate(cosine_sin1[idx]))

In [219]:
sim_scores = sorted(sim_scores, key=lambda x:x[1], reverse=True)

In [220]:
# property_indices = [i[0] for i in sim_scores]
    
# recommendations_df = pd.DataFrame({
#         'PropertyName': df['PropertyName'].iloc[property_indices],
#         'SimilarityScore': sim_scores
# })

In [221]:
def recommend_properties(property_name):
    # Get the index of the property that matches the name
    idx = df.index[df['PropertyName'] == property_name].tolist()[0]

    # Get the pairwise similarity scores with that property
    sim_scores = list(enumerate(cosine_sim1[idx]))

    # Sort the properties based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar properties
    sim_scores = sim_scores[1:6]

    # Get the property indices
    property_indices = [i[0] for i in sim_scores]
    
    recommendations_df = pd.DataFrame({
        'PropertyName': df['PropertyName'].iloc[property_indices],
        'SimilarityScore': sim_scores
    })

    # Return the top 10 most similar properties
    return recommendations_df

In [222]:
recommend_properties("DLF The Arbour")

Unnamed: 0,PropertyName,SimilarityScore
64,Ace Palm Floors,"(63, 0.45293820624419556)"
217,Yashika 104,"(216, 0.41996063229267827)"
93,JMS The Nation,"(92, 0.4166584649363288)"
154,India Rashtra,"(153, 0.39895423468019414)"
0,Smartworld One DXP,"(0, 0.388850461994329)"


In [223]:
# start working on Price Detail
df['PriceDetails']

0      {'2 BHK': {'building_type': 'Apartment', 'area...
1      {'3 BHK': {'building_type': 'Apartment', 'area...
2      {'3 BHK': {'building_type': 'Independent Floor...
3      {'2 BHK': {'building_type': 'Apartment', 'area...
4      {'2 BHK': {'building_type': 'Independent Floor...
                             ...                        
242    {'2 BHK': {'building_type': 'Apartment', 'area...
243    {'1 BHK': {'building_type': 'Apartment', 'area...
244    {'2 BHK': {'building_type': 'Apartment', 'area...
245    {'2 BHK': {'building_type': 'Apartment', 'area...
246    {'2 BHK': {'building_type': 'Apartment', 'area...
Name: PriceDetails, Length: 246, dtype: object

In [224]:
import pandas as pd
import json

# Load the dataset
df_appartments = pd.read_csv('appartments.csv').drop(22)

# Function to parse and extract the required features from the PriceDetails column
def refined_parse_modified_v2(detail_str):
    try:
        details = json.loads(detail_str.replace("'", "\""))
    except:
        return {}

    extracted = {}
    for bhk, detail in details.items():
        # Extract building type
        extracted[f'building type_{bhk}'] = detail.get('building_type')

        # Parsing area details
        area = detail.get('area', '')
        area_parts = area.split('-')
        if len(area_parts) == 1:
            try:
                value = float(area_parts[0].replace(',', '').replace(' sq.ft.', '').strip())
                extracted[f'area low {bhk}'] = value
                extracted[f'area high {bhk}'] = value
            except:
                extracted[f'area low {bhk}'] = None
                extracted[f'area high {bhk}'] = None
        elif len(area_parts) == 2:
            try:
                extracted[f'area low {bhk}'] = float(area_parts[0].replace(',', '').replace(' sq.ft.', '').strip())
                extracted[f'area high {bhk}'] = float(area_parts[1].replace(',', '').replace(' sq.ft.', '').strip())
            except:
                extracted[f'area low {bhk}'] = None
                extracted[f'area high {bhk}'] = None

        # Parsing price details
        price_range = detail.get('price-range', '')
        price_parts = price_range.split('-')
        if len(price_parts) == 2:
            try:
                extracted[f'price low {bhk}'] = float(price_parts[0].replace('₹', '').replace(' Cr', '').replace(' L', '').strip())
                extracted[f'price high {bhk}'] = float(price_parts[1].replace('₹', '').replace(' Cr', '').replace(' L', '').strip())
                if 'L' in price_parts[0]:
                    extracted[f'price low {bhk}'] /= 100
                if 'L' in price_parts[1]:
                    extracted[f'price high {bhk}'] /= 100
            except:
                extracted[f'price low {bhk}'] = None
                extracted[f'price high {bhk}'] = None

    return extracted
# Apply the refined parsing and generate the new DataFrame structure
data_refined = []

for _, row in df_appartments.iterrows():
    features = refined_parse_modified_v2(row['PriceDetails'])
    
    # Construct a new row for the transformed dataframe
    new_row = {'PropertyName': row['PropertyName']}
    
    # Populate the new row with extracted features
    for config in ['1 BHK', '2 BHK', '3 BHK', '4 BHK', '5 BHK', '6 BHK', '1 RK', 'Land']:
        new_row[f'building type_{config}'] = features.get(f'building type_{config}')
        new_row[f'area low {config}'] = features.get(f'area low {config}')
        new_row[f'area high {config}'] = features.get(f'area high {config}')
        new_row[f'price low {config}'] = features.get(f'price low {config}')
        new_row[f'price high {config}'] = features.get(f'price high {config}')
    
    data_refined.append(new_row)

df_final_refined_v2 = pd.DataFrame(data_refined).set_index('PropertyName')
df_final_refined_v2

Unnamed: 0_level_0,building type_1 BHK,area low 1 BHK,area high 1 BHK,price low 1 BHK,price high 1 BHK,building type_2 BHK,area low 2 BHK,area high 2 BHK,price low 2 BHK,price high 2 BHK,...,building type_1 RK,area low 1 RK,area high 1 RK,price low 1 RK,price high 1 RK,building type_Land,area low Land,area high Land,price low Land,price high Land
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Smartworld One DXP,,,,,,Apartment,1370.0,1370.0,2.0000,2.40,...,,,,,,,,,,
M3M Crown,,,,,,,,,,,...,,,,,,,,,,
Adani Brahma Samsara Vilasa,,,,,,,,,,,...,,,,,,,500.0,4329.0,2.05,41.13
Sobha City,,,,,,Apartment,1381.0,1692.0,1.5500,3.21,...,,,,,,,,,,
Signature Global City 93,,,,,,Independent Floor,981.0,1118.0,0.9301,1.06,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DLF Princeton Estate,,,,,,Apartment,964.0,964.0,,,...,,,,,,,,,,
Pyramid Urban Homes 2,Apartment,335.0,398.0,23.45,0.2786,Apartment,500.0,625.0,,,...,,,,,,,,,,
Satya The Hermitage,,,,,,Apartment,1450.0,1450.0,,,...,,,,,,,,,,
BPTP Spacio,,,,,,Apartment,1000.0,1079.0,,,...,,,,,,,,,,


In [225]:
df_final_refined_v2['building type_Land'] = df_final_refined_v2['building type_Land'].replace({'':'Land'})

In [226]:
df_final_refined_v2['building type_Land']

PropertyName
Smartworld One DXP             None
M3M Crown                      None
Adani Brahma Samsara Vilasa    Land
Sobha City                     None
Signature Global City 93       None
                               ... 
DLF Princeton Estate           None
Pyramid Urban Homes 2          None
Satya The Hermitage            None
BPTP Spacio                    None
SS The Coralwood               None
Name: building type_Land, Length: 246, dtype: object

In [227]:
categorical_columns = df_final_refined_v2.select_dtypes(include=['object']).columns.tolist()

In [228]:
ohe_df = pd.get_dummies(df_final_refined_v2, columns=categorical_columns, drop_first=True)

In [229]:
ohe_df.fillna(0, inplace=True)

In [230]:
ohe_df

Unnamed: 0_level_0,area low 1 BHK,area high 1 BHK,price low 1 BHK,price high 1 BHK,area low 2 BHK,area high 2 BHK,price low 2 BHK,price high 2 BHK,area low 3 BHK,area high 3 BHK,...,building type_2 BHK_Independent Floor,building type_2 BHK_Service Apartment,building type_3 BHK_Independent Floor,building type_3 BHK_Service Apartment,building type_3 BHK_Villa,building type_4 BHK_Independent Floor,building type_4 BHK_Villa,building type_5 BHK_Independent Floor,building type_5 BHK_Villa,building type_6 BHK_Villa
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Smartworld One DXP,0.0,0.0,0.00,0.0000,1370.0,1370.0,2.0000,2.40,1850.0,2050.0,...,0,0,0,0,0,0,0,0,0,0
M3M Crown,0.0,0.0,0.00,0.0000,0.0,0.0,0.0000,0.00,1605.0,2170.0,...,0,0,0,0,0,0,0,0,0,0
Adani Brahma Samsara Vilasa,0.0,0.0,0.00,0.0000,0.0,0.0,0.0000,0.00,1800.0,3150.0,...,0,0,1,0,0,1,0,0,0,0
Sobha City,0.0,0.0,0.00,0.0000,1381.0,1692.0,1.5500,3.21,1711.0,2343.0,...,0,0,0,0,0,0,0,0,0,0
Signature Global City 93,0.0,0.0,0.00,0.0000,981.0,1118.0,0.9301,1.06,1235.0,1530.0,...,1,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DLF Princeton Estate,0.0,0.0,0.00,0.0000,964.0,964.0,0.0000,0.00,1127.0,1127.0,...,0,0,0,0,0,0,0,0,0,0
Pyramid Urban Homes 2,335.0,398.0,23.45,0.2786,500.0,625.0,0.0000,0.00,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
Satya The Hermitage,0.0,0.0,0.00,0.0000,1450.0,1450.0,0.0000,0.00,1991.0,1991.0,...,0,0,0,0,0,0,0,0,0,0
BPTP Spacio,0.0,0.0,0.00,0.0000,1000.0,1079.0,0.0000,0.00,1225.0,1865.0,...,0,0,0,0,0,0,0,0,0,0


In [231]:
from sklearn.preprocessing import StandardScaler

std = StandardScaler()

ohe_df_normalized = pd.DataFrame(std.fit_transform(ohe_df), columns=ohe_df.columns, index=ohe_df.index)

In [232]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(ohe_df_normalized)

cosine_sim2.shape # seccond target

(246, 246)

In [233]:
def recommend_properties_with_scores(property_name, top_n=247):
    
    # Get the similarity scores for the property using its name as the index
    sim_scores = list(enumerate(cosine_sim2[ohe_df_normalized.index.get_loc(property_name)]))
    
    # Sort properties based on the similarity scores
    sorted_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the indices and scores of the top_n most similar properties
    top_indices = [i[0] for i in sorted_scores[1:top_n+1]]
    top_scores = [i[1] for i in sorted_scores[1:top_n+1]]
    
    # Retrieve the names of the top properties using the indices
    top_properties = ohe_df_normalized.index[top_indices].tolist()
    
    # Create a dataframe with the results
    recommendations_df = pd.DataFrame({
        'PropertyName': top_properties,
        'SimilarityScore': top_scores
    })
    
    return recommendations_df

# Test the recommender function using a property name
recommend_properties_with_scores('M3M Golf Hills')

Unnamed: 0,PropertyName,SimilarityScore
0,AIPL The Peaceful Homes,0.955462
1,Smartworld One DXP,0.954670
2,Unitech Escape,0.953092
3,M3M Capital,0.951156
4,BPTP Terra,0.943128
...,...,...
240,Golden Park,-0.522391
241,Satya Merano Greens,-0.523660
242,ROF Normanton Park,-0.525129
243,BPTP Green Oaks,-0.525286


In [234]:
# list(enumerate(cosine_sim2[ohe_df_normalized.index.get_loc('M3M Golf Hills')]))

In [235]:
df[['PropertyName','LocationAdvantages']]['LocationAdvantages'][0]

"{'Bajghera Road': '800 Meter', 'Palam Vihar Halt': '2.5 KM', 'DPSG Palam Vihar': '3.1 KM', 'Park Hospital': '3.1 KM', 'Gurgaon Railway Station': '4.9 KM', 'The NorthCap University': '5.4 KM', 'Dwarka Expy': '1.2 KM', 'Hyatt Place Gurgaon Udyog Vihar': '7.7 KM', 'Dwarka Sector 21, Metro Station': '7.2 KM', 'Pacific D21 Mall': '7.4 KM', 'Indira Gandhi International Airport': '14.7 KM', 'Hamoni Golf Camp': '6.2 KM', 'Fun N Food Waterpark': '8.8 KM', 'Accenture DDC5': '9 KM'}"

In [236]:
def distance_to_meters(distance_str):
    try:
        if 'Km' in distance_str or 'KM' in distance_str:
            return float(distance_str.split()[0]) * 1000
        elif 'Meter' in distance_str or 'meter' in distance_str:
            return float(distance_str.split()[0])
        else:
            return None
    except:
        return None

In [237]:
#3. extract distance for each location
import ast

location_matrix = {}
for index, row in df.iterrows():
    distances = {}
    for location, distance in ast.literal_eval(row['LocationAdvantages']).items():
        distances[location] = distance_to_meters(distance)
    location_matrix[index] = distances

location_df = pd.DataFrame.from_dict(location_matrix, orient='index')

location_df.head()

Unnamed: 0,Bajghera Road,Palam Vihar Halt,DPSG Palam Vihar,Park Hospital,Gurgaon Railway Station,The NorthCap University,Dwarka Expy,Hyatt Place Gurgaon Udyog Vihar,"Dwarka Sector 21, Metro Station",Pacific D21 Mall,...,MCC Cricket Ground Dhankot,The Shri Ram School Aravali,Taj City Centre Gurugram,Minda Industries Corporate Office,"Rampura Flyover, Naurangpur Rd",Manesar toll plaza - Kherki Daula,"Imt Manesar, Gurugram",Holiday Inn,Sector 84 Road,Skyview Corporate Park
0,800.0,2500.0,3100.0,3100.0,4900.0,5400.0,1200.0,7700.0,7200.0,7400.0,...,,,,,,,,,,
25,550.0,,,,,6700.0,3800.0,,,7500.0,...,,,,,,,,,,
37,5300.0,,,,2500.0,8800.0,,,,,...,,,,,,,,,,
69,1500.0,,,,6500.0,6700.0,5100.0,,,8200.0,...,,,,,,,,,,
9,,,,5500.0,,,,,,,...,,,,,,,,,,


In [238]:
location_df.columns

Index(['Bajghera Road', 'Palam Vihar Halt', 'DPSG Palam Vihar',
       'Park Hospital', 'Gurgaon Railway Station', 'The NorthCap University',
       'Dwarka Expy', 'Hyatt Place Gurgaon Udyog Vihar',
       'Dwarka Sector 21, Metro Station', 'Pacific D21 Mall',
       ...
       'MCC Cricket Ground Dhankot', 'The Shri Ram School Aravali',
       'Taj City Centre Gurugram', 'Minda Industries  Corporate Office',
       'Rampura Flyover, Naurangpur Rd', 'Manesar toll plaza - Kherki Daula',
       'Imt Manesar, Gurugram', 'Holiday Inn', 'Sector 84 Road',
       'Skyview Corporate Park'],
      dtype='object', length=1070)

In [239]:
# df.PropertyName

In [240]:
location_df.index = df.PropertyName

location_df.head()

Unnamed: 0_level_0,Bajghera Road,Palam Vihar Halt,DPSG Palam Vihar,Park Hospital,Gurgaon Railway Station,The NorthCap University,Dwarka Expy,Hyatt Place Gurgaon Udyog Vihar,"Dwarka Sector 21, Metro Station",Pacific D21 Mall,...,MCC Cricket Ground Dhankot,The Shri Ram School Aravali,Taj City Centre Gurugram,Minda Industries Corporate Office,"Rampura Flyover, Naurangpur Rd",Manesar toll plaza - Kherki Daula,"Imt Manesar, Gurugram",Holiday Inn,Sector 84 Road,Skyview Corporate Park
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Smartworld One DXP,800.0,2500.0,3100.0,3100.0,4900.0,5400.0,1200.0,7700.0,7200.0,7400.0,...,,,,,,,,,,
M3M Crown,550.0,,,,,6700.0,3800.0,,,7500.0,...,,,,,,,,,,
Adani Brahma Samsara Vilasa,5300.0,,,,2500.0,8800.0,,,,,...,,,,,,,,,,
Sobha City,1500.0,,,,6500.0,6700.0,5100.0,,,8200.0,...,,,,,,,,,,
Signature Global City 93,,,,5500.0,,,,,,,...,,,,,,,,,,


In [241]:
location_df.shape

(246, 1070)

In [242]:
location_df.fillna(54000,inplace=True)
location_df

Unnamed: 0_level_0,Bajghera Road,Palam Vihar Halt,DPSG Palam Vihar,Park Hospital,Gurgaon Railway Station,The NorthCap University,Dwarka Expy,Hyatt Place Gurgaon Udyog Vihar,"Dwarka Sector 21, Metro Station",Pacific D21 Mall,...,MCC Cricket Ground Dhankot,The Shri Ram School Aravali,Taj City Centre Gurugram,Minda Industries Corporate Office,"Rampura Flyover, Naurangpur Rd",Manesar toll plaza - Kherki Daula,"Imt Manesar, Gurugram",Holiday Inn,Sector 84 Road,Skyview Corporate Park
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Smartworld One DXP,800.0,2500.0,3100.0,3100.0,4900.0,5400.0,1200.0,7700.0,7200.0,7400.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0
M3M Crown,550.0,54000.0,54000.0,54000.0,54000.0,6700.0,3800.0,54000.0,54000.0,7500.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0
Adani Brahma Samsara Vilasa,5300.0,54000.0,54000.0,54000.0,2500.0,8800.0,54000.0,54000.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0
Sobha City,1500.0,54000.0,54000.0,54000.0,6500.0,6700.0,5100.0,54000.0,54000.0,8200.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0
Signature Global City 93,54000.0,54000.0,54000.0,5500.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DLF Princeton Estate,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0
Pyramid Urban Homes 2,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0
Satya The Hermitage,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0
BPTP Spacio,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0


In [243]:
from sklearn.preprocessing import StandardScaler
# Initialize the scaler
scaler = StandardScaler()

# Apply the scaler to the entire dataframe
location_df_normalized = pd.DataFrame(scaler.fit_transform(location_df), columns=location_df.columns, index=location_df.index)


location_df_normalized


Unnamed: 0_level_0,Bajghera Road,Palam Vihar Halt,DPSG Palam Vihar,Park Hospital,Gurgaon Railway Station,The NorthCap University,Dwarka Expy,Hyatt Place Gurgaon Udyog Vihar,"Dwarka Sector 21, Metro Station",Pacific D21 Mall,...,MCC Cricket Ground Dhankot,The Shri Ram School Aravali,Taj City Centre Gurugram,Minda Industries Corporate Office,"Rampura Flyover, Naurangpur Rd",Manesar toll plaza - Kherki Daula,"Imt Manesar, Gurugram",Holiday Inn,Sector 84 Road,Skyview Corporate Park
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Smartworld One DXP,-7.960979,-15.652476,-15.652476,-3.149592,-2.966108,-3.147217,-3.726615,-10.231739,-15.652476,-6.023233,...,0.0,0.063888,0.063888,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888
M3M Crown,-7.998993,0.063888,0.063888,0.328277,0.368941,-3.054053,-3.529275,0.090308,0.063888,-6.009941,...,0.0,0.063888,0.063888,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888
Adani Brahma Samsara Vilasa,-7.276720,0.063888,0.063888,0.328277,-3.129124,-2.903557,0.280891,0.090308,0.063888,0.171073,...,0.0,0.063888,0.063888,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888
Sobha City,-7.854539,0.063888,0.063888,0.328277,-2.857430,-3.054053,-3.430606,0.090308,0.063888,-5.916893,...,0.0,0.063888,0.063888,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888
Signature Global City 93,0.128476,0.063888,0.063888,-2.985606,0.368941,0.335688,0.280891,0.090308,0.063888,0.171073,...,0.0,0.063888,0.063888,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DLF Princeton Estate,0.128476,0.063888,0.063888,0.328277,0.368941,0.335688,0.280891,0.090308,0.063888,0.171073,...,0.0,0.063888,0.063888,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888
Pyramid Urban Homes 2,0.128476,0.063888,0.063888,0.328277,0.368941,0.335688,0.280891,0.090308,0.063888,0.171073,...,0.0,0.063888,0.063888,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888
Satya The Hermitage,0.128476,0.063888,0.063888,0.328277,0.368941,0.335688,0.280891,0.090308,0.063888,0.171073,...,0.0,0.063888,0.063888,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888
BPTP Spacio,0.128476,0.063888,0.063888,0.328277,0.368941,0.335688,0.280891,0.090308,0.063888,0.171073,...,0.0,0.063888,0.063888,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888


In [244]:
cosine_sim3 = cosine_similarity(location_df_normalized)

cosine_sim3.shape

(246, 246)

In [245]:
def recommend_properties_with_scores(property_name, top_n=247):
    
    cosine_sim_matrix = 30*cosine_sim1 + 20*cosine_sim2 + 8*cosine_sim3
    # cosine_sim_matrix = cosine_sim3
    
    # Get the similarity scores for the property using its name as the index
    sim_scores = list(enumerate(cosine_sim_matrix[location_df_normalized.index.get_loc(property_name)]))
    
    # Sort properties based on the similarity scores
    sorted_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the indices and scores of the top_n most similar properties
    top_indices = [i[0] for i in sorted_scores[1:top_n+1]]
    top_scores = [i[1] for i in sorted_scores[1:top_n+1]]
    
    # Retrieve the names of the top properties using the indices
    top_properties = location_df_normalized.index[top_indices].tolist()
    
    # Create a dataframe with the results
    recommendations_df = pd.DataFrame({
        'PropertyName': top_properties,
        'SimilarityScore': top_scores
    })
    
    return recommendations_df

# Test the recommender function using a property name
recommend_properties_with_scores('Ireo Victory Valley')

Unnamed: 0,PropertyName,SimilarityScore
0,Pioneer Urban Presidia,28.021460
1,Ambience Creacions,27.787913
2,DLF The Crest,24.205986
3,Pioneer Araya,23.415308
4,Silverglades The Melia,21.007840
...,...,...
240,JMS The Nation,-14.735656
241,Shree Vardhman City,-14.888274
242,JMS Prime Land,-15.082229
243,Vatika Aspiration,-15.099679


In [246]:
(3*cosine_sim3 + 5*cosine_sim2 + 6*cosine_sim1).shape

(246, 246)

In [247]:
import pickle

In [248]:
pickle.dump(cosine_sim1, open('cosine_sim1','wb'))
pickle.dump(cosine_sim2, open('cosine_sim2','wb'))
pickle.dump(cosine_sim3, open('cosine_sim3','wb'))

In [249]:
(cosine_sim1,cosine_sim2, cosine_sim3)

(array([[1.        , 0.01095159, 0.        , ..., 0.01183329, 0.08656385,
         0.0110727 ],
        [0.01095159, 1.        , 0.01982121, ..., 0.11904241, 0.01555534,
         0.00963852],
        [0.        , 0.01982121, 1.        , ..., 0.07020502, 0.03820314,
         0.01962826],
        ...,
        [0.01183329, 0.11904241, 0.07020502, ..., 1.        , 0.09825738,
         0.03255851],
        [0.08656385, 0.01555534, 0.03820314, ..., 0.09825738, 1.        ,
         0.06257614],
        [0.0110727 , 0.00963852, 0.01962826, ..., 0.03255851, 0.06257614,
         1.        ]]),
 array([[ 1.        ,  0.13237477, -0.03199582, ...,  0.23938838,
          0.09359211,  0.3918892 ],
        [ 0.13237477,  1.        ,  0.08163866, ..., -0.28145036,
         -0.15433359, -0.13962499],
        [-0.03199582,  0.08163866,  1.        , ..., -0.23191291,
         -0.41377594, -0.32401234],
        ...,
        [ 0.23938838, -0.28145036, -0.23191291, ...,  1.        ,
          0.10863087,  0

In [250]:
pickle.dump(location_df, open('location_distance','wb'))