In [19]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

df = pd.read_csv('/app/data/suumo_geolocation.csv')

print(f"Total unique addresses: {len(df['converted_address'].unique())}")

df.head()

Total unique addresses: 2941


Unnamed: 0.1,Unnamed: 0,adress,URL,Line_1,Station_1,Minutes_1,Line_2,Station_2,Minutes_2,Line_3,...,is_free_rent,is_accessible_by_2_railway_lines,is_accessible_by_3+_railway_lines,is_within_5_minutes_walk_to_station,is_within_10_minutes_walk_to_station,is_top_floor,is_corner_unit,converted_address,latitude,longitude
0,0,東京都目黒区東山１,https://suumo.jp/chintai/jnc_000090405876/?bc=...,,,,,,,,...,0,0,0,0,0,0,0,東京都目黒区東山一丁目,35.646509,139.692993
1,1,東京都世田谷区上野毛１,https://suumo.jp/chintai/jnc_000091350228/?bc=...,東急大井町線,上野毛駅,6.0,東急大井町線,等々力駅,15.0,東急田園都市線,...,0,0,0,0,0,0,0,東京都世田谷区上野毛一丁目,35.61121,139.639622
2,2,東京都世田谷区鎌田１,https://suumo.jp/chintai/jnc_000091171055/?bc=...,,,,,,,,...,0,1,0,0,0,1,1,東京都世田谷区鎌田一丁目,35.615537,139.617052
3,3,東京都世田谷区鎌田１,https://suumo.jp/chintai/jnc_000091186431/?bc=...,,,,,,,,...,0,0,0,0,0,1,1,東京都世田谷区鎌田一丁目,35.615537,139.617052
4,4,東京都世田谷区鎌田１,https://suumo.jp/chintai/jnc_000091177744/?bc=...,,,,,,,,...,0,0,0,0,0,1,1,東京都世田谷区鎌田一丁目,35.615537,139.617052


In [20]:
nan_counts = df.isna().sum()
cols_with_count = nan_counts[nan_counts > 0].index
for col in cols_with_count:
    print(f"{col}: {nan_counts[col]}")

Line_1: 88240
Station_1: 88240
Minutes_1: 88203
Line_2: 89522
Station_2: 89522
Minutes_2: 89136
Line_3: 92822
Station_3: 92822
Minutes_3: 91581


In [21]:
df.drop(["Unnamed: 0",'adress', "converted_address", 'URL', 'ward', "Line_2", "Station_2", "Minutes_2", "Line_3", "Station_3", "Minutes_3"], axis=1, inplace=True)

df['Minutes_1'] = df['Minutes_1'].fillna(-1)
df = pd.get_dummies(df, columns=['Station_1'], dummy_na=True) # dummy_na=True so will include column for NaN values
df = pd.get_dummies(df, columns=['Line_1'], dummy_na=True) # dummy_na=True so will include column for NaN values

- Cluster wards
- PCA, or recursive feature elimination

In [22]:
X = df.drop(["rent", "deposit", "key_money", "management_fee"], axis=1)
y_rent = df["rent"]
y_deposit = df["deposit"]
y_key_money = df["key_money"]
y_management_fee = df["management_fee"]

# Create a random forest regressor with default parameters
rf = RandomForestRegressor()

# Fit the model
rf.fit(X, y_rent)

# Print the feature importances
feature_importances = rf.feature_importances_
# Sort the feature importances in descending order
sorted_importances = sorted(zip(X.columns, feature_importances), key=lambda x: x[1], reverse=True)

# List out the feature names and importances
for feature, importance in sorted_importances:
    print(f"{feature}: {importance}")


area: 0.4985911260065285
latitude: 0.28347984691510975
year: 0.04798920046032018
longitude: 0.03803220800371992
no_floors: 0.0310755566632668
Minutes_1: 0.021714092355978355
Station_1_目黒駅: 0.012974671622586788
no_rooms: 0.006534294215511407
floor: 0.005828238421694729
layout_3LDK: 0.005342487573790008
construction_method_other: 0.004691534299214651
building_type_mansion: 0.0045926108214708085
is_no_guarantor_required: 0.002821545759932328
has_security_camera: 0.0019215787198351559
has_auto_lock: 0.0018509981749420606
has_elevator: 0.0016964612175996705
layout_2LDK: 0.0011410558814376676
is_pet_ok: 0.0008912651027275355
building_type_apartment: 0.0008760536963058177
is_newly_built: 0.0008611785194682185
is_condo_rental: 0.0008297452169469703
has_floor_heating: 0.0007947418283725894
construction_method_RC: 0.0007659222021958599
layout_1LDK: 0.0007119604076231381
has_bathroom_dryer: 0.0006639856800889658
has_system_kitchen: 0.0005740290895061124
is_immediate_move_in: 0.0005646977624831853

In [None]:
# Extract the longitude and latitude columns from the dataframe
longitude = df['longitude']
latitude = df['latitude']

# Create a list to store the sum of squared distances for each number of clusters
sse = []

# Try different numbers of clusters
for k in range(1, 11):
    # Create a KMeans instance with k clusters
    kmeans = KMeans(n_clusters=k, random_state=0)
    
    # Fit the KMeans model to the longitude and latitude data
    kmeans.fit(df[['longitude', 'latitude']])
    
    # Calculate the sum of squared distances for the current number of clusters
    sse.append(kmeans.inertia_)

# Plot the elbow curve
plt.plot(range(1, 11), sse)
plt.xlabel('Number of Clusters')
plt.ylabel('Sum of Squared Distances')
plt.title('Elbow Method')
plt.show()