# Simulate Webapp
- In 04_select_model.ipynb, an XGBRegressor model was trained and saved using suumo and Tokyo geolocation data.
- This will be deployed to a webapp. 
- The webapp will accept a used apartment property listing URL from suumo.jp.
- In Django, this URL will be scraped for the model input data.
- The data will be sent to the server running the model and a predicted price will be returned.

In [85]:
import joblib
import pandas as pd

In [86]:
# Load the combined listing/geolocation data, discard the columns that are
# not model inputs, treat a missing balcony area as 0, and then drop any row
# that still has a missing value.
df = (
    pd.read_csv('/app/data/sales_combo_property_geo.csv')
    .drop(columns=["Category", "URL", "plan", "address", "features", "address_block", "converted_address"])
    .fillna({'balcony_area': 0})
    .dropna()
)

df.head()

Unnamed: 0,price,area,balcony_area,level,no_floors,year,rooms,living_room,dining_room,kitchen,storage,latitude,longitude
0,11900000.0,41.34,2.43,2.0,4.0,1979,2,False,True,True,0,35.807181,139.808077
1,13800000.0,16.45,2.69,5.0,9.0,1991,1,False,False,True,0,35.716056,139.787213
2,29800000.0,43.13,4.58,9.0,12.0,1975,2,False,True,True,0,35.725685,139.787644
3,34900000.0,64.96,7.53,4.0,14.0,1984,3,True,True,True,0,35.678826,139.822231
4,41800000.0,54.13,8.92,7.0,10.0,2011,2,True,True,True,0,35.731596,139.816982


In [87]:
# Target is the listing price; every remaining column is a feature.
y = df["price"]
X = df.drop(columns="price")

In [88]:
# Draw one listing with a fixed seed so the same row is picked on every run,
# then fetch the matching true price through the row's index label.
random_row_X = X.sample(n=1, random_state=42)
random_index = random_row_X.index
random_row_y = y.loc[random_index]

random_row_X

Unnamed: 0,area,balcony_area,level,no_floors,year,rooms,living_room,dining_room,kitchen,storage,latitude,longitude
16340,57.88,9.0,7.0,7.0,1996,2,True,True,True,0,35.72274,139.781199


In [89]:
# Load the trained price model from disk.
# NOTE(review): joblib.load unpickles the file, so it should only ever be
# pointed at trusted model artifacts.
model = joblib.load('/app/models/model_sales_price.pkl')

# Predict the price for the single sampled row.
prediction = model.predict(random_row_X)

# Show the prediction next to the listing's actual price.
print(f"Prediction: {prediction[0]:,.0f}. True: {int(random_row_y.iloc[0]):,.0f}")


Prediction: 54,039,208. True: 54,800,000


In [90]:
from retry import retry # automatic retry
import requests
from bs4 import BeautifulSoup # parse HTML
import pandas as pd
import datetime
import re


In [91]:
@retry(exceptions=requests.RequestException, tries=3, delay=10, backoff=2)
def _fetch_html(url, timeout):
    """
    Download url and parse it with BeautifulSoup.

    Request errors are deliberately NOT caught here: @retry can only
    re-attempt the call when the exception propagates out of this function.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Turn HTTP error statuses into exceptions
    return BeautifulSoup(response.content, 'html.parser')


def get_html(url, timeout=30):
    """
    Uses BeautifulSoup to parse HTML content from url.

    Makes up to 3 attempts, waiting 10 seconds after the first failure and
    20 seconds after the second (the delay doubles each try).

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server on each attempt, so a stalled
            connection cannot hang the call forever.

    Returns:
        The parsed BeautifulSoup document, or None if every attempt failed
        with a requests.RequestException.
    """
    try:
        return _fetch_html(url, timeout)
    except requests.RequestException:
        # All retries are exhausted; keep the "None on failure" contract.
        return None

In [92]:
def parse_price(value):
    """
    Convert a Japanese price string into an integer number of yen.

    Only the text before the first '円' is considered. Digits directly in
    front of '億' count as hundreds of millions and digits directly in front
    of '万' count as tens of thousands; the two parts are summed.
    If neither unit yields a non-zero amount, the text is parsed as a plain
    integer. Returns None when that plain parse fails.
    """
    price_text = value.strip().split('円')[0]

    amount = 0
    # (pattern for the digits before the unit, value of one unit in yen)
    for unit_pattern, unit_value in ((r'(\d+)億', 100000000), (r'(\d+)万', 10000)):
        found = re.search(unit_pattern, price_text)
        if found:
            amount += int(found.group(1)) * unit_value

    if amount:
        return amount

    try:
        return int(price_text)
    except ValueError:
        return None

def parse_plan(df):
    """
    Expand the floor-plan text in df['plan'] into model feature columns.

    Adds/overwrites these columns on df (in place) and returns df:
      - plan: the plan text with repeated consecutive digits collapsed
      - rooms: leading digit of the plan, 1 when there is none
      - living_room / dining_room / kitchen: whether 'L' / 'D' / 'K' appears
      - storage: number of storage-room markers found in the plan
    Plans containing 'ワンルーム' are forced to 1 room with no L/D/K.
    """
    # Collapse a run of one repeated digit into a single digit
    plan = df['plan'].apply(lambda text: re.sub(r'(\d)\1+', r'\1', text))
    df['plan'] = plan

    # Room count is the leading digit; fall back to 1 when absent
    df['rooms'] = plan.str.extract(r'^(\d)').fillna(1).astype(int)

    # One boolean flag per layout letter
    for column, letter in (('living_room', 'L'), ('dining_room', 'D'), ('kitchen', 'K')):
        df[column] = plan.str.contains(letter)

    # Count storage markers per row; rows without any marker get 0
    storage_hits = plan.str.extractall(r'(\+S|\+.*?S（納戸\）)')
    df['storage'] = storage_hits.groupby(level=0).size().reindex(df.index, fill_value=0)

    # Studios: a single room and none of the L/D/K flags
    is_studio = plan.str.contains('ワンルーム')
    df.loc[is_studio, 'rooms'] = 1
    df.loc[is_studio, ['living_room', 'dining_room', 'kitchen']] = False

    return df

def parse_area(value):
    """
    Extract the first number (integer or decimal) that is directly followed
    by 'm' and return it as a float.

    Returns 0 when the text contains no such number, and None if the matched
    text cannot be converted to a float.
    """
    found = re.search(r'(\d+(\.\d+)?)m', value)
    if not found:
        return 0
    try:
        return float(found.group(1))
    except ValueError:
        return None

def extract_number(value, i):
    """
    Split value on '/' and return the first integer found in part i.

    Returns None when value has no '/' separator, or when part i contains
    no digits.
    """
    parts = value.strip().split('/')
    if len(parts) <= 1:
        return None

    found = re.search(r'\d+', parts[i])
    if found is None:
        return None

    try:
        return int(found.group())
    except ValueError:
        return None

def parse_year(value):
    """
    Return the integer that precedes the first '年' in value.

    Returns None when that leading text is not a valid integer.
    """
    year_text, _, _ = value.strip().partition('年')
    try:
        return int(year_text)
    except ValueError:
        return None

def remove_whitespace(text):
    """ Collapse every run of whitespace (newline, tab, etc.) into one space and trim both ends. """
    return ' '.join(text.split())

def parse_features(soup):
    """
    Get the text of the 特徴ピックアップ section as a single cleaned string.

    Looks for the <h3> whose text is exactly '特徴ピックアップ', walks up to its
    enclosing title <div> (class 'secTitleOuterR', falling back to
    'secTitleOuterK'), and reads the next sibling <div class="mt10">.

    Args:
        soup: Parsed BeautifulSoup document of the listing page.

    Returns:
        The section's text with whitespace collapsed, or None when the
        heading, its title wrapper, or the content div is not present.
    """
    heading = soup.find('h3', string='特徴ピックアップ')
    if heading is None:
        return None

    # The heading's wrapper carries one of two class names.
    wrapper = heading.find_parent('div', class_='secTitleOuterR')
    if wrapper is None:
        wrapper = heading.find_parent('div', class_='secTitleOuterK')
    if wrapper is None:
        # Neither wrapper exists; report "no features" rather than failing
        # with an AttributeError on None.
        return None

    features_div = wrapper.find_next_sibling('div', class_='mt10')
    if features_div is None:
        return None

    return remove_whitespace(' '.join(features_div.stripped_strings))

def arabic_to_kanji(num):
    """
    Spell a non-negative integer with kanji numerals.

    Each non-zero digit becomes its kanji numeral followed by the marker for
    its place (none, 十, 百, 千); zero digits are skipped, so 0 yields ''.
    A leading '一' in front of '十' is dropped (10 -> '十', not '一十').
    Only four place markers are defined.
    """
    numerals = ("", "一", "二", "三", "四", "五", "六", "七", "八", "九")
    place_markers = ("", "十", "百", "千")

    # Walk from the least significant digit so the index is the digit's place
    pieces = []
    for place, char in enumerate(reversed(str(num))):
        digit = int(char)
        if digit > 0:
            pieces.append(numerals[digit] + place_markers[place])

    # Pieces were collected lowest place first; flip to reading order
    spelled = "".join(reversed(pieces))

    # Special case handling for numbers like 10, 11, 110, etc.
    return spelled.replace("一十", "十")

def convert_address_format(address):
    """
    Rewrite every number in address as its kanji spelling followed by '丁目'.

    Text without any digits is returned unchanged.
    """
    import re

    def to_chome(found):
        # Kanji spelling of the matched number plus the '丁目' suffix
        return arabic_to_kanji(int(found.group())) + '丁目'

    return re.sub(r'\d+', to_chome, address)

def parse_address(df):
    """
    Build a normalized 'address' column from the scraped address.

    The scraped column is renamed to 'original_address' (in place). The new
    'address' is the original text cut off after its first number (or the
    whole text when it has no number), passed through
    convert_address_format, stripped, and with 'ヶ' replaced by 'ケ'.
    Returns the same df.
    """
    df.rename(columns={'address': 'original_address'}, inplace=True)
    original = df['original_address']

    # Keep everything up to and including the first number; .extract gives
    # NaN when there is no number, so fall back to the full original text.
    block = original.str.extract(r'(.*?\d+)', expand=False).fillna(original)

    converted = block.apply(convert_address_format)
    df['address'] = converted.str.strip().str.replace('ヶ', 'ケ')
    return df

def address_to_long_lat(df, file_path='/app/data/address_long_lat.csv'):
    """
    Attach latitude/longitude to each row of df by looking up its address.

    The lookup table is a CSV with 'address', 'latitude' and 'longitude'
    columns. Rows are first matched on the exact 'address' value. For any
    row left without a latitude, the part of 'original_address' before its
    first digit is used as a fallback: the first lookup entry whose address
    contains that text supplies the coordinates.

    Args:
        df: DataFrame with 'address' and 'original_address' columns.
        file_path: Path of the address lookup CSV.

    Returns:
        A new DataFrame: df left-joined with the lookup table. Rows with no
        exact or fallback match keep NaN coordinates.
    """
    df_geo = pd.read_csv(file_path)
    df_results = pd.merge(df, df_geo, left_on='address', right_on='address', how='left')

    # Retry every row the exact-address merge could not resolve.
    unresolved = df_results.index[df_results['latitude'].isna()]
    for idx in unresolved:
        original = df_results.at[idx, 'original_address']
        if not isinstance(original, str):
            continue

        # Text before the first digit (the whole string if there is none),
        # normalized the same way as the 'address' column ('ヶ' -> 'ケ').
        prefix = re.split(r'\d', original, maxsplit=1)[0].strip().replace('ヶ', 'ケ')
        if not prefix:
            # An empty prefix would be "contained" in every address.
            continue

        # regex=False: the prefix is literal text, not a pattern.
        candidates = df_geo[df_geo['address'].str.contains(prefix, na=False, regex=False)]
        if not candidates.empty:
            df_results.at[idx, 'latitude'] = candidates.iloc[0]['latitude']
            df_results.at[idx, 'longitude'] = candidates.iloc[0]['longitude']

    return df_results

In [93]:
def scrape_url(url):
    """
    Scrape one property listing page into a single-row DataFrame.

    Reads the th/td pairs of the page's details table (the <table> whose
    summary attribute is "表"), parses the fields needed by the model, adds
    the feature text, and then derives the plan and address columns via
    parse_plan and parse_address.

    Args:
        url: Address of the listing page.

    Returns:
        A one-row DataFrame with the scraped and derived columns.

    Raises:
        ValueError: If the page cannot be downloaded, the details table is
            missing, or the floor plan / address was not found in the table.
    """
    soup = get_html(url)
    if soup is None:
        raise ValueError(f"Could not fetch listing page: {url}")

    target_table = soup.find('table', summary="表")
    if target_table is None:
        raise ValueError(f"No property details table found at: {url}")

    property_data = {'url': url}

    # Iterate over each row in the table
    for row in target_table.find_all('tr'):
        # Pair up the header and value cells of the row
        for th, td in zip(row.find_all('th'), row.find_all('td')):
            header = th.text.strip()
            value = td.text.strip()

            if "価格" in header:
                property_data['price'] = parse_price(value)
            elif "間取り" in header:
                property_data['plan'] = value
            elif "専有面積" in header:
                property_data['area'] = parse_area(value)
            elif "その他面積" in header:
                property_data['balcony_area'] = parse_area(value)
            elif "所在階/構造・階建" in header:
                property_data['level'] = extract_number(value, 0)
                property_data['no_floors'] = extract_number(value, 1)
            elif "完成時期（築年月）" in header:
                property_data['year'] = parse_year(value)
            elif "住所" in header:
                # Only the first line of the cell is kept as the address
                property_data['address'] = value.split('\n')[0]

    # parse_plan and parse_address index these columns directly, so fail
    # with a clear message instead of a bare KeyError further down.
    missing = [field for field in ('plan', 'address') if field not in property_data]
    if missing:
        raise ValueError(f"Listing at {url} is missing required field(s): {', '.join(missing)}")

    property_data['features'] = parse_features(soup)

    df_data = pd.DataFrame([property_data])

    # e.g. 2LDK -> rooms:2, living_room:True, dining_room:True, kitchen:True, storage:0
    # (the raw 'plan' column is kept on the result)
    df_data = parse_plan(df_data)

    # format address, e.g. 東京都江東区南砂１ to 東京都江東区南砂一丁目
    df_data = parse_address(df_data)

    return df_data

# Scrape a single live listing and attach its coordinates.
df_data = scrape_url("https://suumo.jp/ms/chuko/tokyo/sc_shinjuku/nc_75306094/")
df_data = address_to_long_lat(df_data)
# Save a snapshot under a timestamped file name so earlier runs are not overwritten.
current_time = datetime.datetime.now()
formatted_time = current_time.strftime("%Y%m%d_%H%M%S") # YYYYMMDD_HHMMSS
df_data.to_csv(f'/app/data/sales_property_data_{formatted_time}.csv', index=False)
df_data.head()

Unnamed: 0,url,price,plan,area,balcony_area,level,no_floors,year,original_address,features,rooms,living_room,dining_room,kitchen,storage,address,longitude,latitude
0,https://suumo.jp/ms/chuko/tokyo/sc_shinjuku/nc...,19800000,1LDK,29.59,3.56,2,4,1979,東京都新宿区高田馬場４,即引渡可 / ２沿線以上利用可 / スーパー 徒歩10分以内,1,True,True,True,0,東京都新宿区高田馬場四丁目,139.699185,35.710764


In [94]:
# Work from a previously saved scrape snapshot (rather than the df_data just
# scraped above) and keep only the price plus the numeric model inputs.
df_data_test = pd.read_csv('/app/data/sales_property_data_20240723_113737.csv').drop(
    columns=["url", "features", "original_address", "plan", "address"]
)
df_data_test.head()

Unnamed: 0,price,area,balcony_area,level,no_floors,year,rooms,living_room,dining_room,kitchen,storage,longitude,latitude
0,11900000,41.34,2.43,2,4,1979,2,False,True,True,0,139.808077,35.807181


In [95]:
# Count missing values per column, shown as a single row labelled 'NaNs'.
df_data_test.isna().sum().to_frame(name='NaNs').T

Unnamed: 0,price,area,balcony_area,level,no_floors,year,rooms,living_room,dining_room,kitchen,storage,longitude,latitude
NaNs,0,0,0,0,0,0,0,0,0,0,0,0,0


In [96]:
# Separate the scraped listing's price (target) from its features.
y_test = df_data_test["price"]
X_test = df_data_test.drop(columns="price")

# Draw one row with a fixed seed and fetch its true price by index label.
random_row_X = X_test.sample(n=1, random_state=42)
random_index = random_row_X.index
random_row_y = y_test.loc[random_index]

# NOTE: object-dtype columns are not converted to 'category' here; a
# conversion loop was drafted for this cell but left disabled.

In [97]:
# Load the trained price model from disk.
# NOTE: joblib.load unpickles the file, so only load trusted model artifacts.
model = joblib.load('/app/models/model_sales_price.pkl')

# Must make sure order of features is the same as during training
expected_feature_order = ['area', 'balcony_area', 'level', 'no_floors', 'year', 'rooms',
                          'living_room', 'dining_room', 'kitchen', 'storage', 'latitude', 'longitude']
random_row_X = random_row_X[expected_feature_order]

prediction = model.predict(random_row_X)

# Use .iloc[0] (position) rather than [0] (index label): the sampled row keeps
# its original label, so a label lookup only works when that label is 0.
print(f"Prediction: {prediction[0]:,.0f}. True: {int(random_row_y.iloc[0]):,.0f}")

Prediction: 7,920,905. True: 11,900,000


In [98]:
# Show the first training rows again for comparison with the scraped listing.
df.head()

Unnamed: 0,price,area,balcony_area,level,no_floors,year,rooms,living_room,dining_room,kitchen,storage,latitude,longitude
0,11900000.0,41.34,2.43,2.0,4.0,1979,2,False,True,True,0,35.807181,139.808077
1,13800000.0,16.45,2.69,5.0,9.0,1991,1,False,False,True,0,35.716056,139.787213
2,29800000.0,43.13,4.58,9.0,12.0,1975,2,False,True,True,0,35.725685,139.787644
3,34900000.0,64.96,7.53,4.0,14.0,1984,3,True,True,True,0,35.678826,139.822231
4,41800000.0,54.13,8.92,7.0,10.0,2011,2,True,True,True,0,35.731596,139.816982
