# Parse Data from WEB

In [1]:
from dataclasses import dataclass
from pathlib import Path
import sys
import json
from typing import List, Dict, Optional, Union

import httpx
import marshmallow_dataclass
from marshmallow import Schema
import pandas as pd

In [2]:
sys.path.append("../")

In [3]:
from schemas.advertising import AdvertisingSchema
from schemas.rubric import RubricFeatureSchema, RubricSchema
from schemas.user import UserSchema
from schemas.image import ImageSchema

## Rubrics

In [5]:
def load_json_from_web(url: str) -> Union[Dict, List]:
    """Load JSON from URL"""
    resp = httpx.get(url)
    resp.raise_for_status()
    data = resp.json()
    if type(data) == dict and "results" in data:
        data = data["results"]
    return data

In [6]:
def deserialize_json(data: Union[Dict, List], schema: Schema) -> List[Schema]:
    if type(data) == list:
        return [schema.load(i) for i in data]
    return [schema.load(data)]

In [7]:
def to_pandas(items: List[Schema]) -> pd.DataFrame:
    df = pd.DataFrame(items)
    return df

<hr>

In [None]:
URLS = [
    "https://www.bazaraki.com/api/items/rubrics/",
    "https://www.bazaraki.com/api/items/rubrics/19/",
    "https://www.bazaraki.com/api/items/rubrics/3527/",
]

df_rubrics = pd.DataFrame()
for url in URLS:
    # ETL pipeline:
    data = load_json_from_web(url=url)
    items = deserialize_json(data, schema=RubricSchema)
    df = to_pandas(items)
    
    # combine rubrics
    df_rubrics = pd.concat((df_rubrics, df), axis=0, ignore_index=True)


In [None]:
data = load_json_from_web(url="https://www.bazaraki.com/api/items/all_cities_districts/")

- Parse rubric features

In [7]:
df_rubrics = pd.read_parquet("../../airflow/data/raw/rubrics.parquet", engine="pyarrow")

In [8]:
# parent_ids: 19 and 3527
df_realty_rubrics = df_rubrics[
    df_rubrics.parent_id.isin([19, 3527])
][["id", "name", "slug", "path", "rubric_features"]]

In [39]:
df_realty_rubrics.drop("rubric_features", axis=1, inplace=True)

In [22]:
# realty rubrics:
df_realty_rubrics[["id", "name", "slug", "path"]]

Unnamed: 0,id,name,slug,path
15,678,Houses,houses,real-estate-for-sale/houses
16,3528,"Apartments, flats",apartments-flats,real-estate-for-sale/apartments-flats
17,2405,Commercial property,commercial-property,real-estate-for-sale/commercial-property
18,141,Plots of land,plots-of-land,real-estate-for-sale/plots-of-land
19,2790,Residential buildings,buildings,real-estate-for-sale/buildings
20,3303,Prefabricated houses,prefabricated-houses,real-estate-for-sale/prefabricated-houses
21,142,Other,other,real-estate-for-sale/other
22,681,Houses,houses,real-estate-to-rent/houses
23,3529,"Apartments, flats",apartments-flats,real-estate-to-rent/apartments-flats
24,2408,Commercial property,commercial-property,real-estate-to-rent/commercial-property


In [31]:
# get rubric features. Key: (rubric_id + feature_name)
df_realty_rubric_features = pd.DataFrame()
for i, row in df_realty_rubrics.iterrows():
    df_i = pd.DataFrame([RubricFeatureSchema.load(i) for i in row.rubric_features]).assign(
        rubric_id = row.id
    )
    df_realty_rubric_features = pd.concat(
        (df_realty_rubric_features, df_i),
        axis=0, ignore_index=True
    )

In [32]:
# process feature choises:
df_realty_rubric_features.feature_choices = (
    df_realty_rubric_features.feature_choices.apply(
        lambda x: {i["key"]: i["value"] for i in x}
    )
)

In [35]:
df_realty_rubric_features[
    df_realty_rubric_features.rubric_id == 3529
].head()

Unnamed: 0,feature_name,feature_verbose_name,feature_type,feature_type_id,feature_choices,filter_feature,required,measure_unit,rubric_id
90,attrs__type,Type,String choices,3,"{'5': 'Apartment', '8': 'Penthouse'}",True,True,,3529
91,attrs__number-of-bedrooms,Bedrooms,String choices,3,"{'0': 'Studio', '1': '1', '2': '2', '3': '3', ...",True,True,,3529
92,attrs__number-of-bathrooms,Bathrooms,Integer choices,4,"{'1': '1', '2': '2', '3': '3', '4': '4', '5': ...",False,False,,3529
93,attrs__area,Property area,Integer,0,{},True,True,m²,3529
94,attrs__floor,Floor,Integer choices,4,"{'10': 'Ground floor', '20': '1st', '30': '2nd...",False,False,,3529


## Parse Ads

In [5]:
# load rubrics & features alreay processed:
df_realty_rubrics = pd.read_parquet("../data/processed/realty_rubrics.parquet")

Expected new DataFrames:
- df_ads
- df_users
- df_images
- df_attributes - TODO!
- df_image_to_ad

In [31]:
%%time

# Load all ads:
df_ads = pd.DataFrame()
for rubric_id in df_realty_rubrics.id.unique():
    path_i = Path(f"../data/raw/ads-full/rubric_{rubric_id}.parquet")
    if path_i.exists():
        df_i = pd.read_parquet(path_i)
        df_ads = pd.concat((df_ads, df_i), ignore_index=True, axis=0)

CPU times: user 275 ms, sys: 51.1 ms, total: 326 ms
Wall time: 267 ms


- Parse users

In [33]:
%%time

user_list = df_ads.user \
.apply(lambda x: UserSchema.load(x)) \
.drop_duplicates() \
.to_list()

df_users = pd.DataFrame(user_list)

CPU times: user 1.56 s, sys: 12.7 ms, total: 1.57 s
Wall time: 1.58 s


In [34]:
# process users:
df_users = df_users.rename({
    "id": "user_id",
    "joined": "registration_date",
}, axis=1)

df_users["registration_date"] = pd.to_datetime(df_users["registration_date"])

- Parse Images

In [35]:
%%time

image_list = df_ads.images \
.apply(lambda x: [ImageSchema.load(i) for i in x]) \
.apply(pd.Series) \
.stack() \
.to_list()

df_images = pd.DataFrame(image_list) \
.drop_duplicates() \
.reset_index(drop=True)

df_images = df_images.rename({
    "id": "image_id",
    "url": "compressed_url",
    "orig": "origiral_url",
}, axis=1)



CPU times: user 2.98 s, sys: 48.9 ms, total: 3.03 s
Wall time: 3.05 s


- Image-to-Ad mapping

In [37]:
%%time

image_to_ad_list = df_ads \
.apply(
    lambda x: [[i["id"], x["id"]] for i in x.images],
    axis=1
) \
.apply(pd.Series) \
.stack() \
.to_list()


df_image_to_ad = pd.DataFrame(
    image_to_ad_list,
    columns=["image_id", "advertisement_id"]
)



CPU times: user 1.33 s, sys: 24.4 ms, total: 1.35 s
Wall time: 1.35 s


- Attributes

TODO: ...

- Clean Ads

In [38]:
# rename some columns
df_ads = df_ads.rename({
    "id": "advertisement_id",
    "rubric": "rubric_id",
    "city": "city_id"
}, axis=1)

# add user_id link:
df_ads["user_id"] = df_ads.user.apply(lambda x: x.get("id"))

# parse district_id
df_ads["district_id"] = df_ads.city_districts.apply(
    lambda x: x[0] if len(x) > 0 else None
)

# drop unused columns:
df_ads.drop(["city_districts", "user", "images", "attrs"], axis=1, inplace=True)

<hr>

In [25]:
# final artefacts:
df_users; df_images; df_ads; df_image_to_ad;

In [26]:
print("Users:", df_users.shape)
print("Images:", df_images.shape)
print("Ads:", df_ads.shape)
print("df_image_to_ad:", df_image_to_ad.shape)

Users: (3213, 7)
Images: (148122, 4)
Ads: (19748, 40)
df_image_to_ad: (148122, 2)


## Cities & Districts

In [13]:
# cities
df_cities = pd.read_parquet("../data/raw/cities.parquet", engine="pyarrow")
df_cities.rename({"id": "city_id"}, axis=1, inplace=True)
df_cities

Unnamed: 0,name,city_id,slug,coordinates
0,Famagusta,8,ammochostos-district,"{'latitude': 35.032496, 'longitude': 33.903682}"
1,Nicosia,11,lefkosia-district-nicosia,"{'latitude': 35.183955, 'longitude': 33.377947}"
2,Limassol,12,lemesos-district-limassol,"{'latitude': 34.706962, 'longitude': 33.022328}"
3,Larnaca,10,larnaka-district-larnaca,"{'latitude': 34.902932, 'longitude': 33.626676}"
4,Paphos,13,pafos-district-paphos,"{'latitude': 34.773899, 'longitude': 32.430527}"


In [12]:
# districts
df_districts = pd.read_parquet("../data/raw/districts.parquet", engine="pyarrow")
df_districts["city_id"] = df_districts.city.apply(lambda x: x.get("id")).astype(int)
df_districts.rename({"id": "city_id"}, axis=1, inplace=True)
df_districts.drop("city", axis=1, inplace=True)
df_districts

Unnamed: 0,name,city_id,slug,post_codes,coordinates,city_id.1
0,Achna,5756,achna,[5500],"{'latitude': 35.0388109, 'longitude': 33.7910458}",8
1,Agia Napa,5721,agia-napa,[8041],"{'latitude': 34.9856395, 'longitude': 33.9749189}",8
2,Agia Thekla,5722,agia-thekla,[5391],"{'latitude': 34.9790498, 'longitude': 33.9251563}",8
3,Agia Triada,5720,agia-triada,[],"{'latitude': 35.0501584, 'longitude': 34.0020218}",8
4,Agia Zoni,5760,agia-zoni,[],"{'latitude': 35.1082101, 'longitude': 33.949893}",8
...,...,...,...,...,...,...
631,Trimithousa,5158,trimithousa,[8813],"{'latitude': 34.975291, 'longitude': 32.487057}",13
632,Tsada,5081,tsada,[8540],"{'latitude': 34.84684, 'longitude': 32.459579}",13
633,Venus Rock Kouklia,5835,venus-rock-kouklia,[],"{'latitude': 34.6888, 'longitude': 32.5849}",13
634,Vretsia,5080,vretsia,[8644],"{'latitude': 34.88345, 'longitude': 32.662498}",13


## Parse ads features

In [152]:
def process_attribute_column(
    series: pd.Series,
    col_name: str,
    rubric_features_dict: Dict,
) -> pd.Series:
    """Process features based on bazaraki Enums"""
    if col_name not in rubric_features_dict.index:
        return series
        
    if rubric_features_dict.loc[col_name].feature_type in ("Integer", "String"):
        return series
    
    if rubric_features_dict.loc[col_name].feature_type in ("Integer choices", "String choices"):
        return series.apply(lambda x: rubric_features_dict.loc[col_name].feature_choices.get(str(x)))
    
    else:
        # TODO Implementation...
        return series
        raise NotImplementedError(
            f"Not implemented parsing for column type: {rubric_features_dict.loc[col_name].feature_type}"
        )

In [153]:
DATA_DIR = Path("../../airflow/data")

In [154]:
df_rubrics = pd.read_parquet(Path(DATA_DIR, "processed/rubrics.parquet"))
df_rubric_features = pd.read_parquet(Path(DATA_DIR, "processed/rubric_features.parquet"))

In [232]:
df_rubrics["id"].values

array([ 678, 3528, 2405,  141, 2790, 3303,  142,  681, 3529, 2408, 3530,
       2191,  434, 3531])

In [201]:
rubric_id = 142

In [213]:
_

In [216]:
with open(f"../../airflow/data/raw/ads-incremental/{rubric_id}/page_1.json", "r") as input_stream:
    ads = json.load(input_stream)

In [230]:
# load attributes
attributes = {i["id"]: i["attrs"] for i in ads["results"]}

In [207]:
# proprocessing:
df_attributes = pd.DataFrame(attributes).T

for col_name in df_attributes.columns:
    
    df_attributes[col_name] = process_attribute_column(
        series=df_attributes[col_name],
        col_name=col_name,
        rubric_features_dict=rubric_features_dict,
    )
    
    df_attributes.rename(
        {col_name: col_name.replace("attrs__", "").replace("-", "_")},
        axis=1,
        inplace=True
    )
    
    
# final processing:
df_attributes.reset_index(names=["advertisement_id"], inplace=True)
df_attributes["rubric_id"] = rubric_id