In [24]:
import requests
from google.cloud import bigquery as bq
from bs4 import BeautifulSoup
from google.cloud import storage
from time import sleep
from random import randint
from datetime import datetime
import re
import pandas as pd
import os

In [33]:
def explicit(key_path='/Users/felipedemenechvasconcelos/keys/scenic-edition-310913-26647dbaf7a5.json'):
    """Print the Cloud Storage buckets visible to a service-account key.

    Parameters
    ----------
    key_path : str
        Path to the service-account JSON key file. Defaults to the path that
        used to be hard-coded, so ``explicit()`` behaves exactly as before;
        pass another path to run the check on a different machine or key.

    Returns
    -------
    None
        The bucket list is printed, not returned.
    """
    from google.cloud import storage

    # Build the client explicitly from the private key file.
    storage_client = storage.Client.from_service_account_json(key_path)

    # Listing the buckets forces an authenticated API request.
    buckets = list(storage_client.list_buckets())
    print(buckets)

In [34]:
# Smoke test: print the buckets returned for the service-account key.
explicit()

[<Bucket: de_rent_bkt>]


## Test Cloud Storage connection - get number of offers.

In [37]:
def get_offers_qtt(save=True):
    '''
    Scrape the number of rent offers per city and store it as a CSV file.

    For every city in the built-in city/code table the immonet.de search page
    is requested once per ``parentcat`` value (1 and 2 - houses and
    apartments, per the original comment) and the counts found on both pages
    are added up.

    Parameters
    ----------
    save : bool, default True
        When True, write the result to ``../data/offers_qtt_by_city.csv``.
        The ``../data`` folder is created if it does not exist.

    Returns
    -------
    None
        The result is only written to disk; nothing is returned.
    '''

    cities_dict = {
        'Dusseldorf': 100207,
        'Berlin': 87372,
        'Essen': 102157,
        'Munchen': 121673,
        'Koln': 113144,
        'Stuttgart': 143262,
        'Dresden': 100051,
        'Hannover': 109489,
        'Dortmund': 99990,
        'Frankfurt am Main': 105043,
        'Hamburg': 109447
    }
    # The same browser-like User-Agent is sent with every request.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

    # One timestamp for the whole run so every row shares it.
    now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    cities_offers = []

    for city, code in cities_dict.items():

        print(f'Getting offers in {city}...')

        total_offers = 0

        for parent_category in (1, 2):
            url = f'https://www.immonet.de/immobiliensuche/sel.do?&sortby=0&suchart=1&objecttype=1&marketingtype=2&parentcat={parent_category}&city={code}'
            # A timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(url, headers=headers, timeout=30)
            soup = BeautifulSoup(response.text, "html.parser")

            # Find number of rent offers.
            # NOTE(review): select() takes a CSS selector; the class_ keyword
            # does not appear to act as a filter here, so presumably every
            # <ul> is scanned - confirm before tightening the selector.
            candidates = soup.select('ul', class_='tbl margin-auto margin-top-0 margin-bottom-0 padding-0')

            for element in candidates:
                if 'Alle Orte' in element.text:
                    # Only the first run of digits is used.
                    # NOTE(review): a count rendered with a thousands
                    # separator (e.g. "1.234") would be read as 1 - verify
                    # against the live page.
                    total_offers += int(re.findall(r'\d+', element.text)[0])

        cities_offers.append({'extraction_datetime': now, 'city': city, 'city_code': code, 'offers': total_offers})

    df_offers = pd.DataFrame(cities_offers)

    if save:
        # to_csv() fails when the target folder is missing, so create it.
        os.makedirs('../data', exist_ok=True)
        df_offers.to_csv('../data/offers_qtt_by_city.csv', index=False)

    return None


def upload_blob(
    bucket_name="de_rent_bkt",
    source_file_name="../data/offers_qtt_by_city.csv",
    destination_blob_name="de_rent_data/de_rent_data",
    key_path='/Users/felipedemenechvasconcelos/keys/scenic-edition-310913-26647dbaf7a5.json',
):
    """Upload a local file to a Cloud Storage bucket.

    Every argument defaults to the value that used to be hard-coded, so a
    bare ``upload_blob()`` call behaves exactly as before.

    Parameters
    ----------
    bucket_name : str
        The ID of the GCS bucket.
    source_file_name : str
        Path of the local file to upload.
    destination_blob_name : str
        The ID (object name) the file gets inside the bucket.
        NOTE(review): this default has no ``.csv`` extension, unlike the
        object names used by the other upload cells - confirm it is intended.
    key_path : str
        Path to the service-account JSON key used to build the client.

    Returns
    -------
    None
    """
    storage_client = storage.Client.from_service_account_json(key_path)
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(source_file_name)

    print(f"File {source_file_name} uploaded to {destination_blob_name}.")
    
def main():
    """Run the offer-count step: scrape the per-city counts, then upload the CSV."""
    get_offers_qtt()
    upload_blob()


# Run the pipeline when this code is executed directly.
if __name__ == '__main__':
    main()

Getting offers in Dusseldorf...
Getting offers in Berlin...
Getting offers in Essen...
Getting offers in Munchen...
Getting offers in Koln...
Getting offers in Stuttgart...
Getting offers in Dresden...
Getting offers in Hannover...
Getting offers in Dortmund...
Getting offers in Frankfurt am Main...
Getting offers in Hamburg...
File ../data/offers_qtt_by_city.csv uploaded to de_rent_data.


## Test Cloud Storage connection - get all offer ids.

In [43]:
'''Get the ids of all rent offers in all predetermined cities.

This script collects the ids of all rent offers in the predetermined cities,
saves them to "../data/all_offers_ids.csv" and uploads that file to Cloud
Storage for further use.

The previous file is overwritten each time the script runs.
'''

import os
import re
import sys
import math
import requests
import psycopg2
import logging
import numpy as np
import pandas as pd

from time import sleep
from random import randint
from bs4 import BeautifulSoup
from datetime import datetime
import psycopg2.extras as extras
from google.cloud import storage
from sqlalchemy import create_engine

# Create the log folder if it does not exist yet (exist_ok makes the separate
# existence check unnecessary).
os.makedirs('Logs', exist_ok=True)

# Note: logging.basicConfig() does nothing when the root logger already has
# handlers, so it only takes effect the first time logging is configured in
# a given kernel/process.
logging.basicConfig(
    filename='Logs/get_offer_ids.txt',
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    # Fixed typo: the date part used to be '%Y-%m_%d' (underscore before the day).
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.DEBUG
)

logger = logging.getLogger('get_offer_ids')


def get_offer_ids(df, save=True):
    '''Scrape the ids of the rent offers listed for each city.

    Parameters:
    -----------
        df: a dataframe with one row per city and the columns ``city``,
            ``city_code`` and ``offers`` (the number of offers in that city).

        save: default=True
            when True, write the collected ids to
            ``../data/all_offers_ids.csv`` (the folder is created if needed).

    Return:
    -------
        None. The ids (one row per unique ``offer_id`` with extraction time,
        city, city code and offer type wohnung/haus) are only written to disk.
    '''

    offers_by_page = 26
    # parentcat code used in the search URL -> label stored in the output.
    offer_types = {1: 'wohnung', 2: 'haus'}
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    output_columns = ['extraction_datetime', 'offer_id', 'city', 'city_code', 'type']

    now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    ids_list = []

    # iterrows() works for any index, not only the default 0..n-1 one.
    for _, row in df.iterrows():
        city = row['city']
        city_code = row['city_code']
        offers = row['offers']

        print(f'Getting offer ids for {city}...')
        logger.info(f'Getting offer ids for {city}...')

        # Get the number of pages to scrape - rounded down.
        # NOTE(review): rounding down skips a trailing partial page (and
        # yields zero pages below 26 offers), the page index starts at 0, and
        # the same count is used for both offer types - confirm against the
        # site's paging before changing.
        number_of_pages = math.floor(offers / offers_by_page)

        for opt, offer_type in offer_types.items():
            for page_number in range(number_of_pages):
                url = f"https://www.immonet.de/immobiliensuche/sel.do?parentcat={opt}&objecttype=1&pageoffset=1&listsize=26&suchart=1&sortby=0&city={city_code}&marketingtype=2&page={page_number}"
                # A timeout keeps a stalled connection from hanging the run.
                response = requests.get(url, headers=headers, timeout=30)
                soup = BeautifulSoup(response.text, "html.parser")

                offers_list_1page = soup.find_all('div', class_="col-xs-12 place-over-understitial sel-bg-gray-lighter")

                for position, offer in enumerate(offers_list_1page):
                    try:
                        offer_id = offer.find('a')['data-object-id']
                    except (AttributeError, KeyError, TypeError):
                        # No <a> tag or no data-object-id attribute: skip this
                        # entry but keep scraping. The number logged is the
                        # entry's position on the page.
                        logger.error(f'Error - id:{position}')
                        continue
                    ids_list.append({'extraction_datetime': now, 'offer_id': offer_id, 'city': city, 'city_code': city_code, 'type': offer_type})
                # Short random pause between page requests.
                sleep(randint(1, 2))
        # Longer random pause between cities.
        sleep(randint(1, 5))

    # Explicit columns keep drop_duplicates() working when nothing was scraped
    # (an empty frame without an 'offer_id' column raises KeyError).
    df_ids = pd.DataFrame(ids_list, columns=output_columns)
    df_ids = df_ids.drop_duplicates(subset='offer_id')

    # save as csv file
    if save:
        output_dir = '../data/'
        os.makedirs(output_dir, exist_ok=True)
        filename = 'all_offers_ids.csv'
        df_ids.to_csv(os.path.join(output_dir, filename), index=False)

    return None

def upload_blob(
    bucket_name="de_rent_bkt",
    source_file_name="../data/all_offers_ids.csv",
    destination_blob_name="de_rent_data/all_offers_ids.csv",
    key_path='/Users/felipedemenechvasconcelos/keys/scenic-edition-310913-26647dbaf7a5.json',
):
    """Upload a local file to a Cloud Storage bucket.

    Every argument defaults to the value that used to be hard-coded, so a
    bare ``upload_blob()`` call behaves exactly as before.

    Parameters
    ----------
    bucket_name : str
        The ID of the GCS bucket.
    source_file_name : str
        Path of the local file to upload.
    destination_blob_name : str
        The ID (object name) the file gets inside the bucket.
    key_path : str
        Path to the service-account JSON key used to build the client.

    Returns
    -------
    None
    """
    storage_client = storage.Client.from_service_account_json(key_path)
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(source_file_name)

    print(f"File {source_file_name} uploaded to {destination_blob_name}.")
    
def main():
    """Read the per-city offer counts, scrape the offer ids and upload the result."""
    # Input: the per-city offer counts stored as a CSV file.
    offers_by_city = pd.read_csv('../data/offers_qtt_by_city.csv')

    get_offer_ids(offers_by_city)
    upload_blob()


# Run the pipeline when this code is executed directly.
if __name__ == '__main__':
    main()

Getting offer ids for Dusseldorf...
Getting offer ids for Berlin...
Getting offer ids for Essen...
Getting offer ids for Munchen...
Getting offer ids for Koln...
Getting offer ids for Stuttgart...
Getting offer ids for Dresden...
Getting offer ids for Hannover...
Getting offer ids for Dortmund...
Getting offer ids for Frankfurt am Main...
Getting offer ids for Hamburg...
File ../data/all_offers_ids.csv uploaded to de_rent_data/all_offers_ids.csv.


## Get all offer infos, preprocess them and load them into Cloud Storage

In [46]:
'''Load the raw offers data, then clean and organize it.

This script reads the raw dataset with all rent offers in the predefined cities
in Germany from "../data/all_offers_infos_raw.csv", separates the meaningful
information, cleans it and organizes it into different columns.

The cleaned data is written to "../data/all_offers_infos_pp.csv" and uploaded
to Cloud Storage.
'''
    
# imports
import re
import os
import logging
import numpy as np
import pandas as pd
from google.cloud import storage

# Set up the log folder, log file and logger object.
os.makedirs('Logs', exist_ok=True)

# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers - for instance when an earlier notebook cell has already called
# it in the same kernel. In that case this file is not created and records
# keep going to the destination configured first.
logging.basicConfig(
    filename='Logs/offers_infos_cleaner.txt',
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    # Fixed typo: the date part used to be '%Y-%m_%d' (underscore before the day).
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.DEBUG
)

logger = logging.getLogger('offers_infos_cleaner')


def _parse_area(fragment):
    """Return the area as a float; quotes, spaces and the 'area:' prefix are stripped first."""
    cleaned = fragment.replace('"', '').replace("'", "").replace('area:', '').replace(' ', '')
    return float(cleaned)


def _parse_flag(fragment):
    """Return 1 for a fragment containing 'true', 0 for 'false'; raise ValueError otherwise."""
    if 'true' in fragment:
        return 1
    if 'false' in fragment:
        return 0
    raise ValueError(fragment)


def _parse_int(fragment):
    """Return the first run of digits in the fragment as an int."""
    # NOTE(review): int() drops leading zeros, which matters for zip codes
    # that start with 0 - confirm whether they should be kept as text.
    return int(re.findall(r'\d+', fragment)[0])


def _parse_decimal(fragment):
    """Return the first number in the fragment as a float, keeping any decimal part."""
    return float(re.findall(r'\d+(?:\.\d+)?', fragment)[0])


def _parse_category(fragment):
    """Return the first quoted word that follows a colon, without the quotes."""
    return re.findall(r':"\w+"', fragment)[0][1:].replace('"', '')


def _parse_text(fragment):
    """Return the text between the first and second colon, without double quotes."""
    return fragment.split(':')[1].replace('"', '')


# (substring looked for, output column, parser, tail of the debug message).
# The order matches the original chain of checks.
_OFFER_FIELD_RULES = (
    ('area', 'area_m2', _parse_area, 'about area.'),
    ('mobex', 'furnished', _parse_flag, 'about furniture.'),
    ('zip', 'zip_code', _parse_int, 'about zip_code.'),
    ('objectcat', 'main_category', _parse_category, 'about main category.'),
    ('rooms', 'rooms', _parse_decimal, 'about number of rooms.'),
    ('buildyear', 'build_year', _parse_int, 'about build construction year.'),
    ('fed', 'state', _parse_text, 'about state.'),
    ('city', 'city', _parse_text, 'about city.'),
    ('obcat', 'sub_category', _parse_text, 'about sub-category.'),
    ('balcn', 'balcony', _parse_flag, 'about balcony.'),
    ('heatr', 'heat_type', _parse_text, 'about heat type.'),
    ('title', 'offer_title', _parse_text, 'about offer title.'),
    ('kitch', 'kitchen', _parse_flag, 'about kitchen.'),
    ('gardn', 'garden', _parse_flag, 'about garden.'),
    ('price', 'rent_price', _parse_decimal, 'rent price.'),
)


def _parse_offer_infos(raw_infos):
    """Split one raw ``offer_infos`` cell into fragments and extract the known fields."""
    # Drop backslashes and braces, cut the first four and the last character,
    # then split on commas.
    # NOTE(review): splitting on ',' also splits values that contain commas
    # (e.g. titles), and fields are recognised by substring, so unrelated
    # text containing e.g. 'area' can overwrite a field - verify on real data.
    stripped = raw_infos.replace('\\', '').replace('{', '').replace('}', '')
    fragments = stripped[4:][:-1].split(',')

    parsed = {}
    for fragment in fragments:
        for needle, column, parser, missing in _OFFER_FIELD_RULES:
            if needle not in fragment:
                continue
            try:
                parsed[column] = parser(fragment)
            except (ValueError, IndexError):
                # The value could not be read: store NaN and record why.
                parsed[column] = np.nan
                logger.debug(f'Offer {fragment} has no information {missing}')
    return parsed


def offers_infos_preprocess(df_raw, save=True):
    '''Clean the raw offers and separate the meaningful infos into columns.

    Parameter:
    ----------
        df_raw: Dataframe to be cleaned. It must have the columns
            ``offer_id``, ``extraction_date``, ``lat_lng`` and
            ``offer_infos``. It is not modified.

        save: default=True
            when True, write the cleaned data to
            ``../data/all_offers_infos_pp.csv`` (the folder is created if
            needed).

    Return:
    -------
        None. The cleaned data is only written to disk.
    '''
    df_list = []

    rows = zip(df_raw['offer_id'], df_raw['extraction_date'], df_raw['lat_lng'], df_raw['offer_infos'])
    for row_number, (offer_id, extraction_date, lat_lng, offer_infos) in enumerate(rows):
        # The first two decimal numbers in lat_lng are latitude and longitude;
        # they are kept as text, exactly as matched.
        coordinates = re.findall(r'\d+\.\d+', lat_lng)

        infos_dict = {
            'offer_id': offer_id,
            'extraction_date': extraction_date,
            'lat': coordinates[0],
            'lng': coordinates[1],
        }
        infos_dict.update(_parse_offer_infos(offer_infos))

        # append the infos about the offer
        df_list.append(infos_dict)
        logger.info(f'Offer no. {row_number} cleaned.')

    # create a new cleaned dataframe
    df_pp = pd.DataFrame(df_list)

    if save:
        output_dir = '../data/'
        os.makedirs(output_dir, exist_ok=True)
        filename = 'all_offers_infos_pp.csv'
        df_pp.to_csv(os.path.join(output_dir, filename), index=False)

    return None

def upload_blob(
    bucket_name="de_rent_bkt",
    source_file_name="../data/all_offers_infos_pp.csv",
    destination_blob_name="de_rent_data/all_offers_infos_pp.csv",
    key_path='/Users/felipedemenechvasconcelos/keys/scenic-edition-310913-26647dbaf7a5.json',
):
    """Upload a local file to a Cloud Storage bucket.

    Every argument defaults to the value that used to be hard-coded, so a
    bare ``upload_blob()`` call behaves exactly as before.

    Parameters
    ----------
    bucket_name : str
        The ID of the GCS bucket.
    source_file_name : str
        Path of the local file to upload.
    destination_blob_name : str
        The ID (object name) the file gets inside the bucket.
    key_path : str
        Path to the service-account JSON key used to build the client.

    Returns
    -------
    None
    """
    storage_client = storage.Client.from_service_account_json(key_path)
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(source_file_name)

    print(f"File {source_file_name} uploaded to {destination_blob_name}.")

def main():
    """Read the raw offers CSV, clean it and upload the cleaned file."""
    df_raw = pd.read_csv('../data/all_offers_infos_raw.csv')
    # offers_infos_preprocess() writes its result to disk and returns None,
    # so there is nothing to keep from the call.
    offers_infos_preprocess(df_raw)
    upload_blob()


# Run the pipeline when this code is executed directly.
if __name__ == '__main__':
    main()

File ../data/all_offers_infos_pp.csv uploaded to de_rent_data/all_offers_infos_pp.csv.
