In [6]:
import os
import re
import math
import requests
import psycopg2
import logging
import numpy as np
import pandas as pd

from time import sleep
from config import config
from random import randint
from bs4 import BeautifulSoup
from datetime import datetime
import psycopg2.extras as extras
from sqlalchemy import create_engine
from get_offers_by_city import connect

In [7]:
conn = connect(config())

Connecting to database...
Connection successful


In [8]:
query = '''
    SELECT * FROM all_offers_infos
'''

In [None]:
cursor = conn.cursor()
cursor.execute(query)

result = cursor.fetchall()
df = pd.DataFrame(result)
#df.columns = ['extraction_datetime', 'offer_id', 'city', 'city_code', 'type']

In [None]:
df.shape

In [6]:
df.head()

Unnamed: 0,extraction_datetime,offer_id,city,city_code,type
0,2021-10-21 17:49:51,45695894,Dresden,100051,wohnung
1,2021-10-21 17:49:51,45649068,Dresden,100051,wohnung
2,2021-10-21 17:49:51,45311912,Dresden,100051,wohnung
3,2021-10-21 17:49:51,45690786,Dresden,100051,wohnung
4,2021-10-21 17:49:51,45693139,Dresden,100051,wohnung


In [1]:
import os
import re
import math
import requests
import psycopg2
import logging
import numpy as np
import pandas as pd

from tqdm import tqdm
from time import sleep
from config import config
from random import randint
from bs4 import BeautifulSoup
from datetime import datetime
import psycopg2.extras as extras
from sqlalchemy import create_engine
from get_offers_by_city import connect


# Create log folder if not exists
if not os.path.exists('Logs'):
    os.makedirs('Logs')
    
logging.basicConfig(
    filename='Logs/get_offers_infos.txt',
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%Y-%m_%d %H:%M:%S',
    level=logging.DEBUG
)

logger = logging.getLogger('get_offers_infos')

conn = connect(config())

def get_data_from_db(conn):
    '''Get the latest number os offers for each city.
    
    Parameters:
    ----------
        conn: connection to the database to extract the infos
        
    Return:
    -------
        Return a dataframe with the latest data about the number of offers in each city.
        
    '''   
    
    # create a cursor object
    cursor = conn.cursor()
    
    # get infos from the database
    query = '''
        SELECT * FROM all_offer_ids
    '''
    
    cursor.execute(query)
    result = cursor.fetchall()
    #df = pd.DataFrame(result)
    df_ids = pd.DataFrame(result, columns=['extraction_datetime', 'offer_id', 'city', 'city_code', 'type'])
    
    return df_ids

# get all infor for all founded offers
def get_offers_infos(df_ids, save=False):
    '''Get all infos from all rent offers
    
    Params:
    -------
        df_ids: offers ids source.
        save: save the returned dataframe locally or not.
        
    Returns:
    --------
        A dataframe with all offer informations in a row format.
    '''
    
    infos_list = []
    count = 0
    now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    
    with tqdm(total=df_ids.shape[0]) as pbar:
        for Id in set(df_ids['offer_id']):

            url = f"https://www.immonet.de/angebot/{Id}?drop=sel&related=false&product=standard"
            headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
            page = requests.get(url, headers=headers)
            soup = BeautifulSoup(page.text, "html.parser")

            #panels_infos = soup.findAll('div', class_='row box-50')
            # get all infos about each offer (all infos are mixed)
            
            # get all infos in almost a json format
            script_infos = soup.select('script', type_='text/javascript')
            
            for i in script_infos:
                if 'targetingParams' in i.text:
                    c = i.text
                    infos = re.findall('\{(.*?)\}', c)
                if 'initModalMap' in i.text:
                    try:
                        c = i.text.replace('\n', '').replace('\t', '')
                        lat_lng = re.findall('\{lat: \d+.\d+,lng: \d+.\d+}', c)
                    except:
                        lat_lng = np.nan
            #offer_infos = []
            #for panel in panels_infos:
            #    text = panel.text.replace('\n', '').replace('\t', '-')
            #    offer_infos.append(text)



            infos_list.append({'offer_id': Id,
                               'extraction_date': now,
                               'city': df_ids['city'][count],
                               'city_code': df_ids['city_code'][count],
                               'offer_type': df_ids['type'][count],
                               'lat_lng': lat_lng,
                               'offer_infos': infos})
            sleep(1)
            pbar.update(1)
            #print(count)
            count += 1

    df_infos = pd.DataFrame(infos_list)
    df_infos.drop_duplicates(subset='offer_id', inplace=True)
    logger.info('df_infos created')
    
    if save:
        now2 = datetime.now().strftime('%Y_%m_%d')
        df_infos.to_csv(f'../data/df_infos_{now2}.csv', index=False)
        
    return df_infos

def load_offer_ids(df_infos, conn):
    '''Get the informations and store in a database
    
    Params:
    -------
        df_infos: dataframe to be stored.
        conn: connection to the database.
    Return:
    -------
        None

    '''
    table_name = 'all_offers_infos'
    # delete table 
    query1 = f'DROP TABLE IF EXISTS {table_name}'
    cursor = conn.cursor()
    try:
        cursor.execute(query1)
        conn.commit()
        logger.info('Old table droped')
        print(f'Deleted {table_name} table.')
    except (Exception, psycopg2.DatabaseError) as error:
        logger.error(f"Error: {error}")
        print(f"Error: {error}")
        conn.rollback()
        cursor.close()
        return 1
    query2 = f'''CREATE TABLE IF NOT EXISTS {table_name} (
        offer_id INTEGER,
        extraction_date TEXT,
        city VARCHAR(50),
        city_code INTEGER,
        offer_type VARCHAR(50),
        offer_infos TEXT
    )'''
    try:
        cursor.execute(query2)
        conn.commit()
        logger.info('New table created')
        print('Recreated all_offer_ids table.')
    except (Exception, psycopg2.DatabaseError) as error:
        logger.error(f"Error: {error}")
        print(f"Error: {error}")
        conn.rollback()
        cursor.close()
        return 2
    # Create a list of tupples from the dataframe values
    tuples = [tuple(x) for x in df_infos.to_numpy()]
    # Comma-separated dataframe columns
    cols = ','.join(list(df_infos.columns))
    # SQL quert to execute
    query3 = "INSERT INTO %s(%s) VALUES %%s" % (table_name, cols)
    try:
        extras.execute_values(cursor, query3, tuples)
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        #logger.error(f"Error: {error}")
        conn.rollback()
        cursor.close()
        print(error)
        return 3
    logger.info(f"{table_name} uptodate.")
    print(f"{table_name} uptodate.")
    cursor.close()
    
    return None

Connecting to database...
Connection successful


In [4]:
url = f"https://www.immonet.de/angebot/45776903?drop=sel&related=false&product=standard"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.text, "html.parser")

a = soup.select('script', type_='text/javascript')

In [5]:
a

[<script src="https://navigation.immonet.de/v1/FULL/195/navigation.js"></script>,
 <script defer="" src="https://maps.googleapis.com/maps/api/js?key=AIzaSyAhKQ_dVVEM3riTDO4JDCB2ZSnkb8Iw6y8&amp;v=3.32.5a&amp;callback=initModalMap"></script>,
 <script src="//cdnglobal.immonet.de/frontend-resources/5.21.3/lib/gmaps/gmaps.js"></script>,
 <script src="//cdnglobal.immonet.de/frontend-resources/5.21.3/immonet/js/expose-head.min.js?v=187"></script>,
 <script src="https://cdn-a.yieldlove.com/v2/yieldlove.js?immonet.de"></script>,
 <script src="https://cdn.immonet.de/adtag-snippets/9.8.2/projects/expose.min.js"></script>,
 <script type="text/javascript">
     var sdmTargetingParams = JSON.parse('{"area":40,"mobex":true,"zip":"01069","objectcat":"Wohnung","pers":"1342334","rooms":2,"buildyear":2021,"fed":"Sachsen","city":"Dresden","obcon":"Erstbezug","obcat":"Etagenwohnung","balcn":false,"pic":"https://i.immonet.de/41/33/15/741413315_156x88.jpg","title":"Zwischen Elbe und Großem Garten - Neubau 2

In [18]:
df1 = pd.read_csv('../data/df_infos_2021_10_12.csv')

In [18]:
df = df.head()

In [23]:
get_offers_infos(df_ids=df)

100%|██████████| 5/5 [00:07<00:00,  1.50s/it]


Unnamed: 0,offer_id,extraction_date,city,city_code,offer_type,lat_lng,offer_infos
0,45690786,2021-10-28 18:06:29,Dresden,100051,wohnung,"[{lat: 51.05713,lng: 13.71334}]","[, ""area"":66.49,""mobex"":true,""zip"":""01067"",""ob..."
1,45311912,2021-10-28 18:06:29,Dresden,100051,wohnung,"[{lat: 51.089961730476176,lng: 13.729180415431...","[, ""area"":44.05,""mobex"":true,""zip"":""01127"",""ob..."
2,45649068,2021-10-28 18:06:29,Dresden,100051,wohnung,"[{lat: 51.07218,lng: 13.74477}]","[, ""area"":104.8,""mobex"":true,""zip"":""01097"",""ob..."
3,45693139,2021-10-28 18:06:29,Dresden,100051,wohnung,"[{lat: 51.03585,lng: 13.79821}]","[, ""area"":60,""mobex"":true,""zip"":""01277"",""objec..."
4,45695894,2021-10-28 18:06:29,Dresden,100051,wohnung,"[{lat: 51.08009,lng: 13.75074}]","[, ""area"":66,""mobex"":true,""zip"":""01099"",""objec..."


In [21]:
load_offer_ids(df_infos=df1, conn=conn)

Deleted all_offers_infos table.
Recreated all_offer_ids table.
all_offers_infos table is uptodate.
all_offers_infos uptodate.


In [20]:
df1.head()

Unnamed: 0,extraction_datetime,offer_id,city,city_code,type
0,2021-10-21 17:49:51,45695894,Dresden,100051,wohnung
1,2021-10-21 17:49:51,45649068,Dresden,100051,wohnung
2,2021-10-21 17:49:51,45311912,Dresden,100051,wohnung
3,2021-10-21 17:49:51,45690786,Dresden,100051,wohnung
4,2021-10-21 17:49:51,45693139,Dresden,100051,wohnung


In [1]:
# get all infor for all founded offers
def get_offers_infos(df_ids, save=False):
    '''
    Here we will get the infos from the info panels for each offer.
    '''
    count = 0
    #infos_list = []
    #datetime_list = []
    now = datetime.now().strftime('%Y-%m-%d-%H:%M:%S')

    for Id in set(df_ids['offer_id']):

        url = f"https://www.immonet.de/angebot/{Id}?drop=sel&related=false&product=standard"
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
        page = requests.get(url, headers=headers)
        soup = BeautifulSoup(page.text, "html.parser")

        panels_infos = soup.findAll('div', class_='row box-50')
        # get all infos about each offer (all infos are mixed)

        offer_infos = []
        for panel in panels_infos:
            text = panel.text.replace('\n', '').replace('\t', '-')
            offer_infos.append(text)



        infos_list.append({'offer_id': Id,
                           'extraction_date': now,
                           'city': df_ids['city'][count],
                           'city_code': df_ids['city_code'][count],
                           'offer_type': df_ids['type'][count],
                           'offer_infos': offer_infos})
        #print(count)
        count += 1

        #if len(offer_infos) != 0:
        #    datetime_list.append(datetime.now().strftime('%Y-%m-%d-%H:%M:%S'))
        #    infos_list.append(offer_infos)
        #else:
        #    infos_list.append(np.nan)
        #    print('ERROR')

    df_infos = pd.DataFrame(infos_list)
    df_infos.drop_duplicates(subset='offer_id', inplace=True)
    
    if save:
        now2 = datetime.now().strftime('%Y_%m_%d')
        df_infos.to_csv(f'../data/df_infos_{now2}.csv', index=False)
        
    return df_infos

### Load to DB test

In [None]:
import os
import re
import math
import requests
import psycopg2
import logging
import numpy as np
import pandas as pd

from tqdm import tqdm
from time import sleep
from config import config
from random import randint
from bs4 import BeautifulSoup
from datetime import datetime
import psycopg2.extras as extras
from sqlalchemy import create_engine
from get_offers_by_city import connect

In [5]:
def load_offer_ids(df_infos, conn):
    '''Get the informations and store in a database
    
    Params:
    -------
        df_infos: dataframe to be stored.
        conn: connection to the database.
    Return:
    -------
        None

    '''
    table_name = 'all_offers_raw_infos'
    # delete table 
    query1 = f'DROP TABLE IF EXISTS {table_name}'
    cursor = conn.cursor()
    print('initiated...')
    try:
        cursor.execute(query1)
        conn.commit()
        logger.info('Old table droped')
        print(f'Deleted {table_name} table.')
    except (Exception, psycopg2.DatabaseError) as error:
        logger.error(f"Error: {error}")
        print(f"Error: {error}")
        conn.rollback()
        cursor.close()
        return 1
    query2 = f'''CREATE TABLE IF NOT EXISTS {table_name} (
        offer_id INTEGER,
        extraction_date TEXT,
        city VARCHAR(50),
        city_code INTEGER,
        offer_type VARCHAR(50),
        lat_lng TEXT,
        offer_infos TEXT)'''
    try:
        cursor.execute(query2)
        conn.commit()
        logger.info('New table created')
        print('Recreated all_offer_ids table.')
    except (Exception, psycopg2.DatabaseError) as error:
        logger.error(f"Error: {error}")
        print(f"Error: {error}")
        conn.rollback()
        cursor.close()
        return 2
    # Create a list of tupples from the dataframe values
    tuples = [tuple(x) for x in df_infos.to_numpy()]
    # Comma-separated dataframe columns
    cols = ','.join(list(df_infos.columns))
    # SQL quert to execute
    query3 = "INSERT INTO %s(%s) VALUES %%s" % (table_name, cols)
    try:
        extras.execute_values(cursor, query3, tuples)
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        #logger.error(f"Error: {error}")
        conn.rollback()
        cursor.close()
        print(error)
        return 3
    logger.info(f"{table_name} uptodate.")
    print(f"{table_name} uptodate.")
    cursor.close()
    
    return None

In [6]:
conn = connect(config())

Connecting to database...
Connection successful


In [7]:
df_test = pd.read_csv('../data/df_infos_2021_11_03.csv')

In [8]:
load_offer_ids(df_infos=df_test, conn=conn)

initiated...
Deleted all_offers_raw_infos table.
Recreated all_offer_ids table.
all_offers_raw_infos uptodate.


In [4]:
drop_query = 'DROP TABLE IF EXISTS all_offers_raw_infos'
create_query = '''
CREATE TABLE IF NOT EXISTS all_offers_raw_infos (
    offer_id INTEGER,
    extraction_date TEXT,
    city VARCHAR(50),
    city_code INTEGER,
    offer_type VARCHAR(50),
    lat_lng TEXT,
    offer_infos TEXT)
'''

cursor = conn.cursor()
cursor.execute(create_query)
conn.commit()