# Real estate prices data pipeline

In [1]:
#Importing modules

import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
import pyodbc
from sqlalchemy import create_engine
import urllib

## Get data from the housing website

In [2]:
#Price
#Creating a loop that gets price information from 10 pages
page = 1
price_column = []
while page != 10:
      url = f"https://domoplius.lt/skelbimai/butai?action_type=1&address_1=461&category_search=1&page_nr={page}&slist=141259750"
      response = requests.get(url)
      response_content = response.content
      soup = bs(response_content, 'html.parser')
    
      for element in soup.select('p strong'):
            price_column.append(element.get_text(separator=" ", strip=True))
      page = page + 1
df_prices = pd.DataFrame(price_column, columns = ['Price'])
print(f'Loaded {len(df_prices)} price rows')

Loaded 270 price rows


In [3]:
#Square meters
#Creating a loop that gets sqm information from 10 pages
page = 1
sqm_column = []
while page != 10:
      url = f"https://domoplius.lt/skelbimai/butai?action_type=1&address_1=461&category_search=1&page_nr={page}&slist=141259750"
      response = requests.get(url)
      response_content = response.content
      soup = bs(response_content, 'html.parser')
    
      for element in soup.find_all(attrs={"title" : "Buto plotas (kv. m)"}):
            sqm_column.append(element.get_text(separator=" ", strip=True))
      page = page + 1
df_sqm = pd.DataFrame(sqm_column, columns = ['Area'])
print(f'Loaded {len(df_sqm)} sqm rows')

Loaded 270 sqm rows


In [4]:
#Rooms
#Creating a loop that gets room information from 10 pages
page = 1
rooms_column = []
while page != 10:
      url = f"https://domoplius.lt/skelbimai/butai?action_type=1&address_1=461&category_search=1&page_nr={page}&slist=141259750"
      response = requests.get(url)
      response_content = response.content
      soup = bs(response_content, 'html.parser')
    
      for element in soup.find_all(attrs={"title" : "Kambarių skaičius"}):
            rooms_column.append(element.get_text(separator=" ", strip=True))
      page = page + 1
df_rooms = pd.DataFrame(rooms_column, columns = ['Rooms'])
print(f'Loaded {len(df_rooms)} rooms rows')

Loaded 270 rooms rows


In [5]:
#Year
#Creating a loop that gets year information from 10 pages
page = 1
years_column = []
while page != 10:
      url = f"https://domoplius.lt/skelbimai/butai?action_type=1&address_1=461&category_search=1&page_nr={page}&slist=141259750"
      response = requests.get(url)
      response_content = response.content
      soup = bs(response_content, 'html.parser')
    
      for element in soup.find_all(attrs={"title" : "Statybos metai"}):
            years_column.append(element.get_text(separator=" ", strip=True))
      page = page + 1
df_year = pd.DataFrame(years_column, columns = ['Year'])
print(f'Loaded {len(df_year)} year rows')

Loaded 270 year rows


In [6]:
#Floor
#Creating a loop that gets floor information from 10 pages
page = 1
floor_column = []
while page != 10:
      url = f"https://domoplius.lt/skelbimai/butai?action_type=1&address_1=461&category_search=1&page_nr={page}&slist=141259750"
      response = requests.get(url)
      response_content = response.content
      soup = bs(response_content, 'html.parser')
    
      for element in soup.find_all(attrs={"title" : "Aukštas"}):
            floor_column.append(element.get_text(separator=" ", strip=True))
      page = page + 1
df_floor = pd.DataFrame(floor_column, columns = ['Floor'])
print(f'Loaded {len(df_floor)} floor rows')

Loaded 270 floor rows


In [7]:
#Links
#Creating a loop that gets links information from 10 pages
page = 1
links_column = []
while page != 10:
      url = f"https://domoplius.lt/skelbimai/butai?action_type=1&address_1=461&category_search=1&page_nr={page}&slist=141259750"
      response = requests.get(url)
      response_content = response.content
      soup = bs(response_content, 'html.parser')
    
      for element in soup.find_all("h2", "title-list"):
            links_column.append(element.a.get("href"))
      page = page + 1
df_link = pd.DataFrame(links_column, columns = ['Link'])
print(f'Loaded {len(df_link)} link rows')

Loaded 270 link rows


In [34]:
#Location
#Creating a loop that gets links information from 10 pages
page = 1
location_column = []
while page != 10:
      url = f"https://domoplius.lt/skelbimai/butai?action_type=1&address_1=461&category_search=1&page_nr={page}&slist=141259750"
      response = requests.get(url)
      response_content = response.content
      soup = bs(response_content, 'html.parser')
    
      for element in soup.find_all("h2", "title-list"):
            location_column.append(element.a.get("title"))
      page = page + 1
df_location = pd.DataFrame(location_column, columns = ['Location'])
print(f'Loaded {len(df_link)} location rows')

Loaded 270 location rows


In [83]:
#Add all the columns together

df_columns = [
    df_prices,
    df_sqm,
    df_rooms,
    df_location,
    df_year,
    df_floor,
    df_link
]

df_re = pd.concat(df_columns, axis=1, join='inner')
df_re.insert(0, 'TimeStamp', pd.to_datetime('now').tz_localize('Antarctica/South_Pole').replace(microsecond=0))
df_re.head()

Unnamed: 0,TimeStamp,Price,Area,Rooms,Location,Year,Floor,Link
0,2022-05-30 15:42:57+12:00,48 700 €,25.00 m²,1 kamb.,"1 kambario butas Vilniuje, Naujininkuose, Šalt...",1980 m.,3/5 a.,https://domoplius.lt/skelbimai/parduodamas-1-k...
1,2022-05-30 15:42:57+12:00,239 000 €,79.00 m²,4 kamb.,"4 kambarių butas Vilniuje, Bendorėliuose, Bend...",2018 m.,3/3 a.,https://domoplius.lt/skelbimai/parduodamas-4-k...
2,2022-05-30 15:42:57+12:00,160 000 €,90.00 m²,4 kamb.,"4 kambarių butas Vilniuje, Pilaitėje, Dituvos g.",2021 m.,2/3 a.,https://domoplius.lt/skelbimai/parduodamas-4-k...
3,2022-05-30 15:42:57+12:00,212 000 €,58.82 m²,3 kamb.,"3 kambarių butas Vilniuje, Pilaitėje, Karaliau...",2022 m.,8/8 a.,https://domoplius.lt/skelbimai/parduodamas-3-k...
4,2022-05-30 15:42:57+12:00,147 900 €,53.68 m²,3 kamb.,"3 kambarių butas Vilniuje, Pilaitėje, Gilužio g.",2022 m.,2/8 a.,https://domoplius.lt/skelbimai/parduodamas-3-k...


## Cleaning data

In [84]:
df_re['Rooms'] = df_re['Rooms'].str.replace("kamb.", "").astype(int)
df_re['Price'] = df_re['Price'].str.replace("€", "").str.replace(" ", "").astype(int)
df_re['Year'] = df_re['Year'].str.replace(" m.", "").str.replace("m.", "")
df_re['Floor'] = df_re['Floor'].str.replace(" a.", "")
df_re['Area'] = df_re['Area'].str.replace(" m²", "").astype(str).astype(float)
df_re.replace('', np.nan)
df_re.head()

  df_re['Rooms'] = df_re['Rooms'].str.replace("kamb.", "").astype(int)
  df_re['Year'] = df_re['Year'].str.replace(" m.", "").str.replace("m.", "")
  df_re['Floor'] = df_re['Floor'].str.replace(" a.", "")


Unnamed: 0,TimeStamp,Price,Area,Rooms,Location,Year,Floor,Link
0,2022-05-30 15:42:57+12:00,48700,25.0,1,"1 kambario butas Vilniuje, Naujininkuose, Šalt...",1980,3/5,https://domoplius.lt/skelbimai/parduodamas-1-k...
1,2022-05-30 15:42:57+12:00,239000,79.0,4,"4 kambarių butas Vilniuje, Bendorėliuose, Bend...",2018,3/3,https://domoplius.lt/skelbimai/parduodamas-4-k...
2,2022-05-30 15:42:57+12:00,160000,90.0,4,"4 kambarių butas Vilniuje, Pilaitėje, Dituvos g.",2021,2/3,https://domoplius.lt/skelbimai/parduodamas-4-k...
3,2022-05-30 15:42:57+12:00,212000,58.82,3,"3 kambarių butas Vilniuje, Pilaitėje, Karaliau...",2022,8/8,https://domoplius.lt/skelbimai/parduodamas-3-k...
4,2022-05-30 15:42:57+12:00,147900,53.68,3,"3 kambarių butas Vilniuje, Pilaitėje, Gilužio g.",2022,2/8,https://domoplius.lt/skelbimai/parduodamas-3-k...


## Adding additional columns

In [85]:
df_re.insert(3, 'PriceSqm', df_re['Price'] / df_re['Area'])
df_re.head()

Unnamed: 0,TimeStamp,Price,Area,PriceSqm,Rooms,Location,Year,Floor,Link
0,2022-05-30 15:42:57+12:00,48700,25.0,1948.0,1,"1 kambario butas Vilniuje, Naujininkuose, Šalt...",1980,3/5,https://domoplius.lt/skelbimai/parduodamas-1-k...
1,2022-05-30 15:42:57+12:00,239000,79.0,3025.316456,4,"4 kambarių butas Vilniuje, Bendorėliuose, Bend...",2018,3/3,https://domoplius.lt/skelbimai/parduodamas-4-k...
2,2022-05-30 15:42:57+12:00,160000,90.0,1777.777778,4,"4 kambarių butas Vilniuje, Pilaitėje, Dituvos g.",2021,2/3,https://domoplius.lt/skelbimai/parduodamas-4-k...
3,2022-05-30 15:42:57+12:00,212000,58.82,3604.216253,3,"3 kambarių butas Vilniuje, Pilaitėje, Karaliau...",2022,8/8,https://domoplius.lt/skelbimai/parduodamas-3-k...
4,2022-05-30 15:42:57+12:00,147900,53.68,2755.216095,3,"3 kambarių butas Vilniuje, Pilaitėje, Gilužio g.",2022,2/8,https://domoplius.lt/skelbimai/parduodamas-3-k...


In [86]:
df_re[['ApartmentFloor', 'TotalFloors']] = df_re['Floor'].str.split('/', expand=True)
df_re['TotalFloors'] = df_re['TotalFloors'].replace('', np.nan, regex=True)
df_re = df_re.drop('Floor', 1)
df_re.head()

  df_re = df_re.drop('Floor', 1)


Unnamed: 0,TimeStamp,Price,Area,PriceSqm,Rooms,Location,Year,Link,ApartmentFloor,TotalFloors
0,2022-05-30 15:42:57+12:00,48700,25.0,1948.0,1,"1 kambario butas Vilniuje, Naujininkuose, Šalt...",1980,https://domoplius.lt/skelbimai/parduodamas-1-k...,3,5
1,2022-05-30 15:42:57+12:00,239000,79.0,3025.316456,4,"4 kambarių butas Vilniuje, Bendorėliuose, Bend...",2018,https://domoplius.lt/skelbimai/parduodamas-4-k...,3,3
2,2022-05-30 15:42:57+12:00,160000,90.0,1777.777778,4,"4 kambarių butas Vilniuje, Pilaitėje, Dituvos g.",2021,https://domoplius.lt/skelbimai/parduodamas-4-k...,2,3
3,2022-05-30 15:42:57+12:00,212000,58.82,3604.216253,3,"3 kambarių butas Vilniuje, Pilaitėje, Karaliau...",2022,https://domoplius.lt/skelbimai/parduodamas-3-k...,8,8
4,2022-05-30 15:42:57+12:00,147900,53.68,2755.216095,3,"3 kambarių butas Vilniuje, Pilaitėje, Gilužio g.",2022,https://domoplius.lt/skelbimai/parduodamas-3-k...,2,8


In [87]:
df_re[['Type', 'Location2']] = df_re['Location'].str.split('Vilniuje,', expand = True)
df_re = df_re.drop('Location', 1)
df_re[['Neighborhood', 'Street']] = df_re['Location2'].str.split(',', expand = True)
df_re = df_re.drop('Location2', 1)
df_re['City'] = 'Vilnius'
df_re.head()

  df_re = df_re.drop('Location', 1)
  df_re = df_re.drop('Location2', 1)


Unnamed: 0,TimeStamp,Price,Area,PriceSqm,Rooms,Year,Link,ApartmentFloor,TotalFloors,Type,Neighborhood,Street,City
0,2022-05-30 15:42:57+12:00,48700,25.0,1948.0,1,1980,https://domoplius.lt/skelbimai/parduodamas-1-k...,3,5,1 kambario butas,Naujininkuose,Šaltkalvių g.,Vilnius
1,2022-05-30 15:42:57+12:00,239000,79.0,3025.316456,4,2018,https://domoplius.lt/skelbimai/parduodamas-4-k...,3,3,4 kambarių butas,Bendorėliuose,Bendorių g.,Vilnius
2,2022-05-30 15:42:57+12:00,160000,90.0,1777.777778,4,2021,https://domoplius.lt/skelbimai/parduodamas-4-k...,2,3,4 kambarių butas,Pilaitėje,Dituvos g.,Vilnius
3,2022-05-30 15:42:57+12:00,212000,58.82,3604.216253,3,2022,https://domoplius.lt/skelbimai/parduodamas-3-k...,8,8,3 kambarių butas,Pilaitėje,Karaliaučiaus g.,Vilnius
4,2022-05-30 15:42:57+12:00,147900,53.68,2755.216095,3,2022,https://domoplius.lt/skelbimai/parduodamas-3-k...,2,8,3 kambarių butas,Pilaitėje,Gilužio g.,Vilnius


In [88]:
# Dropping duplicate values
df_re.drop_duplicates(keep=False,inplace=True)
df_re.head()

Unnamed: 0,TimeStamp,Price,Area,PriceSqm,Rooms,Year,Link,ApartmentFloor,TotalFloors,Type,Neighborhood,Street,City
0,2022-05-30 15:42:57+12:00,48700,25.0,1948.0,1,1980,https://domoplius.lt/skelbimai/parduodamas-1-k...,3,5,1 kambario butas,Naujininkuose,Šaltkalvių g.,Vilnius
1,2022-05-30 15:42:57+12:00,239000,79.0,3025.316456,4,2018,https://domoplius.lt/skelbimai/parduodamas-4-k...,3,3,4 kambarių butas,Bendorėliuose,Bendorių g.,Vilnius
2,2022-05-30 15:42:57+12:00,160000,90.0,1777.777778,4,2021,https://domoplius.lt/skelbimai/parduodamas-4-k...,2,3,4 kambarių butas,Pilaitėje,Dituvos g.,Vilnius
3,2022-05-30 15:42:57+12:00,212000,58.82,3604.216253,3,2022,https://domoplius.lt/skelbimai/parduodamas-3-k...,8,8,3 kambarių butas,Pilaitėje,Karaliaučiaus g.,Vilnius
4,2022-05-30 15:42:57+12:00,147900,53.68,2755.216095,3,2022,https://domoplius.lt/skelbimai/parduodamas-3-k...,2,8,3 kambarių butas,Pilaitėje,Gilužio g.,Vilnius


## Load data to SQL Server

In [91]:
DRIVER_NAME = 'SQL Server'
SERVER_NAME = '' #Enter server
DATABASE_NAME = '' #Enter database

#uid=<username>;
#pwd=<password>


connection = f"""
            DRIVER={{{DRIVER_NAME}}};
            SERVER={SERVER_NAME};
            DATABASE={DATABASE_NAME};
            Trust_connection=yes;  
"""


cnxn = pyodbc.connect(connection)
cursor = cnxn.cursor()

# Insert Dataframe into SQL Server:
for index, row in df_re.iterrows():
     cursor.execute("INSERT INTO RealEstatePrices (TimeStamp, Price, Area, PriceSqm, Rooms, Year, Link, ApartmentFloor, TotalFloors, Type, City, Neighborhood, Street) values(?,?,?,?,?,?,?,?,?,?,?,?,?)", row.TimeStamp, row.Price, row.Area, row.PriceSqm, row.Rooms, row.Year, row.Link, row.ApartmentFloor, row.TotalFloors, row.Type, row.City, row.Neighborhood, row.Street)
cnxn.commit()
cursor.close()