In [12]:
#Importing modules
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
import datetime as dt
import pyodbc
from sqlalchemy import create_engine
import urllib
from geopy.geocoders import Nominatim

## Get data from all pages of the housing website

In [3]:
#Price
#Creating a loop that gets price information from all 159 pages
page = 1
price_column = []
while page != 10:
      url = f"https://domoplius.lt/skelbimai/butai?action_type=1&address_1=461&category_search=1&page_nr={page}&slist=141259750"
      response = requests.get(url)
      response_content = response.content
      soup = bs(response_content, 'html.parser')
    
      for element in soup.select('p strong'):
            price_column.append(element.get_text(separator=" ", strip=True))
      page = page + 1
df_prices = pd.DataFrame(price_column, columns = ['Price'])
print(f'Loaded {len(df_prices)} price rows')

Loaded 270 price rows


In [4]:
#Square meters
#Creating a loop that gets sqm information from all 159 pages
page = 1
sqm_column = []
while page != 10:
      url = f"https://domoplius.lt/skelbimai/butai?action_type=1&address_1=461&category_search=1&page_nr={page}&slist=141259750"
      response = requests.get(url)
      response_content = response.content
      soup = bs(response_content, 'html.parser')
    
      for element in soup.find_all(attrs={"title" : "Buto plotas (kv. m)"}):
            sqm_column.append(element.get_text(separator=" ", strip=True))
      page = page + 1
df_sqm = pd.DataFrame(sqm_column, columns = ['Area'])
print(f'Loaded {len(df_sqm)} sqm rows')

Loaded 270 sqm rows


In [5]:
#Rooms
#Creating a loop that gets room information from all 159 pages
page = 1
rooms_column = []
while page != 10:
      url = f"https://domoplius.lt/skelbimai/butai?action_type=1&address_1=461&category_search=1&page_nr={page}&slist=141259750"
      response = requests.get(url)
      response_content = response.content
      soup = bs(response_content, 'html.parser')
    
      for element in soup.find_all(attrs={"title" : "Kambarių skaičius"}):
            rooms_column.append(element.get_text(separator=" ", strip=True))
      page = page + 1
df_rooms = pd.DataFrame(rooms_column, columns = ['Rooms'])
print(f'Loaded {len(df_rooms)} rooms rows')

Loaded 270 rooms rows


In [6]:
#Year
#Creating a loop that gets year information from all 159 pages
page = 1
years_column = []
while page != 10:
      url = f"https://domoplius.lt/skelbimai/butai?action_type=1&address_1=461&category_search=1&page_nr={page}&slist=141259750"
      response = requests.get(url)
      response_content = response.content
      soup = bs(response_content, 'html.parser')
    
      for element in soup.find_all(attrs={"title" : "Statybos metai"}):
            years_column.append(element.get_text(separator=" ", strip=True))
      page = page + 1
df_year = pd.DataFrame(years_column, columns = ['Year'])
print(f'Loaded {len(df_year)} year rows')

Loaded 270 year rows


In [10]:
#Floor
#Creating a loop that gets floor information from all 159 pages
page = 1
floor_column = []
while page != 10:
      url = f"https://domoplius.lt/skelbimai/butai?action_type=1&address_1=461&category_search=1&page_nr={page}&slist=141259750"
      response = requests.get(url)
      response_content = response.content
      soup = bs(response_content, 'html.parser')
    
      for element in soup.find_all(attrs={"title" : "Aukštas"}):
            floor_column.append(element.get_text(separator=" ", strip=True))
      page = page + 1
df_floor = pd.DataFrame(floor_column, columns = ['Floor'])
print(f'Loaded {len(df_floor)} floor rows')

Loaded 270 floor rows


In [7]:
#Links
#Creating a loop that gets links information from all 159 pages
page = 1
links_column = []
while page != 10:
      url = f"https://domoplius.lt/skelbimai/butai?action_type=1&address_1=461&category_search=1&page_nr={page}&slist=141259750"
      response = requests.get(url)
      response_content = response.content
      soup = bs(response_content, 'html.parser')
    
      for element in soup.find_all("h2", "title-list"):
            links_column.append(element.a.get("href"))
      page = page + 1
df_link = pd.DataFrame(links_column, columns = ['Link'])
print(f'Loaded {len(df_link)} link rows')

Loaded 270 link rows


In [8]:
#Location
#Creating a loop that gets links information from all 159 pages
page = 1
location_column = []
while page != 10:
      url = f"https://domoplius.lt/skelbimai/butai?action_type=1&address_1=461&category_search=1&page_nr={page}&slist=141259750"
      response = requests.get(url)
      response_content = response.content
      soup = bs(response_content, 'html.parser')
    
      for element in soup.find_all("h2", "title-list"):
            location_column.append(element.a.get("title"))
      page = page + 1
df_location = pd.DataFrame(location_column, columns = ['Location'])
print(f'Loaded {len(df_link)} location rows')

Loaded 270 location rows


## Creating a complete dataset

In [14]:
#Add all the columns together
df_columns = [
    df_prices,
    df_sqm,
    df_rooms,
    df_location,
    df_year,
    df_floor,
    df_link
]

df_re = pd.concat(df_columns, axis=1, join='inner')

#Clean rooms column
df_re['Rooms'] = df_re['Rooms'].str.replace("kamb.", "").astype(int)

#Clean price column
df_re['Price'] = df_re['Price'].str.replace("€", "").str.replace(" ", "").astype(int)

#Clean year column
df_re['Year'] = df_re['Year'].str.replace(" m.", "").str.replace("m.", "")

#Clean floor column
df_re['Floor'] = df_re['Floor'].str.replace(" a.", "")
df_re[['ApartmentFloor', 'TotalFloors']] = df_re['Floor'].str.split('/', expand=True)
df_re['TotalFloors'] = df_re['TotalFloors'].replace('', np.nan, regex=True)
df_re = df_re.drop('Floor', 1)

#Clean area column
df_re['Area'] = df_re['Area'].str.replace(" m²", "").astype(str).astype(float)

#Split location column to city, neighborhood, street
df_re[['Type', 'Location2']] = df_re['Location'].str.split('Vilniuje,', expand = True)
df_re = df_re.drop('Location', 1)
df_re[['Neighborhood', 'Street']] = df_re['Location2'].str.split(',', expand = True)
df_re = df_re.drop('Location2', 1)
df_re['City'] = 'Vilnius'
df_re['Address'] = df_re['City'] + ', ' + df_re['Street']

#Add date column
df_re.insert(0, 'Date', dt.datetime.now().replace(microsecond=0))

#Add price per sq. meter column
df_re.insert(3, 'PriceSqm', df_re['Price'] / df_re['Area'])

#Add additional latitude and longitude columns for geospatial visualisation
geolocator = Nominatim(user_agent="Map")
latitude=[]
longitude=[]
for address in df_re["Address"]:
    location = geolocator.geocode(address)
    latitude.append(location.latitude)
    longitude.append(location.longitude)
df_re['Latitude']=latitude
df_re['Longitude']=longitude

df_re.replace('', np.nan)
df_re.head()

  df_re['Rooms'] = df_re['Rooms'].str.replace("kamb.", "").astype(int)
  df_re['Year'] = df_re['Year'].str.replace(" m.", "").str.replace("m.", "")
  df_re['Floor'] = df_re['Floor'].str.replace(" a.", "")
  df_re = df_re.drop('Floor', 1)
  df_re = df_re.drop('Location', 1)
  df_re = df_re.drop('Location2', 1)


Unnamed: 0,Date,Price,Area,PriceSqm,Rooms,Year,Link,ApartmentFloor,TotalFloors,Type,Neighborhood,Street,City,Address,Latitude,Longitude
0,2022-06-16 20:32:29,279000,66.51,4194.857916,3,2014,https://domoplius.lt/skelbimai/parduodamas-3-k...,2,9,3 kambarių butas,Naujamiestyje,A. Vivulskio g.,Vilnius,"Vilnius, A. Vivulskio g.",54.679536,25.269072
1,2022-06-16 20:32:29,195000,83.52,2334.770115,3,2005,https://domoplius.lt/skelbimai/parduodamas-3-k...,6,8,3 kambarių butas,Pašilaičiuose,Ukmergės g.,Vilnius,"Vilnius, Ukmergės g.",54.737012,25.230722
2,2022-06-16 20:32:29,200000,80.0,2500.0,3,2008,https://domoplius.lt/skelbimai/parduodamas-3-k...,5,9,3 kambarių butas,Pašilaičiuose,Perkūnkiemio g.,Vilnius,"Vilnius, Perkūnkiemio g.",54.736542,25.21738
3,2022-06-16 20:32:29,165000,65.54,2517.546536,2,2006,https://domoplius.lt/skelbimai/parduodamas-2-k...,8,11,2 kambarių butas,Santariškėse,Bajorų kel.,Vilnius,"Vilnius, Bajorų kel.",54.755798,25.260409
4,2022-06-16 20:32:29,165000,65.54,2517.546536,2,2006,https://domoplius.lt/skelbimai/parduodamas-2-k...,8,11,2 kambarių butas,Visoriuose,Bajorų kel.,Vilnius,"Vilnius, Bajorų kel.",54.755798,25.260409


## Loading data to SQL Server

In [None]:
DRIVER_NAME = 'SQL Server'
SERVER_NAME = '' #Enter server
DATABASE_NAME = '' #Enter database

#uid=<username>;
#pwd=<password>


connection = f"""
            DRIVER={{{DRIVER_NAME}}};
            SERVER={SERVER_NAME};
            DATABASE={DATABASE_NAME};
            Trust_connection=yes;  
"""


cnxn = pyodbc.connect(connection)
cursor = cnxn.cursor()

# Insert Dataframe into SQL Server:
for index, row in df_re.iterrows():
     cursor.execute("INSERT INTO RealEstatePrices (Date, Price, Area, PriceSqm, Rooms, Year, Link, ApartmentFloor, TotalFloors, Type, City, Neighborhood, Street) values(?,?,?,?,?,?,?,?,?,?,?,?,?)", row.Date, row.Price, row.Area, row.PriceSqm, row.Rooms, row.Year, row.Link, row.ApartmentFloor, row.TotalFloors, row.Type, row.City, row.Neighborhood, row.Street)
cnxn.commit()
cursor.close()

## Visualising gathered data

In [21]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

px.set_mapbox_access_token("pk.eyJ1IjoiZ2VkaW1pbmFzZ3J1b2QiLCJhIjoiY2wxcmpsOGk4MXFoZTNpcXIzeGxtYTJ2ayJ9.N2QdFZQgp_DOZZJXh2Ectw")
fig = px.scatter_mapbox(df_re, lat="Latitude", lon="Longitude",color="PriceSqm", size="PriceSqm",
                  color_continuous_scale=px.colors.cyclical.IceFire, size_max=50,zoom=12)
fig.show()