In [3]:
%load_ext autoreload
%autoreload 2

import pathlib
import psycopg2
import psycopg2.extras
import pandas as pd

from create_table import init_database_and_tables

from sql_queries import (
    review_insert, review_check,
    calendar_insert, calendar_check,
    listings_insert, listings_check,
    reviewer_insert, reviewer_check,
    time_insert, time_check
)

psycopg2.__version__

'2.9.3 (dt dec pq3 ext lo64)'

In [10]:
# import numpy as np
# from psycopg2.extensions import register_adapter, AsIs

# def addapt_numpy_array(numpy_array):
#     return AsIs(tuple(numpy_array))

# register_adapter(np.ndarray, addapt_numpy_array)

In [51]:
init_database_and_tables()

In [15]:
conn = psycopg2.connect("host=172.21.0.2 dbname=airbnbdb port=5432 user=airbnb password=airbnb")

print("PostgreSQL server information")
print(conn.get_dsn_parameters(), "\n")

cur = conn.cursor()

PostgreSQL server information
{'user': 'airbnb', 'channel_binding': 'prefer', 'dbname': 'airbnbdb', 'host': '172.21.0.2', 'port': '5432', 'options': '', 'sslmode': 'prefer', 'sslcompression': '0', 'sslsni': '1', 'ssl_min_protocol_version': 'TLSv1.2', 'gssencmode': 'prefer', 'krbsrvname': 'postgres', 'target_session_attrs': 'any'} 



In [13]:
def execute_values(conn, df, table):
    """
    Using psycopg2.extras.execute_values() to insert the dataframe
    """
    # Create a list of tupples from the dataframe values
    tuples = [tuple(x) for x in df.to_numpy()]
    # Comma-separated dataframe columns
    cols = ','.join(list(df.columns))
    # SQL quert to execute
    query  = "INSERT INTO %s(%s) VALUES %%s ON CONFLICT DO NOTHING" % (table, cols)
    cursor = conn.cursor()
    try:
        psycopg2.extras.execute_values(cursor, query, tuples)
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        return 1
    print("execute_values() done")
    cursor.close()

# Merge Data

In [27]:
listings_March = pd.read_csv("../../data/listings.csv")
listings_June = pd.read_csv("../../data/listings_1.csv")
listings_September = pd.read_csv("../../data/listings_2.csv")
listings_December = pd.read_csv("../../data/listings_3.csv")

calendar_March = pd.read_csv("../../data/calendar.csv")
calendar_June = pd.read_csv("../../data/calendar_1.csv")
calendar_September = pd.read_csv("../../data/calendar_2.csv")
calendar_December = pd.read_csv("../../data/calendar_3.csv")

reivew_March = pd.read_csv("../../data/reviews.csv")
reivew_June = pd.read_csv("../../data/reviews_1.csv")
reivew_September = pd.read_csv("../../data/reviews_2.csv")
reivew_December = pd.read_csv("../../data/reviews_3.csv")


listings_list = [listings_March, listings_June, listings_September, listings_December]
calendar_list = [calendar_June]
review_list = [reivew_March]

listings = pd.concat(listings_list)
calendar = pd.concat(calendar_list)
review = pd.concat(review_list)

In [28]:
listings.shape

(56846, 74)

In [53]:
calendar.head()

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,304189,2021-06-06,f,$135.00,$135.00,2.0,1125.0
1,62801,2021-06-06,f,$750.00,$750.00,30.0,30.0
2,62801,2021-06-07,f,$750.00,$750.00,30.0,30.0
3,62801,2021-06-08,f,$750.00,$750.00,30.0,30.0
4,62801,2021-06-09,f,$750.00,$750.00,30.0,30.0


In [30]:
review.shape

(438265, 6)

# Reviewer table

In [7]:
reviewer_table = review[["reviewer_id", "reviewer_name"]]
reviewer_table

Unnamed: 0,reviewer_id,reviewer_name
0,10952,Lam
1,12798,Alice
2,11869,Natalja
3,14064,Enrique
4,17977,Sherwin
...,...,...
272051,23491429,Marie-Eve
272052,234433792,Charles
272053,249850060,Emilien
272054,80402031,Jasna


In [8]:
reviewer_table.isna().sum()

reviewer_id      0
reviewer_name    0
dtype: int64

In [19]:
execute_values(conn, reviewer_table, table='reviewer')

execute_values() done


In [20]:
cur.execute(reviewer_check)

for row in cur.fetchall():
    print(row)

(10952, 'Lam')
(12798, 'Alice')
(11869, 'Natalja')
(14064, 'Enrique')
(17977, 'Sherwin')


# Time table

In [31]:
time_table = calendar[["date"]]
time_table["date"] = pd.to_datetime(time_table.date, format='%Y-%m-%d')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  time_table["date"] = pd.to_datetime(time_table.date, format='%Y-%m-%d')


In [32]:
t = time_table['date']
dt = t.dt

In [33]:
time_data = (t, dt.day, dt.month, dt.year, dt.isocalendar().week, dt.weekday)
column_labels = ("date_time", "day", "month", "year", "week","dayofweek")

In [34]:
time_df = pd.DataFrame({k: v for k, v in zip(column_labels, time_data)})
time_df.head()

Unnamed: 0,date_time,day,month,year,week,dayofweek
0,2021-06-06,6,6,2021,22,6
1,2021-06-06,6,6,2021,22,6
2,2021-06-07,7,6,2021,23,0
3,2021-06-08,8,6,2021,23,1
4,2021-06-09,9,6,2021,23,2


In [35]:
execute_values(conn, time_df, table='time')

execute_values() done


In [36]:
cur.execute(time_check)

for row in cur.fetchall():
    print(row)

(datetime.datetime(2021, 6, 6, 0, 0), 6, 6, 2021, 22, 6)
(datetime.datetime(2021, 6, 7, 0, 0), 7, 6, 2021, 23, 0)
(datetime.datetime(2021, 6, 8, 0, 0), 8, 6, 2021, 23, 1)
(datetime.datetime(2021, 6, 9, 0, 0), 9, 6, 2021, 23, 2)
(datetime.datetime(2021, 6, 10, 0, 0), 10, 6, 2021, 23, 3)


# Listings

In [7]:
col = [ 'id',
        'name' ,
        'room_type',
        'number_of_reviews' ,
        'review_scores_rating',
        'review_scores_value',
        'host_id' ,
        'host_name' ,
        'host_is_superhost',
        'host_listings_count' ,
        'neighbourhood',
        'property_type' ,
        'accommodates' ,
        'availability_30' ,
        'bathrooms_text',
        'bedrooms' ,
        'beds',
        'price' ,
        'minimum_nights',
        'maximum_nights',
        ]

listings_table = listings[col]

In [8]:
# Importing regex
import re

bath = ["bathrooms_text"]
for col in bath:
    listings_table[col] = listings_table[col].apply(lambda x: str(re.compile('[a-zA-Z]').sub('', str(x))))
    listings_table[col] = listings_table[col].apply(lambda x: str(re.compile(' ').sub('', str(x))))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_table[col] = listings_table[col].apply(lambda x: str(re.compile('[a-zA-Z]').sub('', str(x))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_table[col] = listings_table[col].apply(lambda x: str(re.compile(' ').sub('', str(x))))


In [9]:
listings_table["bathrooms"] = pd.to_numeric(listings_table["bathrooms_text"],errors='coerce')
listings_table.drop(['bathrooms_text'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_table["bathrooms"] = pd.to_numeric(listings_table["bathrooms_text"],errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_table.drop(['bathrooms_text'], axis=1, inplace=True)


In [10]:
# The price fields in our data frame

listings_table['price'] = listings_table['price'].str.replace(',', '')
listings_table['price'] = listings_table['price'].str.replace('$', '')
listings_table['price'] = listings_table['price'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_table['price'] = listings_table['price'].str.replace(',', '')
  listings_table['price'] = listings_table['price'].str.replace('$', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_table['price'] = listings_table['price'].str.replace('$', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus

In [46]:
for col in ['bathrooms', 'bedrooms', 'beds']:
    listings[col].fillna(listings[col].median(), inplace=True)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [40]:
listings_table.dtypes

id                        int64
name                     object
room_type                object
number_of_reviews         int64
review_scores_rating    float64
review_scores_value     float64
host_id                   int64
host_name                object
host_is_superhost        object
host_listings_count     float64
neighbourhood            object
property_type            object
accommodates              int64
availability_30           int64
bedrooms                float64
beds                    float64
price                   float64
minimum_nights            int64
maximum_nights            int64
bathrooms               float64
dtype: object

In [37]:
listings_table.columns

Index(['id', 'name', 'room_type', 'number_of_reviews', 'review_scores_rating',
       'review_scores_value', 'host_id', 'host_name', 'host_is_superhost',
       'host_listings_count', 'neighbourhood', 'property_type', 'accommodates',
       'availability_30', 'bedrooms', 'beds', 'price', 'minimum_nights',
       'maximum_nights', 'bathrooms'],
      dtype='object')

In [38]:
listings_table.head()

Unnamed: 0,id,name,room_type,number_of_reviews,review_scores_rating,review_scores_value,host_id,host_name,host_is_superhost,host_listings_count,neighbourhood,property_type,accommodates,availability_30,bedrooms,beds,price,minimum_nights,maximum_nights,bathrooms
0,2818,Quiet Garden View Room & Super Fast WiFi,Private room,278,98.0,10.0,3159,Daniel,t,1.0,"Amsterdam, North Holland, Netherlands",Private room in apartment,2,19,1.0,2.0,59.0,3,1125,1.5
1,20168,Studio with private bathroom in the centre 1,Private room,339,89.0,9.0,59484,Alexander,f,2.0,"Amsterdam, North Holland, Netherlands",Private room in townhouse,2,0,1.0,1.0,149.0,1,365,1.0
2,25428,"Lovely, spacious 1 bed apt in Center(with lift)",Entire home/apt,5,100.0,10.0,56142,Joan,t,2.0,,Entire apartment,3,29,1.0,1.0,125.0,14,180,1.0
3,27886,"Romantic, stylish B&B houseboat in canal district",Private room,219,99.0,10.0,97647,Flip,t,1.0,"Amsterdam, North Holland, Netherlands",Private room in houseboat,2,1,1.0,1.0,123.0,2,730,1.5
4,28871,Comfortable double room,Private room,336,97.0,10.0,124245,Edwin,t,2.0,"Amsterdam, North Holland, Netherlands",Private room in apartment,2,27,1.0,1.0,75.0,2,1825,1.0


In [16]:
execute_values(conn, listings_table, table='listings')

Error: column "review_scores_rating" of relation "listings" does not exist
LINE 1: ...INTO listings(id,name,room_type,number_of_reviews,review_sco...
                                                             ^



1

In [None]:
cur.execute(listings_check)

for row in cur.fetchall():
    print(row)

# Calendar

In [54]:
col = ['listing_id',
        'date',
        'available',
        'adjusted_price']

calendar_table = calendar[col]

In [55]:
calendar['adjusted_price'] = calendar['adjusted_price'].str.replace(',', '')
calendar['adjusted_price'] = calendar['adjusted_price'].str.replace('$', '')
calendar['price'] = calendar['adjusted_price'].astype(float)

  calendar['adjusted_price'] = calendar['adjusted_price'].str.replace('$', '')


In [56]:
execute_values(conn, calendar_table, table='calendar')

InterfaceError: connection already closed

In [None]:
cur.execute(calendar_check)

for row in cur.fetchall():
    print(row)

# Review

In [None]:
col = ['listings_id',
        'review_id',
        'date',
        'comment']

review_table = review[col]

In [None]:
execute_values(conn, review_table, table='review')

In [None]:
cur.execute(review_check)

for row in cur.fetchall():
    print(row)

In [50]:
conn.close ()