In [1]:
%load_ext autoreload
%autoreload 2

import pathlib
import psycopg2
import psycopg2.extras
import pandas as pd

from create_table import init_database_and_tables

from sql_queries import (
    review_insert, review_check,
    calendar_insert, calendar_check,
    listings_insert, listings_check,
    reviewer_insert, reviewer_check,
    time_insert, time_check
)

psycopg2.__version__

'2.9.3 (dt dec pq3 ext lo64)'

In [10]:
import numpy as np
from psycopg2.extensions import register_adapter, AsIs

def addapt_numpy_array(numpy_array):
    return AsIs(tuple(numpy_array))

register_adapter(np.ndarray, addapt_numpy_array)

In [2]:
init_database_and_tables()

In [52]:
conn = psycopg2.connect("host=172.21.0.2 dbname=airbnbdb port=5432 user=airbnb password=airbnb")

print("PostgreSQL server information")
print(conn.get_dsn_parameters(), "\n")

cur = conn.cursor()

PostgreSQL server information
{'user': 'airbnb', 'channel_binding': 'prefer', 'dbname': 'airbnbdb', 'host': '172.21.0.2', 'port': '5432', 'options': '', 'sslmode': 'prefer', 'sslcompression': '0', 'sslsni': '1', 'ssl_min_protocol_version': 'TLSv1.2', 'gssencmode': 'prefer', 'krbsrvname': 'postgres', 'target_session_attrs': 'any'} 



In [None]:
def execute_values(conn, df, table):
    """
    Using psycopg2.extras.execute_values() to insert the dataframe
    """
    # Create a list of tupples from the dataframe values
    tuples = [tuple(x) for x in df.to_numpy()]
    # Comma-separated dataframe columns
    cols = ','.join(list(df.columns))
    # SQL quert to execute
    query  = "INSERT INTO %s(%s) VALUES %%s ON CONFLICT DO NOTHING" % (table, cols)
    cursor = conn.cursor()
    try:
        psycopg2.extras.execute_values(cursor, query, tuples)
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        return 1
    print("execute_values() done")
    cursor.close()

# Merge Data

In [31]:
listings_March = pd.read_csv("../../data/listings.csv")
listings_June = pd.read_csv("../../data/listings_1.csv")
listings_September = pd.read_csv("../../data/listings_2.csv")
listings_December = pd.read_csv("../../data/listings_3.csv")

calendar_March = pd.read_csv("../../data/calendar.csv")
calendar_June = pd.read_csv("../../data/calendar_1.csv")
calendar_September = pd.read_csv("../../data/calendar_2.csv")
calendar_December = pd.read_csv("../../data/calendar_3.csv")

reivew_March = pd.read_csv("../../data/reviews.csv")
reivew_June = pd.read_csv("../../data/reviews_1.csv")
reivew_September = pd.read_csv("../../data/reviews_2.csv")
reivew_December = pd.read_csv("../../data/reviews_3.csv")


listings_list = [listings_March, listings_June, listings_September, listings_December]
calendar_list = [calendar_March, calendar_June, calendar_September, calendar_December]
review_list = [reivew_March, reivew_June, reivew_September, reivew_December]

listings = pd.concat(listings_list)
calendar = pd.concat(calendar_list)
review = pd.concat(review_list)

In [57]:
listings.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'description',
       'neighborhood_overview', 'picture_url', 'host_id', 'host_url',
       'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'calendar_upd

In [8]:
calendar.head()

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,451719,2021-03-05,t,$190.00,$190.00,3.0,60.0
1,230710,2021-03-05,f,$125.00,$125.00,14.0,60.0
2,230710,2021-03-06,f,$125.00,$125.00,14.0,60.0
3,230710,2021-03-07,f,$125.00,$125.00,14.0,60.0
4,230710,2021-03-08,f,$125.00,$125.00,14.0,60.0


In [9]:
review.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,2818,1191,2009-03-30,10952,Lam,Daniel is really cool. The place was nice and ...
1,2818,1771,2009-04-24,12798,Alice,Daniel is the most amazing host! His place is ...
2,2818,1989,2009-05-03,11869,Natalja,We had such a great time in Amsterdam. Daniel ...
3,2818,2797,2009-05-18,14064,Enrique,Very professional operation. Room is very clea...
4,2818,3151,2009-05-25,17977,Sherwin,Daniel is highly recommended. He provided all...


# Reviewer table

In [33]:
reviewer_table = review[["reviewer_id", "reviewer_name"]]
reviewer_table

Unnamed: 0,reviewer_id,reviewer_name
0,10952,Lam
1,12798,Alice
2,11869,Natalja
3,14064,Enrique
4,17977,Sherwin
...,...,...
272051,23491429,Marie-Eve
272052,234433792,Charles
272053,249850060,Emilien
272054,80402031,Jasna


In [34]:
reviewer_table.isna().sum()

reviewer_id      0
reviewer_name    0
dtype: int64

In [55]:
execute_values(conn, reviewer_table, table='reviewer')

execute_values() done


In [56]:
cur.execute(reviewer_check)

for row in cur.fetchall():
    print(row)

(10952, 'Lam')
(12798, 'Alice')
(11869, 'Natalja')
(14064, 'Enrique')
(17977, 'Sherwin')


# Time table

In [80]:
time_table = calendar[["date"]]
time_table["date"] = pd.to_datetime(time_table.date, format='%Y-%m-%d')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  time_table["date"] = pd.to_datetime(time_table.date, format='%Y-%m-%d')


In [82]:
t = time_table['date']
dt = t.dt

In [83]:
time_data = (t, dt.day, dt.month, dt.year, dt.isocalendar().week, dt.weekday)
column_labels = ("date_time", "day", "month", "year", "week","dayofweek")

In [84]:
time_df = pd.DataFrame({k: v for k, v in zip(column_labels, time_data)})
time_df.head()

Unnamed: 0,date_time,day,month,year,week,dayofweek
0,2021-03-05,5,3,2021,9,4
1,2021-03-05,5,3,2021,9,4
2,2021-03-06,6,3,2021,9,5
3,2021-03-07,7,3,2021,9,6
4,2021-03-08,8,3,2021,10,0


In [None]:
execute_values(conn, time_df, table='time')

In [None]:
cur.execute(time_check)

for row in cur.fetchall():
    print(row)

# Listings

In [None]:
col = [ 'id',
        'name' ,
        'room_type',
        'number_of_reviews' ,
        'host_id' ,
        'host_name' ,
        'host_is_superhost',
        'host_listings_count' ,
        'neighbourhood',
        'property_type' ,
        'accommodates' ,
        'availability_30' ,
        'bedrooms' ,
        'price' ,
        'minimum_nights',
        'maximum_nights']

listings_table = listings[col]

In [None]:
execute_values(conn, listings_table, table='listings')

In [None]:
cur.execute(listings_check)

for row in cur.fetchall():
    print(row)

# Calendar

In [None]:
col = [ 'listings_id',
        'date',
        'available',
        'price']

calendar_table = calendar[col]

In [None]:
execute_values(conn, calendar_table, table='calendar')

In [None]:
cur.execute(calendar_check)

for row in cur.fetchall():
    print(row)

# Review

In [None]:
col = [ 'listings_id',
        'review_id',
        'date',
        'comment']

review_table = review[col]

In [None]:
execute_values(conn, review_table, table='review')

In [None]:
cur.execute(review_table)

for row in cur.fetchall():
    print(row)

In [51]:
conn.close ()