In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
      .master("local[3]") \
      .appName("Japan Analysis") \
      .getOrCreate()
sc = spark.sparkContext
spark

In [3]:
# schemas
from pyspark.sql.types import StructType, StructField, StringType, DateType, BooleanType, IntegerType, ArrayType

base = '/home/jovyan/code/airbnb_data/japan/'

calender_file = base + 'calendar.csv'


calender_schema = StructType(
    [StructField("listing_id",StringType()),
     StructField("date",DateType()),
     StructField("available",StringType()),
     StructField("act_price",StringType()),
     StructField("adj_price",StringType()),
     StructField("min_nights",IntegerType()),
     StructField("max_nights",IntegerType()),
     ]
)

# host_id	host_url	host_name	host_since	host_location	host_about	host_response_time	
# host_response_rate	host_acceptance_rate	host_is_superhost
# host_neighbourhood	host_listings_count (particular listing)	 
# host_total_listings_count	host_verifications	host_identity_verified

host_schema = StructType(
    [StructField("host_id",IntegerType()),
     StructField("host_url",StringType()),
     StructField("host_name",StringType()),
     StructField("host_since",DateType()),
     StructField("host_about",StringType()),
     StructField("host_response_time",StringType()),
     StructField("host_response_rate",StringType()),
     StructField("host_acceptance_rate",StringType()),
     StructField("host_is_superhost",StringType()),
     StructField("host_neighbourhood",StringType()),
     StructField("host_total_listings_count",IntegerType()),
     StructField("host_verifications",StringType()),
     StructField("host_identity_verified",StringType())
     ]
)
calender_schema
host_schema

StructType([StructField('host_id', IntegerType(), True), StructField('host_url', StringType(), True), StructField('host_name', StringType(), True), StructField('host_since', DateType(), True), StructField('host_about', StringType(), True), StructField('host_response_time', StringType(), True), StructField('host_response_rate', StringType(), True), StructField('host_acceptance_rate', StringType(), True), StructField('host_is_superhost', StringType(), True), StructField('host_neighbourhood', StringType(), True), StructField('host_total_listings_count', IntegerType(), True), StructField('host_verifications', StringType(), True), StructField('host_identity_verified', StringType(), True)])

In [4]:
"""
            Calculate the total number of available listings each day from the Calendar.csv.

"""
from pyspark.sql import DataFrame
from pyspark.sql.functions import col,regexp_replace, when


df: DataFrame = spark.read.csv(calender_file,schema =calender_schema, header=True)
                    
df = df.withColumn("act_price",
              regexp_replace(col("act_price"),"[$,.]","").cast("integer"))   
df = df.withColumn("adj_price",
              regexp_replace(col("adj_price"),"[$,.]","").cast("integer"))  
df = df.withColumn("available",
                   when(col("available")=='f',False)
                   .when(col("available")=='t',True)
                   .cast('boolean'))  
df.show(3)
new_df = df.groupBy(col("date")).pivot('available',values=[True,False]).count().sort(col('date'),ascending=False)
new_df.show(3)

+----------+----------+---------+---------+---------+----------+----------+
|listing_id|      date|available|act_price|adj_price|min_nights|max_nights|
+----------+----------+---------+---------+---------+----------+----------+
|    197677|2023-09-24|    false|  1200000|  1200000|         3|      1125|
|    197677|2023-09-25|    false|  1200000|  1200000|         3|      1125|
|    197677|2023-09-26|    false|  1200000|  1200000|         3|      1125|
+----------+----------+---------+---------+---------+----------+----------+
only showing top 3 rows

+----------+----+-----+
|      date|true|false|
+----------+----+-----+
|2024-09-22|2332| 9893|
|2024-09-21|2332| 9893|
|2024-09-20|2332| 9893|
+----------+----+-----+
only showing top 3 rows



In [7]:
# import csv

# with open(listing_file, mode='r') as csv_file:
#     csv_reader = csv.DictReader(csv_file)
#     with open('your_updated_file.csv', 'w') as new_file:
#         header = None
#         cells = []
#         for row in csv_reader:
#             amen_str = row.get('amenities')
#             split_line = amen_str.split(',')
#             updated_line = ''
#             for i in split_line:
#                 i = i.replace('[', '').replace(']', '').replace('"', '').strip()
#                 updated_line += f"{i}|"
#             row['amenities'] = updated_line
#             nf = csv.writer(new_file)
#             header = [list(row.keys())]
#             cells.append(list(row.values()))
#         nf.writerows( header )
#         nf.writerows( cells )
"""
        Find the top 10 hosts with the most number of listings from the Listings.csv.
"""
import pandas as pd
import pyspark.pandas as ps



listing_file = base + 'listings_cleaned.csv'

pandas_df = pd.read_csv(listing_file)         
  
host_df = ps.from_pandas(pandas_df).to_spark()




In [8]:
host_df.show(5)

+-------+--------------------+--------------+------------+---------------+--------------------+--------------------+---------------------+--------------------+--------+--------------------+-------------------+----------+-------------+--------------------+------------------+------------------+--------------------+-----------------+--------------------+--------------------+------------------+-------------------+-------------------------+------------------+--------------------+----------------------+--------------------+----------------------+----------------------------+--------+---------+--------------------+---------------+------------+---------+--------------+--------+----+--------------------+----------+--------------+--------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------+----------------+---------------+---------------+---------------+----------------+------------------

In [None]:
from pyspark.sql.functions import to_date
for i in host_schema:
    if isinstance( i.dataType ,DateType):
        host_df = host_df.withColumn(i.name, to_date(col(i.name)) )


host_df = host_df.select([col(i.name) for i in host_schema])
print(host_df.show(5))
print(host_df.printSchema())


+-------+--------------------+--------------+------------+---------------+--------------------+--------------------+---------------------+--------------------+--------+--------------------+-------------------+----------+-------------+--------------------+------------------+------------------+--------------------+-----------------+--------------------+--------------------+------------------+-------------------+-------------------------+------------------+--------------------+----------------------+--------------------+----------------------+----------------------------+--------+---------+--------------------+---------------+------------+---------+--------------+--------+----+--------------------+----------+--------------+--------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------+----------------+---------------+---------------+---------------+----------------+------------------

In [9]:
host_df.na.fill(value='0%',subset=['host_response_rate','host_acceptance_rate'])
host_df.show(5)

+-------+--------------------+--------------+------------+---------------+--------------------+--------------------+---------------------+--------------------+--------+--------------------+-------------------+----------+-------------+--------------------+------------------+------------------+--------------------+-----------------+--------------------+--------------------+------------------+-------------------+-------------------------+------------------+--------------------+----------------------+--------------------+----------------------+----------------------------+--------+---------+--------------------+---------------+------------+---------+--------------+--------+----+--------------------+----------+--------------+--------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------+----------------+---------------+---------------+---------------+----------------+------------------

In [10]:
#host_df.write.option("header",True).csv('/home/jovyan/code/airbnb_data/japan/host')