In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 
from pyspark.sql.types import * 

# Start (or reuse) a Spark session named 'airbnb' that runs with the 'local' master.
spark = (
    SparkSession.builder
    .master('local')
    .appName('airbnb')
    .getOrCreate()
)

# Load the Airbnb CSV: the first row supplies the column names and Spark is
# asked to infer the column types from the data.
# NOTE(review): the schema printed by this cell reports every column as
# `string` even though inferSchema is enabled. That may indicate rows the
# CSV parser cannot split cleanly (for example quoted multi-line fields);
# confirm against the raw file and consider the reader's `multiLine` /
# `escape` options if that turns out to be the case.
df_airbnb_US = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load('./airbnb_dataset.csv')
)

# The original statement ended with a stray comma, which built a throwaway
# tuple around print's return value; it is a plain call now.
print("\n----- Schema Airbnb dataframe : -----")
df_airbnb_US.printSchema()


----- Schema Airbnb dataframe : -----
root
 |-- id: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- host id: string (nullable = true)
 |-- host_identity_verified: string (nullable = true)
 |-- host name: string (nullable = true)
 |-- neighbourhood group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- long: string (nullable = true)
 |-- country: string (nullable = true)
 |-- country code: string (nullable = true)
 |-- instant_bookable: string (nullable = true)
 |-- cancellation_policy: string (nullable = true)
 |-- room type: string (nullable = true)
 |-- Construction year: string (nullable = true)
 |-- price: string (nullable = true)
 |-- service fee: string (nullable = true)
 |-- minimum nights: string (nullable = true)
 |-- number of reviews: string (nullable = true)
 |-- last review: string (nullable = true)
 |-- reviews per month: string (nullable = true)
 |-- review rate number: string (nullable = tr

In [9]:
# Columns kept from the raw Airbnb dataframe, in the order they should appear.
selected_columns = [
    "host_identity_verified",
    "neighbourhood group",
    "neighbourhood",
    "instant_bookable",
    "cancellation_policy",
    "room type",
    "price",
    "Construction year",
    "minimum nights",
    "review rate number",
]

# Project the raw dataframe onto the selected columns only.
df_airbnb_US_col_select = df_airbnb_US.select(*selected_columns)

# Inspect the result: schema first, then the first 50 rows.
df_airbnb_US_col_select.printSchema()
df_airbnb_US_col_select.show(50)

root
 |-- host_identity_verified: string (nullable = true)
 |-- neighbourhood group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- instant_bookable: string (nullable = true)
 |-- cancellation_policy: string (nullable = true)
 |-- room type: string (nullable = true)
 |-- price: string (nullable = true)
 |-- Construction year: string (nullable = true)
 |-- minimum nights: string (nullable = true)
 |-- review rate number: string (nullable = true)

+----------------------+-------------------+--------------------+----------------+-------------------+---------------+-------+-----------------+--------------+------------------+
|host_identity_verified|neighbourhood group|       neighbourhood|instant_bookable|cancellation_policy|      room type|  price|Construction year|minimum nights|review rate number|
+----------------------+-------------------+--------------------+----------------+-------------------+---------------+-------+-----------------+--------------+-----

In [21]:
# Target Spark type for each column that only needs a direct cast.
cast_targets = {
    "instant_bookable": BooleanType(),
    "minimum nights": LongType(),
    "review rate number": LongType(),
    "Construction year": LongType(),
}

# Replace each listed column with its casted version. withColumn overwrites
# a column of the same name, so every column keeps its name.
df_airbnb_US_col_select_ct = df_airbnb_US_col_select
for column_name, spark_type in cast_targets.items():
    df_airbnb_US_col_select_ct = df_airbnb_US_col_select_ct.withColumn(
        column_name,
        df_airbnb_US_col_select[column_name].cast(spark_type),
    )

# price needs cleaning before the cast: remove every '$' and ',' character,
# then cast what is left to a long.
price_without_symbols = regexp_replace("price", '[$,]', '')
df_airbnb_US_col_select_ct = df_airbnb_US_col_select_ct.withColumn(
    "price",
    price_without_symbols.cast(LongType()),
)


In [22]:
# Check the casts: print the new schema, then display the first 50 rows.
df_airbnb_US_col_select_ct.printSchema()
df_airbnb_US_col_select_ct.show(n=50)

root
 |-- host_identity_verified: string (nullable = true)
 |-- neighbourhood group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- instant_bookable: boolean (nullable = true)
 |-- cancellation_policy: string (nullable = true)
 |-- room type: string (nullable = true)
 |-- price: long (nullable = true)
 |-- Construction year: long (nullable = true)
 |-- minimum nights: long (nullable = true)
 |-- review rate number: long (nullable = true)

+----------------------+-------------------+--------------------+----------------+-------------------+---------------+-----+-----------------+--------------+------------------+
|host_identity_verified|neighbourhood group|       neighbourhood|instant_bookable|cancellation_policy|      room type|price|Construction year|minimum nights|review rate number|
+----------------------+-------------------+--------------------+----------------+-------------------+---------------+-----+-----------------+--------------+------------------