In [113]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, LongType, StringType, FloatType, DateType, DoubleType
from pyspark.sql.functions import desc, asc, col, avg, round, sum, max, min, mean, count, filter, isnan, when
import pyspark.ml.feature as MF

In [114]:
spark = SparkSession.builder.getOrCreate()

In [115]:
df = spark.read.csv('listings.csv', header=True, inferSchema=True)

Casting columns data type


In [116]:
df = df.withColumn('reviews_per_month', col('reviews_per_month').cast(DoubleType()))
df = df.withColumn('longitude', col('longitude').cast(DoubleType()))

In [117]:
numeric_cols = [c for c,t in df.dtypes if t in ('double')]
string_cols = [c for c,t in df.dtypes if t in ('string')]

print(string_cols)

['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'monthly_price', 'security_deposit', 'cleaning_fee', 'guests_included',

Counts number of missing values per column

In [118]:
df[string_cols].select([count(when(col(c).isNull(), c)).alias(c) for c in df[string_cols].columns]).show()
df[numeric_cols].select([count(when(col(c).isNull(), c)).alias(c) for c in df[numeric_cols].columns]).show()

+---+-----------+---------+------------+----+-------+-----+-----------+-------------------+---------------------+-----+-------+-------------+----------+-----------+--------------+-------+--------+---------+----------+-------------+----------+------------------+------------------+--------------------+-----------------+------------------+----------------+------------------+-------------------+-------------------------+------------------+--------------------+----------------------+------+-------------+----------------------+----------------------------+----+-----+-------+------+--------------+------------+-------+--------+-----------------+-------------+---------+------------+---------+--------+----+--------+---------+-----------+-----+------------+-------------+----------------+------------+---------------+------------+--------------+--------------+----------------+----------------+---------------+---------------+---------------+----------------+---------------------+-----------------+--

Check specific column for missing values

In [119]:
df.select([count(when(col('host_name').isNull(),True))]).show()

+--------------------------------------------------+
|count(CASE WHEN (host_name IS NULL) THEN true END)|
+--------------------------------------------------+
|                                              2793|
+--------------------------------------------------+



Drop specific rows with missing values in specificed column

In [120]:
df = df.dropna(subset=['host_name'], how='any')
df.select([count(when(col('host_name').isNull(),True))]).show()

+--------------------------------------------------+
|count(CASE WHEN (host_name IS NULL) THEN true END)|
+--------------------------------------------------+
|                                                 0|
+--------------------------------------------------+



Replace null values in numeric columns with the mean

In [121]:
imputer = MF.Imputer(strategy='mean', inputCols=[c for c in df[numeric_cols].columns],  outputCols=[c for c in df[numeric_cols].columns])
model = imputer.fit(df)
df1 = model.transform(df)

In [122]:
df1.select([count(when(col('longitude').isNull(),True))]).show()
df1[numeric_cols].orderBy(asc('longitude')).show()

+--------------------------------------------------+
|count(CASE WHEN (longitude IS NULL) THEN true END)|
+--------------------------------------------------+
|                                                 0|
+--------------------------------------------------+

+-------------------+-----------------+
|          longitude|reviews_per_month|
+-------------------+-----------------+
| -122.4172188142562|99.97435318275154|
|-122.41636191405388|            100.0|
|-122.41584774912927|            100.0|
|-122.41545457464815|             10.0|
|-122.41541701923138|            365.0|
|-122.41502502060048|             10.0|
|-122.41380164048266|99.97435318275154|
|-122.41366735523944|             10.0|
| -122.4129798367683|             10.0|
|-122.41297509483515|99.97435318275154|
|-122.41166700268097|             10.0|
|-122.41147001291664|99.97435318275154|
|-122.41136943694207|99.97435318275154|
|-122.41108586541058|99.97435318275154|
|-122.41048671948457|99.97435318275154|
| -122.4101447

Replace all missing values for string cols

In [125]:
df.fillna('Missing_Data', subset=string_cols).show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------------+--------------------+--------------------+----------------------+--------------------+--------------------+----------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------