### Inspect user data
Our analysis will focus primarily on user reviews and tips, but it helps to take a quick look at the other files to check whether they contain data we'll need for sentiment analysis. The user file has no missing values, and I provide an explicit schema so that PySpark reads the data consistently and efficiently.

In [None]:
import pandas as pd
import os, json, pyarrow

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, ShortType, TimestampType, FloatType, IntegerType, ArrayType
from pyspark.sql.functions import length, count, when, col

In [None]:
# Do not truncate cell contents when a result is displayed through pandas.
pd.set_option('display.max_colwidth', None)

# Folder that the Yelp JSON files are read from.
dataset_path = 'yelp_dataset'

# Start (or reuse) a local Spark session; "local[*]" runs Spark in-process.
spark = (
    SparkSession.builder
    .appName("YelpUsersAnalysis")
    .master("local[*]")
    .getOrCreate()
)

In [None]:
# Read the Yelp users JSON in Spark with an explicit schema, so column types
# are fixed up front instead of being inferred from the data.
schema = StructType([
    StructField('user_id', StringType(), False),
    StructField('name', StringType(), True),
    StructField('review_count', IntegerType(), True),
    StructField('yelping_since', TimestampType(), True),
    StructField('useful', IntegerType(), True),
    StructField('funny', IntegerType(), True),
    StructField('cool', IntegerType(), True),
    # `elite` and `friends` are kept as raw strings.
    # NOTE(review): presumably comma-separated lists in the file; confirm
    # before splitting them into arrays.
    StructField('elite', StringType(), True),
    StructField('friends', StringType(), True),
    # `fans` is a numeric count, so it is read as an integer like the other
    # counters (it was previously declared as a string, which makes sorting
    # and aggregating on it lexicographic or cast-dependent).
    StructField('fans', IntegerType(), True),
    StructField('average_stars', FloatType(), True),
    StructField('compliment_hot', IntegerType(), True),
    StructField('compliment_more', IntegerType(), True),
    StructField('compliment_profile', IntegerType(), True),
    StructField('compliment_cute', IntegerType(), True),
    StructField('compliment_list', IntegerType(), True),
    StructField('compliment_note', IntegerType(), True),
    StructField('compliment_plain', IntegerType(), True),
    StructField('compliment_cool', IntegerType(), True),
    StructField('compliment_funny', IntegerType(), True),
    StructField('compliment_writer', IntegerType(), True),
    StructField('compliment_photos', IntegerType(), True),
])

# multiLine=False: each line of the file is parsed as its own JSON record.
users_df = spark.read.json(
    f'{dataset_path}/yelp_academic_dataset_user.json',
    multiLine=False,
    schema=schema,
)

In [None]:
# List the column names of the users DataFrame.
users_df.columns

In [None]:
# Preview a single row; long values are shortened in the printed table.
users_df.show(n=1, truncate=True)

In [None]:
# Shape of the users DataFrame: row count followed by column count.
print(f"{users_df.count()} {len(users_df.columns)}")

In [None]:
# Show each column name with its Spark type, to check how the data was read.
users_df.dtypes

#### No missing values

In [None]:
# One aggregate per column: `when` yields a value only where the column is
# not null, and `count` ignores nulls, so each result is the number of
# populated rows for that column.
non_null_counts = users_df.select(*(
    count(when(col(field).isNotNull(), field)).alias(field)
    for field in users_df.columns
))

# Transpose so every column name becomes a row label in the pandas output.
non_null_counts.toPandas().T