In [1]:
from pprint import pprint
from pyspark.sql import SparkSession
from IPython.display import HTML

# Inject a notebook-wide CSS rule: <pre> elements (plain-text cell output) keep
# their whitespace and never wrap, so wide text tables stay aligned and scroll
# horizontally instead of folding onto several lines.
_pre_no_wrap_css = "<style>pre { white-space: pre !important; }</style>"
display(HTML(_pre_no_wrap_css))

## Test Survey

In [2]:
# Numeric id of the survey; every sample file is read from data/<SURVEY_ID>/.
SURVEY_ID = 284955270

# Render the two test-survey images side by side: one table row, one cell per image.
image_cells = "".join(
    f"<td><img src='data/{SURVEY_ID}/test-survey-{index}.jpg'></td>"
    for index in (1, 2)
)
display(HTML("<table><tr>" + image_cells + "</tr></table>"))

## Response JSON

In [3]:
# Prerequisite command-line tools (Homebrew): brew install tree jq
# IPython substitutes $SURVEY_ID with the Python variable before the shell runs.
# List the response files stored for this survey.
!tree data/$SURVEY_ID/responses
# Pretty-print one response file with jq: the backtick sub-command picks the
# first name that `ls` reports for the responses directory.
!jq . data/$SURVEY_ID/responses/`ls data/$SURVEY_ID/responses/ | head -1`

[01;34mdata/284955270/responses[00m
├── api.surveymonkey.net_v3_surveys_284955270_responses_bulk_?page=1&per_page=1.json
├── api.surveymonkey.net_v3_surveys_284955270_responses_bulk_?page=2&per_page=1.json
└── api.surveymonkey.net_v3_surveys_284955270_responses_bulk_?page=3&per_page=1.json

0 directories, 3 files
[1;39m{
  [0m[34;1m"data"[0m[1;39m: [0m[1;39m[
    [1;39m{
      [0m[34;1m"id"[0m[1;39m: [0m[0;32m"11670854001"[0m[1;39m,
      [0m[34;1m"recipient_id"[0m[1;39m: [0m[0;32m""[0m[1;39m,
      [0m[34;1m"collection_mode"[0m[1;39m: [0m[0;32m"default"[0m[1;39m,
      [0m[34;1m"response_status"[0m[1;39m: [0m[0;32m"completed"[0m[1;39m,
      [0m[34;1m"custom_value"[0m[1;39m: [0m[0;32m""[0m[1;39m,
      [0m[34;1m"first_name"[0m[1;39m: [0m[0;32m""[0m[1;39m,
      [0m[34;1m"last_name"[0m[1;39m: [0m[0;32m""[0m[1;39m,
      [0m[34;1m"email_address"[0m[1;39m: [0m[0;32m""[0m[1;39m,
      [0m[34;1m"ip_address"[0m[1;

## Response `DataFrame`

In [4]:
from spark_surveymonkey import transform_survey

# Get the active Spark session, or create one with the app name 'example'.
spark = (
    SparkSession.builder
    .appName('example')
    .getOrCreate()
)

# Input locations for this survey: the details file and the directory of
# response JSON files listed in the previous section.
survey_details = f'data/{SURVEY_ID}/details.json'
survey_responses = f'data/{SURVEY_ID}/responses/'

# One call turns the raw responses plus the survey details into a DataFrame.
df = transform_survey(spark, survey_responses, survey_details)

# Preview the rows, then list every column name (the table is too wide to read them all).
df.show()
pprint(df.columns)

+-----------+------------+---------------+---------------+------------+----------+---------+-------------+--------------+--------+---------+------------+---------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+---------------------+------------------------------------------+-------------------------------------+-----------------------------------+-------------------------------------+-----------------------------------------+-----------------------------------------------+---------------------------------------+---------------------------------------------+---------------------------------------------------------+-------------------------------------------------------------+-----------------------------------------------------------+----------------------------------------------------+----------------------------------------+----------------------------------------+----------------------------------------+------------

### Step-by-Step Transformations

In [5]:
from spark_surveymonkey import _transform

# Walk through the individual stages one at a time, showing the DataFrame after
# each. The input paths are the `survey_responses` / `survey_details` variables
# defined in the previous cell, so both cells always read the same files.
# Each stage gets its own name so it can still be inspected after the cell has
# run; the final wide result is bound to `df`, as before.

print("Flatten JSON -> DataFrame, with expanded nodes for pages, questions, answers")
flat_df = _transform.flatten(spark, survey_responses)
flat_df.show(10)

print("Interpret questions/answers using details.json and infer structure")
interpreted_df = _transform.interpret(flat_df, survey_details)
interpreted_df.show(10)

print("Pivot to wide DataFrame (1 row per respondent)")
df = _transform.pivot(interpreted_df)
df.show()

Flatten JSON -> DataFrame, with expanded nodes for pages, questions, answers
+-----------+------------+---------------+---------------+------------+----------+---------+-------------+--------------+--------+---------+------------+---------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+---------+-----------+----------+----------+------+----------+--------+--------------------+
|response_id|recipient_id|collection_mode|response_status|custom_value|first_name|last_name|email_address|    ip_address|metadata|page_path|collector_id|survey_id|            edit_url|         analyze_url|total_time|       date_modified|        date_created|                href|  page_id|question_id| choice_id|    row_id|col_id|  other_id|tag_data|                text|
+-----------+------------+---------------+---------------+------------+----------+---------+-------------+--------------+--------+---------+------------+---------+----------------

+-----------+------------+---------------+---------------+------------+----------+---------+-------------+--------------+--------+---------+------------+---------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+---------------------+------------------------------------------+-------------------------------------+-----------------------------------+-------------------------------------+-----------------------------------------+-----------------------------------------------+---------------------------------------+---------------------------------------------+---------------------------------------------------------+-------------------------------------------------------------+-----------------------------------------------------------+----------------------------------------------------+----------------------------------------+----------------------------------------+----------------------------------------+------------