In [None]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell is executed so edits to local .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pprint import pprint

import jq
import requests
from pyspark.sql import SparkSession, functions as F
from IPython.display import HTML

# Inject a notebook-wide CSS rule forcing <pre> blocks to keep their whitespace
# and not wrap (presumably so wide `.show()` tables scroll horizontally instead
# of wrapping — confirm).
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [None]:
import os
import dotenv

# Load variables from a local .env file (if any) into the process environment,
# then read the SurveyMonkey API token from it.
dotenv.load_dotenv()
TOKEN = os.getenv("ACCESS_TOKEN")
if not TOKEN:
    # Fail here with a clear message instead of sending "Bearer None" and
    # getting an opaque authorization error from the API.
    raise RuntimeError(
        "ACCESS_TOKEN is not set; define it in the environment or in a .env file"
    )

headers = {
    'Authorization': f'Bearer {TOKEN}',
    'Content-Type': 'application/json'
}
# requests applies no timeout by default, so set one to keep the cell from
# hanging forever on a stalled connection.
response = requests.get('https://api.surveymonkey.com/v3/surveys',
                        headers=headers,
                        timeout=30)
# Surface HTTP errors (bad token, rate limit, ...) as an HTTPError here rather
# than as a confusing jq failure on an error payload below.
response.raise_for_status()

# Build an {id: title} mapping from the `data` array of the response.
# NOTE(review): only this single response is read; if the API paginates the
# survey list, surveys on later pages are not included — confirm.
surveys = (
    jq
    .compile('.data | map({name: .id, value: .title}) | from_entries')
    .input(response.json())
    .all()[0]
)
# Print the mapping ordered by survey id.
pprint(dict(sorted(surveys.items())))

In [None]:
SURVEY_ID = None

!python scripts/get_survey.py --endpoint details --output data/$SURVEY_ID/details.json $SURVEY_ID
!python scripts/get_survey.py --endpoint responses --output data/$SURVEY_ID/responses.json $SURVEY_ID

In [None]:
# Print the directory tree under data/ (e.g. to confirm the JSON files written
# by the download commands are present). Requires the `tree` shell utility.
!tree data

In [None]:
from spark_surveymonkey import _transform

# Get the active Spark session, or create one named 'test' if none exists.
spark = (
    SparkSession.builder
    .appName('test')
    .getOrCreate()
)

# Input files for the currently selected survey.
responses_path = f'data/{SURVEY_ID}/responses.json'
details_path = f'data/{SURVEY_ID}/details.json'

# Run the three transform stages in order; each stage consumes the DataFrame
# produced by the previous one.
df_flatten = _transform.flatten(spark, responses_path)
df_interpret = _transform.interpret(df_flatten, details_path)
df_pivot = _transform.pivot(df_interpret)


In [None]:
# Display up to 100 rows of the interpreted responses for manual inspection.
# This reuses `df_interpret` from the pipeline cell rather than calling
# `_transform.interpret` a second time on the same inputs, so the rows shown
# here are the ones the pivot stage was built from.
(
    df_interpret
    # Order rows by response, then page and question position; within a
    # question, order by the first non-null of the choice/row/other ids.
    .sort(
        'response_id',
        'page_idx',
        'question_idx',
        F.coalesce('choice_id', 'row_id', 'other_id')
    )
    # Keep only the first 50 characters of the text columns so the printed
    # table stays readable.
    .withColumn('heading', F.col('heading').substr(1, 50))
    .withColumn('row', F.col('row').substr(1, 50))
    .select(
        'response_id',
        'response_status',
        'page_idx',
        'question_idx',
        'question_id',
        'family',
        'heading',
        'row',
        'column',
        'value',
    )
    # Sanity check: uncomment the filter to list rows whose value is null;
    # it should return 0 rows.
    # .filter(F.col('value').isNull())
    .show(n=100, truncate=False)
)


In [None]:
# Display up to 100 pivoted rows ordered by response id, then list the column
# names. Both views read the same `df_pivot` built in the pipeline cell, so the
# table and the column list always describe one DataFrame and the pivot
# transform is not run a second time.
(
    df_pivot
    .sort('response_id')
    .show(n=100, truncate=False)
)
pprint(df_pivot.columns)