In [1]:
import pandas as pd
import pyarrow as pa
from pyarrow import parquet as pq
from s3fs.core import S3FileSystem
import configparser
import plotly.graph_objs as go
import os

In [2]:
# Parse the local Spark/S3 settings file that lives one directory above the notebook.
# The file is opened in a `with` block so the handle is closed as soon as parsing
# finishes instead of staying open for the lifetime of the kernel.
config = configparser.ConfigParser()
config_path = os.path.join('..', 'local.sparkconf.cfg')
with open(config_path) as config_file:
    config.read_file(config_file)

In [3]:
# Pull the S3 connection settings out of the [S3] section of the parsed config.
s3_settings = config['S3']
S3_ENDPOINT = s3_settings['spark.hadoop.fs.s3a.endpoint']
S3_ACCESS_KEY = s3_settings['spark.hadoop.fs.s3a.access.key']
S3_SECRET_KEY = s3_settings['spark.hadoop.fs.s3a.secret.key']

# Options forwarded to the underlying S3 client.
# NOTE(review): 'verify': False turns off TLS certificate verification for the
# endpoint -- confirm this is only used against a trusted/local endpoint.
s3_client_options = {
    'endpoint_url': S3_ENDPOINT,
    'verify': False,
}

# Filesystem handle used by the cells below to list and read parquet data.
s3_fs = S3FileSystem(
    key=S3_ACCESS_KEY,
    secret=S3_SECRET_KEY,
    client_kwargs=s3_client_options,
)

In [4]:
# List the .parquet files sitting one directory level below users.parquet,
# then load only the two columns needed for the gender breakdown.
USERS_GLOB = 's3://dutrajardim/udacity-dl-project/users.parquet/*/*.parquet'
paths = s3_fs.glob(USERS_GLOB)
users_table = pq.read_table(
    paths,
    filesystem=s3_fs,
    columns=['gender', 'user_id'],
)

In [5]:
# Convert the Arrow table to pandas, then count the non-null user_id values
# per gender. `data` keeps only the raw counts (the gender index is dropped).
df = users_table.to_pandas()
user_id_counts = df.groupby('gender')['user_id'].count()
data = user_id_counts.values

In [6]:
# Bar chart of the per-gender user_id counts.
# The counts are taken as a Series (not a bare array) so its index -- the gender
# values -- can be used as the x axis; without `x`, the bars are only labelled
# by position (0, 1, ...) and the chart cannot be read on its own.
gender_counts = df.groupby('gender')['user_id'].count()
fig = go.Figure(
    data=go.Bar(x=gender_counts.index, y=gender_counts.values)
)
# Title and axis labels so the figure stands alone when the notebook is skimmed.
fig.update_layout(
    title='User count by gender',
    xaxis_title='gender',
    yaxis_title='count of user_id',
)
fig.show()

In [5]:
# Open the songplays parquet dataset through the S3 filesystem handle and
# display its schema as the cell output.
SONGPLAYS_PATH = 's3://dutrajardim/udacity-dl-project/songplays.parquet'
songplays_dataset = pq.ParquetDataset(SONGPLAYS_PATH, filesystem=s3_fs)
songplays_dataset.schema

<pyarrow._parquet.ParquetSchema object at 0x7f88506c4740>
required group field_id=-1 spark_schema {
  optional int96 field_id=-1 start_time;
  optional int32 field_id=-1 user_id;
  optional binary field_id=-1 level (String);
  optional binary field_id=-1 song_id (String);
  optional binary field_id=-1 artist_id (String);
  optional int32 field_id=-1 session_id;
  optional binary field_id=-1 location (String);
  optional binary field_id=-1 user_agent (String);
}

In [17]:
# Read only the user_id and level columns of the songplays dataset and convert
# the result to a pandas DataFrame.
# NOTE(review): the rendered output also shows `year` and `month` columns --
# presumably the dataset's partition keys; confirm against the storage layout.
songplays_table = songplays_dataset.read(columns=['user_id', 'level'])
df_songplays = songplays_table.to_pandas()
df_songplays

Unnamed: 0,user_id,level,year,month
0,26.0,free,2018,11
1,26.0,free,2018,11
2,26.0,free,2018,11
3,9.0,free,2018,11
4,12.0,free,2018,11
...,...,...,...,...
8051,10.0,free,2018,11
8052,26.0,free,2018,11
8053,26.0,free,2018,11
8054,26.0,free,2018,11
