Skip to content

Commit

Permalink
Fixes and simplification on Jupyter notebooks and README improvements
Browse files (browse the repository at this point in the history)
  • Loading branch information
gabrielpm-cit committed May 30, 2017
1 parent 0b9a2cf commit 5854ff5
Show file tree
Hide file tree
Showing 15 changed files with 405 additions and 7,957,989 deletions.
173 changes: 96 additions & 77 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dataflow_preprocess.py
Expand Up @@ -155,7 +155,7 @@ def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
train_coder = coders.ExampleProtoCoder(train_metadata.schema)
_ = (train_dataset
| 'SerializeTrainExamples' >> beam.Map(train_coder.encode)
| 'ShuffleTraining' >> _Shuffle() # pylint: disable=no-value-for-parameter
#| 'ShuffleTraining' >> _Shuffle() # pylint: disable=no-value-for-parameter
| 'WriteTraining'
>> beam.io.WriteToTFRecord(
os.path.join(output_dir,
Expand Down
Expand Up @@ -2,14 +2,26 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#OUTPUT_BUCKET_FOLDER = \"gs://<GCS_BUCKET_NAME>/outbrain-click-prediction/output/\"\n",
"#DATA_BUCKET_FOLDER = \"gs://<GCS_BUCKET_NAME>/outbrain-click-prediction/data/\""
"OUTPUT_BUCKET_FOLDER = \"gs://<GCS_BUCKET_NAME>/outbrain-click-prediction/output/\"\n",
"DATA_BUCKET_FOLDER = \"gs://<GCS_BUCKET_NAME>/outbrain-click-prediction/data/\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from pyspark.sql.types import *\n",
"import pyspark.sql.functions as F"
]
},
{
Expand All @@ -21,7 +33,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {
"collapsed": true
},
Expand All @@ -32,7 +44,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {
"collapsed": true
},
Expand All @@ -55,7 +67,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {
"collapsed": true
},
Expand All @@ -75,7 +87,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {
"collapsed": true
},
Expand All @@ -94,21 +106,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"clicks_train_joined_df = clicks_train_df \\\n",
" .join(promoted_content_df, on='ad_id', how='left') \\\n",
" .join(events_joined_df, on='display_id', how='left') \n",
" .join(events_df, on='display_id', how='left') \n",
"clicks_train_joined_df.createOrReplaceTempView('clicks_train_joined')"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {
"collapsed": true
},
Expand All @@ -123,7 +135,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {
"collapsed": true
},
Expand All @@ -137,7 +149,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {
"collapsed": true
},
Expand All @@ -146,13 +158,49 @@
"validation_set_gcs_output = \"validation_set.parquet\"\n",
"validation_set_df.write.parquet(OUTPUT_BUCKET_FOLDER+validation_set_gcs_output, mode='overwrite')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Row(display_id=2122, ad_id=36619, uuid_event='7ceed8e24a87d7', day_event=0, timestamp_event=148795, document_id_promo=899906, platform_event=3, geo_location_event='SG>00'),\n",
" Row(display_id=2122, ad_id=81643, uuid_event='7ceed8e24a87d7', day_event=0, timestamp_event=148795, document_id_promo=1094108, platform_event=3, geo_location_event='SG>00'),\n",
" Row(display_id=2122, ad_id=216100, uuid_event='7ceed8e24a87d7', day_event=0, timestamp_event=148795, document_id_promo=1548042, platform_event=3, geo_location_event='SG>00'),\n",
" Row(display_id=2659, ad_id=55819, uuid_event='964e40766c3f39', day_event=0, timestamp_event=185389, document_id_promo=986576, platform_event=3, geo_location_event='CA>BC'),\n",
" Row(display_id=2659, ad_id=76816, uuid_event='964e40766c3f39', day_event=0, timestamp_event=185389, document_id_promo=824972, platform_event=3, geo_location_event='CA>BC')]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"validation_set_df.take(5)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "PySpark",
"language": "python",
"name": "pyspark"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
Expand Down

0 comments on commit 5854ff5

Please sign in to comment.