From 166a482035fdd7a5573c7574119ee4e311622743 Mon Sep 17 00:00:00 2001 From: matt winkler Date: Thu, 21 Jun 2012 13:45:17 -0700 Subject: [PATCH] Updates to .pig file and readme's --- Pig/computeAirportDelays.pig | 11 ++++++----- README.md | 12 +++++++++++- basicStreaming/readme.md | 23 +++++++++++++++++++++++ 3 files changed, 40 insertions(+), 6 deletions(-) create mode 100644 basicStreaming/readme.md diff --git a/Pig/computeAirportDelays.pig b/Pig/computeAirportDelays.pig index ea802bd..e19c217 100644 --- a/Pig/computeAirportDelays.pig +++ b/Pig/computeAirportDelays.pig @@ -1,7 +1,7 @@ -- here is a basic pig script to read the data -- out of the airport file and compute average delay and then order them -flights = LOAD 'fixed_flights' USING PigStorage(',') AS (arrDelayMinutes, carrier, dayOfMonth, depDelayMinutes, dest, flightDate, month, origin, rowId, year); +flights = LOAD 'fixed_flights' USING PigStorage(',') AS (arrDelayMinutes:int, carrier, dayOfMonth, depDelayMinutes, dest, flightDate, month, origin, rowId, year); interestingData = FOREACH flights GENERATE dest, arrDelayMinutes; @@ -11,12 +11,13 @@ destinationGroup = GROUP longDelays BY (dest); averages = FOREACH destinationGroup GENERATE group, COUNT(longDelays) as numberOfFlights, AVG(longDelays.arrDelayMinutes) as delay; -busyAirports = FILTER averages BY numberOfFlights > 1000; +busyAirports = FILTER averages BY numberOfFlights > 5000; -orderedDelays = ORDER busyAirports BY delay DESC; +--orderedDelays = ORDER busyAirports BY delay DESC; ---top10 = LIMIT orderedDelays 10; +--STORE orderedDelays INTO 'pigAverageDelays' USING PigStorage(); -STORE orderedDelays INTO 'top10PigLongDelays' USING PigStorage(); + +STORE busyAirports INTO 'pigBusyLongDelays' USING PigStorage(); \ No newline at end of file diff --git a/README.md b/README.md index 275ce2b..6a59d0d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,14 @@ TEE2012_HadoopDemos =================== -TEE2012_HadoopDemos \ No newline at end of file +This 
is a set of demos used at TechEd Europe 2012 + +Data Set assumptions +=================== + +The bulk of these demos operate on a set of flight delay data, originally obtained from the Azure DataMarket (available here: https://datamarket.azure.com/dataset/e29b7fb9-3d2e-4f35-8088-c97dbd75cd1f) + +We expect the following comma-separated schema for these demo jobs: + + ArrDelayMinutes Carrier DayofMonth DepDelayMinutes Dest FlightDate Month Origin RowId Year + diff --git a/basicStreaming/readme.md b/basicStreaming/readme.md new file mode 100644 index 0000000..54f2b2f --- /dev/null +++ b/basicStreaming/readme.md @@ -0,0 +1,23 @@ +Basic Streaming +====================== + +This is a sample of basic MapReduce-based streaming using the streaming jar. + +Streaming i + +Build Instructions +====================== +Build projects in VS + +or + + msbuild basicNetStreaming.sln + +should do the trick + +Execution Instructions +====================== +From a Hadoop command prompt (note: this assumes that %HADOOP_HOME% is defined, the streaming jar has been built, and full paths to the executables are used): + + hadoop jar %HADOOP_HOME%\lib\hadoop-streaming.jar -mapper f:\dev\src\csharp\hadoopdebugger\GenerateAirlineKeyMapper\bin\Debug\GenerateAirlineKeyMapper.exe -reducer f:\dev\src\csharp\hadoopdebugger\AirlineFlightCountReducer\bin\Debug\AirlineFlightCountReducer.exe -input fixed_flights -output test_streaming + \ No newline at end of file