From 166a482035fdd7a5573c7574119ee4e311622743 Mon Sep 17 00:00:00 2001
From: matt winkler <mwinkle@microsoft.com>
Date: Thu, 21 Jun 2012 13:45:17 -0700
Subject: [PATCH] Updates to .pig file and readme's

---
 Pig/computeAirportDelays.pig | 11 ++++++-----
 README.md                    | 12 +++++++++++-
 basicStreaming/readme.md     | 23 +++++++++++++++++++++++
 3 files changed, 40 insertions(+), 6 deletions(-)
 create mode 100644 basicStreaming/readme.md

diff --git a/Pig/computeAirportDelays.pig b/Pig/computeAirportDelays.pig
index ea802bd..e19c217 100644
--- a/Pig/computeAirportDelays.pig
+++ b/Pig/computeAirportDelays.pig
@@ -1,7 +1,7 @@
 -- here is a basic pig script to read the data
 -- out of the airport file and compute average delay and then order them
 
-flights = LOAD 'fixed_flights' USING PigStorage(',') AS (arrDelayMinutes, carrier, dayOfMonth, depDelayMinutes, dest, flightDate, month, origin, rowId, year);
+flights = LOAD 'fixed_flights' USING PigStorage(',') AS (arrDelayMinutes:int, carrier, dayOfMonth, depDelayMinutes, dest, flightDate, month, origin, rowId, year);
 
 interestingData = FOREACH flights GENERATE dest, arrDelayMinutes;
 
@@ -11,12 +11,13 @@ destinationGroup = GROUP longDelays BY (dest);
 
 averages = FOREACH destinationGroup GENERATE group, COUNT(longDelays) as numberOfFlights, AVG(longDelays.arrDelayMinutes) as delay; 
 
-busyAirports = FILTER averages BY numberOfFlights > 1000;
+busyAirports = FILTER averages BY numberOfFlights > 5000;
 
-orderedDelays = ORDER busyAirports BY delay DESC; 
+--orderedDelays = ORDER busyAirports BY delay DESC; 
 
---top10 = LIMIT orderedDelays 10;
+--STORE orderedDelays INTO 'pigAverageDelays' USING PigStorage();
 
-STORE orderedDelays INTO 'top10PigLongDelays' USING PigStorage(); 
+
+STORE busyAirports INTO 'pigBusyLongDelays' USING PigStorage(); 
 
  
\ No newline at end of file
diff --git a/README.md b/README.md
index 275ce2b..6a59d0d 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,14 @@
 TEE2012_HadoopDemos
 ===================
 
-TEE2012_HadoopDemos
\ No newline at end of file
+This is a set of demos used at TechEd Europe 2012.
+
+Data Set assumptions
+===================
+
+The bulk of these demos operate on a set of flight delay information originally obtained from the Azure DataMarket, available here: https://datamarket.azure.com/dataset/e29b7fb9-3d2e-4f35-8088-c97dbd75cd1f
+
+We expect the following comma-separated schema for these demo jobs:
+
+	ArrDelayMinutes,Carrier,DayofMonth,DepDelayMinutes,Dest,FlightDate,Month,Origin,RowId,Year
+
diff --git a/basicStreaming/readme.md b/basicStreaming/readme.md
new file mode 100644
index 0000000..54f2b2f
--- /dev/null
+++ b/basicStreaming/readme.md
@@ -0,0 +1,23 @@
+Basic Streaming
+======================
+
+This is a sample of basic MapReduce streaming using the Hadoop streaming jar.
+
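+Streaming lets the mapper and reducer be any executables that read their input from stdin and write their output to stdout.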
+
+Build Instructions
+======================
+Build the projects in Visual Studio
+
+or
+
+	msbuild basicNetStreaming.sln 
+
+should do the trick
+
+Execution Instructions
+======================
+From a Hadoop command prompt (note: this assumes that %HADOOP_HOME% is defined, the streaming jar has been built, and full paths to the executables are supplied):
+
+	hadoop jar %HADOOP_HOME%\lib\hadoop-streaming.jar -mapper f:\dev\src\csharp\hadoopdebugger\GenerateAirlineKeyMapper\bin\Debug\GenerateAirlineKeyMapper.exe -reducer f:\dev\src\csharp\hadoopdebugger\AirlineFlightCountReducer\bin\Debug\AirlineFlightCountReducer.exe -input fixed_flights -output test_streaming
+	
\ No newline at end of file