-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
11 changed files
with
303 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
3 changes: 3 additions & 0 deletions
3
hbase/day_2/MyFoodapediaData/Foods_Needing_Condiments_Table.xml
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
<Food_Display_Row> | ||
<Food_Code>12350000</Food_Code> | ||
<Display_Name>Sour cream dip</Display_Name> | ||
<Portion_Default>1.00000</Portion_Default> | ||
<Portion_Amount>.25000</Portion_Amount> | ||
<Portion_Display_Name>cup </Portion_Display_Name> | ||
<Factor>.25000</Factor> | ||
<Increment>.25000</Increment> | ||
<Multiplier>1.00000</Multiplier> | ||
<Grains>.04799</Grains> | ||
<Whole_Grains>.00000</Whole_Grains> | ||
<Vegetables>.04070</Vegetables> | ||
<Orange_Vegetables>.00000</Orange_Vegetables> | ||
<Drkgreen_Vegetables>.00000</Drkgreen_Vegetables> | ||
<Starchy_vegetables>.00000</Starchy_vegetables> | ||
<Other_Vegetables>.04070</Other_Vegetables> | ||
<Fruits>.00000</Fruits> | ||
<Milk>.00000</Milk> | ||
<Meats>.00000</Meats> | ||
<Soy>.00000</Soy> | ||
<Drybeans_Peas>.00000</Drybeans_Peas> | ||
<Oils>.00000</Oils> | ||
<Solid_Fats>105.64850</Solid_Fats> | ||
<Added_Sugars>1.57001</Added_Sugars> | ||
<Alcohol>.00000</Alcohol> | ||
<Calories>133.65000</Calories> | ||
<Saturated_Fats>7.36898</Saturated_Fats> | ||
</Food_Display_Row> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
#--- | ||
# Excerpted from "Seven Databases in Seven Weeks", | ||
# published by The Pragmatic Bookshelf. | ||
# Copyrights apply to this code. It may not be used to create training material, | ||
# courses, books, articles, and the like. Contact us if you are in doubt. | ||
# We make no guarantees that this code is fit for any purpose. | ||
# Visit http://www.pragmaticprogrammer.com/titles/rwdata for more book information. | ||
#--- | ||
include java | ||
import 'org.apache.hadoop.hbase.client.HTable' | ||
import 'org.apache.hadoop.hbase.client.Put' | ||
import 'org.apache.hadoop.hbase.client.Scan' | ||
import 'org.apache.hadoop.hbase.util.Bytes' | ||
import 'org.apache.hadoop.hbase.HBaseConfiguration' | ||
|
||
# Converts every argument to a Java byte array so the result can be splatted
# straight into HBase client calls (for example Put.new or Put#add).
#
# args - any number of objects. Each one is stringified with #to_s before
#        conversion, so nil yields an empty byte array.
#
# Returns an Array holding one byte array per argument, in argument order.
def jbytes( *args )
  args.map { |arg| arg.to_s.to_java_bytes }
end
|
||
# Scans every row of the 'wiki' table, extracts [[target|label]] style links
# from the 'text:' column and records them in the 'links' table:
#   * row = linking page title, column 'to:<target>'  => label
#   * row = link target,        column 'from:<title>' => label
puts( @hbase )
conf = HBaseConfiguration.new
wiki_table = HTable.new( conf, "wiki" )
links_table = HTable.new( conf, 'links' )
links_table.setAutoFlush( false ) # puts are buffered and flushed explicitly below

scanner = wiki_table.getScanner( Scan.new ) # (1)

# Capture 1 is the link target, optional capture 2 is the label after '|'.
linkpattern = /\[\[([^\[\]\|\:\#][^\[\]\|:]*)(?:\|([^\[\]\|]+))?\]\]/
count = 0

begin
  while (result = scanner.next())
    title = Bytes.toString( result.getRow() ) # (2)
    text = Bytes.toString( result.getValue( *jbytes( 'text', '' ) ) )

    if text
      put_to = nil
      text.scan(linkpattern) do |target, label| # (3)
        # NOTE(review): capitalize also lowercases everything after the first
        # character ("New York" -> "New york"); confirm that is intended
        # before relying on these keys to match 'wiki' row keys.
        target = target.strip.capitalize
        # A link such as "[[ ]]" strips down to nothing; an empty string is
        # not a usable row key, so skip it.
        next if target.empty?

        label = ( label || '' ).strip

        # Create the outgoing-links Put lazily so pages without any usable
        # link never produce an empty Put.
        unless put_to
          put_to = Put.new( *jbytes( title ) )
          put_to.setWriteToWAL( false )
        end

        put_to.add( *jbytes( "to", target, label ) )
        put_from = Put.new( *jbytes( target ) )
        put_from.add( *jbytes( "from", title, label ) )
        put_from.setWriteToWAL( false )
        links_table.put( put_from ) # (4)
      end
      links_table.put( put_to ) if put_to # (5)
      links_table.flushCommits()
    end

    count += 1
    puts "#{count} pages processed (#{title})" if count % 500 == 0
  end
ensure
  # Release the scanner even when a put or flush raises.
  scanner.close()
end

links_table.flushCommits()
exit
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
create 'foods' , 'facts' | ||
|
||
curl file:///home/eyalgo/seven-dbs-in-seven-weeks/hbase/day_2/food-display-example.xml | cat | /opt/hbase/hbase-0.94.18/bin/hbase shell /home/eyalgo/seven-dbs-in-seven-weeks/hbase/day_2/import_food_display.rb | ||
|
||
|
||
curl file:///home/eyalgo/seven-dbs-in-seven-weeks/hbase/day_2/MyFoodapediaData/Food_Display_Table.xml | cat | /opt/hbase/hbase-0.94.18/bin/hbase shell /home/eyalgo/seven-dbs-in-seven-weeks/hbase/day_2/import_food_display.rb | ||
|
||
|
||
|
||
% Total % Received % Xferd Average Speed Time Time Time Current | ||
Dload Upload Total Spent Left Speed | ||
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0#<Hbase::Hbase:0x1ee96f4> | ||
31 1795k 31 559k 0 0 37500 0 0:00:49 0:00:15 0:00:34 145k500 records inserted (Twix cookie bars) | ||
52 1795k 52 943k 0 0 56157 0 0:00:32 0:00:17 0:00:15 168k1000 records inserted (Honeydew melon (raw)) | ||
66 1795k 66 1199k 0 0 68018 0 0:00:27 0:00:18 0:00:09 195k1500 records inserted (Beef sirloin frozen meal) | ||
100 1795k 100 1795k 0 0 92349 0 0:00:19 0:00:19 --:--:-- 267k | ||
2000 records inserted (Fruity Pebbles cereal) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# bloom filter | ||
http://billmill.org/bloomfilter-tutorial/ | ||
http://en.wikipedia.org/wiki/Bloom_filter | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
include Java | ||
import 'org.apache.hadoop.hbase.client.HTable' | ||
import 'org.apache.hadoop.hbase.client.Put' | ||
import 'org.apache.hadoop.hbase.HBaseConfiguration' | ||
import 'javax.xml.stream.XMLStreamConstants' | ||
|
||
# Turns each argument into a Java byte array, ready to be splatted into
# HBase client calls such as Put#add( family, qualifier, value ).
#
# args - zero or more objects; #to_s is applied first, so a nil value is
#        written as an empty byte array rather than raising.
#
# Returns an Array of byte arrays in the same order as the arguments.
def jbytes( *args )
  args.map { |arg| arg.to_s.to_java_bytes }
end
|
||
# Streams an XML document from standard input and stores every
# <Food_Display_Row> element as one row of the HBase 'foods' table.
# The row key is the row's Display_Name; each captured child element becomes
# a column 'facts:<element name>' holding the element's text.
factory = javax.xml.stream.XMLInputFactory.newInstance
reader = factory.createXMLStreamReader(java.lang.System.in)

# Child elements of <Food_Display_Row> that are captured, in the order their
# columns are added to the Put. Names are matched exactly.
fields = %w[
  Display_Name Portion_Default Portion_Amount Portion_Display_Name Factor
  Increment Multiplier Grains Whole_Grains Vegetables Orange_Vegetables
  Drkgreen_Vegetables Starchy_vegetables Other_Vegetables Fruits Milk Meats
  Drybeans_Peas Soy Oils Solid_Fats Added_Sugars Alcohol Calories
  Saturated_Fats
]

document = nil # values of the row currently being read
buffer = nil   # text chunks of the field currently being read, nil outside a field
count = 0

puts( @hbase )
conf = HBaseConfiguration.new
table = HTable.new( conf, "foods" )
table.setAutoFlush( false ) # puts are buffered; flushed every 10 rows below

while reader.has_next
  type = reader.next

  if type == XMLStreamConstants::START_ELEMENT # (3)
    name = reader.local_name
    if name == 'Food_Display_Row'
      document = {}
    elsif fields.include?( name )
      buffer = []
    end

  elsif type == XMLStreamConstants::CHARACTERS
    # The parser may deliver one element's text in several chunks.
    buffer << reader.text unless buffer.nil?

  elsif type == XMLStreamConstants::END_ELEMENT
    name = reader.local_name
    if fields.include?( name )
      document[name] = buffer.join if document && buffer
      # Stop collecting so text outside captured fields is not accumulated.
      buffer = nil

    elsif name == 'Food_Display_Row'
      key = document['Display_Name'].to_java_bytes

      put = Put.new( key )
      fields.each do |field|
        # A field missing from this row is stored as an empty value.
        put.add( *jbytes( "facts", field, document[field] ) )
      end

      table.put( put )

      count += 1
      table.flushCommits() if count % 10 == 0
      if count % 500 == 0
        puts "#{count} records inserted (#{document['Display_Name']})"
      end
    end
  end
end

table.flushCommits()
exit
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
#--- | ||
# Excerpted from "Seven Databases in Seven Weeks", | ||
# published by The Pragmatic Bookshelf. | ||
# Copyrights apply to this code. It may not be used to create training material, | ||
# courses, books, articles, and the like. Contact us if you are in doubt. | ||
# We make no guarantees that this code is fit for any purpose. | ||
# Visit http://www.pragmaticprogrammer.com/titles/rwdata for more book information. | ||
#--- | ||
|
||
require 'time' | ||
|
||
include Java | ||
import 'org.apache.hadoop.hbase.client.HTable' | ||
import 'org.apache.hadoop.hbase.client.Put' | ||
import 'org.apache.hadoop.hbase.HBaseConfiguration' | ||
import 'javax.xml.stream.XMLStreamConstants' | ||
|
||
# Maps each argument to a Java byte array for use with the HBase client API;
# the result is meant to be splatted, e.g. Put#add( *jbytes( fam, qual, val ) ).
#
# args - zero or more objects. Each is converted with #to_s first, so a
#        missing (nil) value becomes an empty byte array.
#
# Returns an Array of byte arrays, one per argument, order preserved.
def jbytes( *args )
  args.map { |arg| arg.to_s.to_java_bytes }
end
|
||
# Streams a MediaWiki XML dump from standard input into the HBase 'wiki'
# table. Every <revision> becomes one Put keyed by the page title, with
# columns 'text:', 'revision:author' and 'revision:comment', versioned by the
# revision's timestamp.
factory = javax.xml.stream.XMLInputFactory.newInstance
reader = factory.createXMLStreamReader(java.lang.System.in)

# Elements whose text is captured into the current document (exact names).
fields = %w[title timestamp username comment text]

document = nil # (1)
buffer = nil   # text chunks of the field being read, nil outside a field
count = 0

# table = HTable.new( @hbase.configuration, 'wiki' )
puts( @hbase )
conf = HBaseConfiguration.new
table = HTable.new( conf, "wiki" )
table.setAutoFlush( false ) # (2)

while reader.has_next
  type = reader.next

  if type == XMLStreamConstants::START_ELEMENT # (3)
    name = reader.local_name
    if name == 'page'
      document = {}
    elsif fields.include?( name )
      buffer = []
    end

  elsif type == XMLStreamConstants::CHARACTERS # (4)
    # The parser may deliver one element's text in several chunks.
    buffer << reader.text unless buffer.nil?

  elsif type == XMLStreamConstants::END_ELEMENT # (5)
    name = reader.local_name
    if fields.include?( name )
      document[name] = buffer.join if document && buffer
      # Stop collecting so text outside captured fields is not accumulated.
      buffer = nil

    elsif name == 'revision'
      key = document['title'].to_java_bytes
      # NOTE(review): Time#to_i is epoch seconds; HBase cell timestamps are
      # conventionally milliseconds - confirm before changing, since rows
      # already imported use this value.
      ts = Time.parse( document['timestamp'] ).to_i

      put = Put.new( key, ts )
      put.add( *jbytes( "text", "", document['text'] ) )
      put.add( *jbytes( "revision", "author", document['username'] ) )
      put.add( *jbytes( "revision", "comment", document['comment'] ) )
      table.put( put )

      count += 1
      table.flushCommits() if count % 10 == 0
      if count % 500 == 0
        puts "#{count} records inserted (#{document['title']})"
      end
    end
  end
end

table.flushCommits()
exit
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 | bzcat | /opt/hbase/hbase-0.94.18/bin/hbase shell import_from_wikipedia.rb | ||
|
||
create 'links', {NAME => 'to', VERSIONS => 1, BLOOMFILTER => 'ROWCOL'},{NAME => 'from', VERSIONS => 1, BLOOMFILTER => 'ROWCOL'} | ||
|
||
|
||
count 'wiki', INTERVAL => 100000, CACHE => 10000 | ||
|