Skip to content

Commit

Permalink
hbase day 2 finished
Browse files Browse the repository at this point in the history
  • Loading branch information
eyalgo committed May 12, 2014
1 parent e89452f commit 2d527f0
Show file tree
Hide file tree
Showing 11 changed files with 303 additions and 7 deletions.
7 changes: 0 additions & 7 deletions hbase/day_1/material
Original file line number Diff line number Diff line change
@@ -1,13 +1,6 @@
# install tutorial
http://tutorialforlinux.com/2014/03/18/how-to-getting-started-with-apache-hbase-on-fedora-19-20-21-3264bit-linux-easy-guide/

# links and shortcuts
# need to do it differently
# sudo ln -sf /opt/hbase/hbase-0.94.18 /opt/hbase-latest
# sudo ln -s /opt/hbase-latest/bin/start-hbase.sh /usr/local/bin/hbase-start
# sudo ln -s /opt/hbase-latest/bin/stop-hbase.sh /usr/local/bin/hbase-stop
# sudo ln -s /opt/hbase-latest/bin/hbase /usr/local/bin/hbase

# running a jruby script
# The script is in the current directory
/opt/hbase-latest/bin/hbase org.jruby.Main put_multiple_columns.rb
Expand Down
3 changes: 3 additions & 0 deletions hbase/day_2/MyFoodapediaData/Food_Display_Table.xml

Large diffs are not rendered by default.

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions hbase/day_2/MyFoodapediaData/lu_Condiment_Food_Table.xml

Large diffs are not rendered by default.

28 changes: 28 additions & 0 deletions hbase/day_2/food-display-example.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
<Food_Display_Row>
<Food_Code>12350000</Food_Code>
<Display_Name>Sour cream dip</Display_Name>
<Portion_Default>1.00000</Portion_Default>
<Portion_Amount>.25000</Portion_Amount>
<Portion_Display_Name>cup </Portion_Display_Name>
<Factor>.25000</Factor>
<Increment>.25000</Increment>
<Multiplier>1.00000</Multiplier>
<Grains>.04799</Grains>
<Whole_Grains>.00000</Whole_Grains>
<Vegetables>.04070</Vegetables>
<Orange_Vegetables>.00000</Orange_Vegetables>
<Drkgreen_Vegetables>.00000</Drkgreen_Vegetables>
<Starchy_vegetables>.00000</Starchy_vegetables>
<Other_Vegetables>.04070</Other_Vegetables>
<Fruits>.00000</Fruits>
<Milk>.00000</Milk>
<Meats>.00000</Meats>
<Soy>.00000</Soy>
<Drybeans_Peas>.00000</Drybeans_Peas>
<Oils>.00000</Oils>
<Solid_Fats>105.64850</Solid_Fats>
<Added_Sugars>1.57001</Added_Sugars>
<Alcohol>.00000</Alcohol>
<Calories>133.65000</Calories>
<Saturated_Fats>7.36898</Saturated_Fats>
</Food_Display_Row>
66 changes: 66 additions & 0 deletions hbase/day_2/generate_wiki_links.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#---
# Excerpted from "Seven Databases in Seven Weeks",
# published by The Pragmatic Bookshelf.
# Copyrights apply to this code. It may not be used to create training material,
# courses, books, articles, and the like. Contact us if you are in doubt.
# We make no guarantees that this code is fit for any purpose.
# Visit http://www.pragmaticprogrammer.com/titles/rwdata for more book information.
#---
include java
import 'org.apache.hadoop.hbase.client.HTable'
import 'org.apache.hadoop.hbase.client.Put'
import 'org.apache.hadoop.hbase.client.Scan'
import 'org.apache.hadoop.hbase.util.Bytes'
import 'org.apache.hadoop.hbase.HBaseConfiguration'

# Converts every argument to its string form and then to a Java byte array.
# Returns the byte arrays in argument order, ready to be splatted into HBase
# client calls.
def jbytes( *args )
  args.collect do |value|
    value.to_s.to_java_bytes
  end
end

# Scans every row of the 'wiki' table, extracts [[target|label]] links from the
# article text and records each link twice in the 'links' table: as a
# "to:<target>" column on the linking page's row and as a "from:<title>" column
# on the target's row. The link label is stored as the cell value.
puts( @hbase )
conf = HBaseConfiguration.new
wiki_table = HTable.new( conf, "wiki" )
links_table = HTable.new( conf, 'links' )
# Auto-flush is switched off; writes are pushed by the explicit flushCommits calls.
links_table.setAutoFlush( false )

scanner = wiki_table.getScanner( Scan.new )

# Group 1 is the link target, group 2 the optional label after '|'.
# A target may not start with ':' or '#' and may not contain ':' at all,
# so links such as [[File:...]] or [[#section]] are not matched.
linkpattern = /\[\[([^\[\]\|\:\#][^\[\]\|:]*)(?:\|([^\[\]\|]+))?\]\]/
count = 0

begin
  while (result = scanner.next)
    title = Bytes.toString( result.getRow )
    text = Bytes.toString( result.getValue( *jbytes( 'text', '' ) ) )

    # Rows without a text: cell produce no links but are still counted below.
    if text
      put_to = nil
      text.scan( linkpattern ) do |target, label|
        # The outgoing-links Put is created lazily, once the first link is found.
        unless put_to
          put_to = Put.new( *jbytes( title ) )
          put_to.setWriteToWAL( false )
        end

        # NOTE(review): capitalize also downcases everything after the first
        # character (e.g. "New York" becomes "New york") - confirm this still
        # matches the row keys used in 'wiki'.
        target = target.strip.capitalize
        label = label ? label.strip : ''

        put_to.add( *jbytes( "to", target, label ) )

        # Mirror entry on the target's row so incoming links can be looked up.
        put_from = Put.new( *jbytes( target ) )
        put_from.add( *jbytes( "from", title, label ) )
        put_from.setWriteToWAL( false )
        links_table.put( put_from )
      end
      links_table.put( put_to ) if put_to
      links_table.flushCommits
    end

    count += 1
    puts "#{count} pages processed (#{title})" if count % 500 == 0
  end
  links_table.flushCommits
ensure
  # Release the scanner even when the loop above raises.
  scanner.close
end
exit

18 changes: 18 additions & 0 deletions hbase/day_2/home-work-do
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
create 'foods' , 'facts'

curl file:///home/eyalgo/seven-dbs-in-seven-weeks/hbase/day_2/food-display-example.xml | cat | /opt/hbase/hbase-0.94.18/bin/hbase shell /home/eyalgo/seven-dbs-in-seven-weeks/hbase/day_2/import_food_display.rb


curl file:///home/eyalgo/seven-dbs-in-seven-weeks/hbase/day_2/MyFoodapediaData/Food_Display_Table.xml | cat | /opt/hbase/hbase-0.94.18/bin/hbase shell /home/eyalgo/seven-dbs-in-seven-weeks/hbase/day_2/import_food_display.rb



% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0#<Hbase::Hbase:0x1ee96f4>
31 1795k 31 559k 0 0 37500 0 0:00:49 0:00:15 0:00:34 145k500 records inserted (Twix cookie bars)
52 1795k 52 943k 0 0 56157 0 0:00:32 0:00:17 0:00:15 168k1000 records inserted (Honeydew melon (raw))
66 1795k 66 1199k 0 0 68018 0 0:00:27 0:00:18 0:00:09 195k1500 records inserted (Beef sirloin frozen meal)
100 1795k 100 1795k 0 0 92349 0 0:00:19 0:00:19 --:--:-- 267k
2000 records inserted (Fruity Pebbles cereal)

5 changes: 5 additions & 0 deletions hbase/day_2/home-work-find
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# bloom filter
http://billmill.org/bloomfilter-tutorial/
http://en.wikipedia.org/wiki/Bloom_filter


93 changes: 93 additions & 0 deletions hbase/day_2/import_food_display.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
include Java
import 'org.apache.hadoop.hbase.client.HTable'
import 'org.apache.hadoop.hbase.client.Put'
import 'org.apache.hadoop.hbase.HBaseConfiguration'
import 'javax.xml.stream.XMLStreamConstants'

# Turns each argument into a Java byte array via its string representation.
# The result keeps the argument order so it can be splatted into Put#add.
def jbytes( *args )
  args.map(&:to_s).map(&:to_java_bytes)
end

# Streams Food_Display_Row XML from standard input and writes each row to the
# 'foods' table: the row key is the Display_Name text and every element listed
# in `fields` becomes a column in the 'facts' family.
factory = javax.xml.stream.XMLInputFactory.newInstance
reader = factory.createXMLStreamReader(java.lang.System.in)

# Child elements of <Food_Display_Row> that are stored, in the order they are
# added to the Put. Names are matched exactly; any other element (for example
# Food_Code) is ignored.
fields = %w[
  Display_Name Portion_Default Portion_Amount Portion_Display_Name Factor
  Increment Multiplier Grains Whole_Grains Vegetables Orange_Vegetables
  Drkgreen_Vegetables Starchy_vegetables Other_Vegetables Fruits Milk Meats
  Drybeans_Peas Soy Oils Solid_Fats Added_Sugars Alcohol Calories
  Saturated_Fats
]

document = nil   # field name => text for the row currently being read
buffer = nil     # text chunks of the tracked element being read, nil otherwise
count = 0

puts( @hbase )
conf = HBaseConfiguration.new
table = HTable.new( conf, "foods" )
# Auto-flush is switched off; writes are pushed by the flushCommits calls below.
table.setAutoFlush( false )

while reader.has_next
  case reader.next
  when XMLStreamConstants::START_ELEMENT
    name = reader.local_name
    if name == 'Food_Display_Row'
      document = {}
    elsif fields.include?( name )
      buffer = []
    end

  when XMLStreamConstants::CHARACTERS
    # Text may arrive in several chunks; collect only inside a tracked element.
    buffer << reader.text unless buffer.nil?

  when XMLStreamConstants::END_ELEMENT
    name = reader.local_name
    if fields.include?( name )
      document[name] = buffer.join
      # Stop collecting so text between elements is not appended to a stale buffer.
      buffer = nil
    elsif name == 'Food_Display_Row'
      put = Put.new( document['Display_Name'].to_java_bytes )
      fields.each do |field|
        put.add( *jbytes( "facts", field, document[field] ) )
      end
      table.put( put )

      count += 1
      table.flushCommits if count % 10 == 0
      puts "#{count} records inserted (#{document['Display_Name']})" if count % 500 == 0
    end
  end
end

table.flushCommits
exit
77 changes: 77 additions & 0 deletions hbase/day_2/import_from_wikipedia.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#---
# Excerpted from "Seven Databases in Seven Weeks",
# published by The Pragmatic Bookshelf.
# Copyrights apply to this code. It may not be used to create training material,
# courses, books, articles, and the like. Contact us if you are in doubt.
# We make no guarantees that this code is fit for any purpose.
# Visit http://www.pragmaticprogrammer.com/titles/rwdata for more book information.
#---

require 'time'

include Java
import 'org.apache.hadoop.hbase.client.HTable'
import 'org.apache.hadoop.hbase.client.Put'
import 'org.apache.hadoop.hbase.HBaseConfiguration'
import 'javax.xml.stream.XMLStreamConstants'

# Builds a list of Java byte arrays, one per argument, from each argument's
# string representation. Order follows the argument order.
def jbytes( *args )
  encoded = []
  args.each { |item| encoded << item.to_s.to_java_bytes }
  encoded
end

# Streams a MediaWiki XML dump from standard input and writes one Put per
# <revision> into the 'wiki' table: the row key is the page title, the article
# body goes to "text:", author and comment go to the "revision" family.
xml_factory = javax.xml.stream.XMLInputFactory.newInstance
reader = xml_factory.createXMLStreamReader(java.lang.System.in)

document = nil   # field name => text for the page currently being read
buffer = nil     # text chunks of the most recently opened tracked element
count = 0

# Elements whose text content is captured into `document`.
tracked = /title|timestamp|username|comment|text/

# table = HTable.new( @hbase.configuration, 'wiki' )
puts( @hbase )
table = HTable.new( HBaseConfiguration.new, "wiki" )
# Auto-flush is switched off; writes are pushed by the flushCommits calls below.
table.setAutoFlush( false )

while reader.has_next
  case reader.next
  when XMLStreamConstants::START_ELEMENT
    name = reader.local_name
    if name == 'page'
      document = {}
    elsif name =~ tracked
      buffer = []
    end

  when XMLStreamConstants::CHARACTERS
    # Text may arrive in several chunks; they are joined at the end tag.
    buffer << reader.text unless buffer.nil?

  when XMLStreamConstants::END_ELEMENT
    name = reader.local_name
    if name =~ tracked
      document[name] = buffer.join
    elsif name == 'revision'
      row_key = document['title'].to_java_bytes
      # NOTE(review): to_i yields whole seconds since the epoch - confirm this
      # is the unit intended for the Put timestamp.
      ts = Time.parse( document['timestamp'] ).to_i

      put = Put.new( row_key, ts )
      put.add( *jbytes( "text", "", document['text'] ) )
      put.add( *jbytes( "revision", "author", document['username'] ) )
      put.add( *jbytes( "revision", "comment", document['comment'] ) )
      table.put( put )

      count += 1
      table.flushCommits if count % 10 == 0
      puts "#{count} records inserted (#{document['title']})" if count % 500 == 0
    end
  end
end

table.flushCommits
exit



7 changes: 7 additions & 0 deletions hbase/day_2/material
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 | bzcat | /opt/hbase/hbase-0.94.18/bin/hbase shell import_from_wikipedia.rb

create 'links', {NAME => 'to', VERSIONS => 1, BLOOMFILTER => 'ROWCOL'},{NAME => 'from', VERSIONS => 1, BLOOMFILTER => 'ROWCOL'}


count 'wiki', INTERVAL => 100000, CACHE => 10000

0 comments on commit 2d527f0

Please sign in to comment.