Permalink
Browse files

Merge pull request #3 from samirahmed/master

restructured Zippopotamus Crowdsourcing.  Filtered out large binaries from repository
  • Loading branch information...
2 parents 62f8c1c + 3fae5ba commit bad873377c0448c4f7bcc1f6bbf360063260f9fe @samirahmed samirahmed committed May 2, 2012
Showing with 261 additions and 289 deletions.
  1. +70 −0 appendix/country.txt
  2. +1 −0 appendix/geonames-headers.txt
  3. +1 −0 appendix/headers.txt
  4. +0 −70 raw/country.txt
  5. +0 −1 raw/headers.txt
  6. +4 −0 raw/sample.txt
  7. +59 −0 scripts/add.py
  8. +26 −0 scripts/export.bash
  9. +0 −114 scripts/file.py
  10. +0 −66 scripts/france.py
  11. +66 −0 scripts/remove.py
  12. +34 −38 scripts/table.py
View
@@ -0,0 +1,70 @@
+AD,Andorra
+AR,Argentina
+AS,American Samoa
+AT,Austria
+AU,Australia
+BD,Bangladesh
+BE,Belgium
+BG,Bulgaria
+BR,Brazil
+CA,Canada
+CH,Switzerland
+CZ,Czech Republic
+DE,Germany
+DK,Denmark
+DO,Dominican Republic
+ES,Spain
+FI,Finland
+FO,Faroe Islands
+FR,France
+GB,Great Britain
+GF,French Guyana
+GG,Guernsey
+GL,Greenland
+GP,Guadeloupe
+GT,Guatemala
+GU,Guam
+GY,Guyana
+HR,Croatia
+HU,Hungary
+IM,Isle of Man
+IN,India
+IS,Iceland
+IT,Italy
+JE,Jersey
+JP,Japan
+LI,Liechtenstein
+LK,Sri Lanka
+LT,Lithuania
+LU,Luxembourg
+MC,Monaco
+MD,Moldavia
+MH,Marshall Islands
+MK,Macedonia
+MP,Northern Mariana Islands
+MQ,Martinique
+MX,Mexico
+MY,Malaysia
+NL,Holland
+NO,Norway
+NZ,New Zealand
+PH,Philippines
+PK,Pakistan
+PL,Poland
+PM,Saint Pierre and Miquelon
+PR,Puerto Rico
+PT,Portugal
+RE,French Reunion
+RU,Russia
+SE,Sweden
+SI,Slovenia
+SJ,Svalbard & Jan Mayen Islands
+SK,Slovak Republic
+SM,San Marino
+TH,Thailand
+TR,Turkey
+US,United States
+VA,Vatican
+VI,Virgin Islands
+YT,Mayotte
+ZA,South Africa
@@ -0,0 +1 @@
+country abbreviation,post code,place name,state,state abbreviation,ignore1,ignore2,ignore3,ignore4,latitude,longitude,ignore5
View
@@ -0,0 +1 @@
+country,country abbreviation,post code,place name,state,state abbreviation,latitude,longitude
View
@@ -1,70 +0,0 @@
-AD Andorra
-AR Argentina
-AS American Samoa
-AT Austria
-AU Australia
-BD Bangladesh
-BE Belgium
-BG Bulgaria
-BR Brazil
-CA Canada
-CH Switzerland
-CZ Czech Republic
-DE Germany
-DK Denmark
-DO Dominican Republic
-ES Spain
-FI Finland
-FO Faroe Islands
-FR France
-GB Great Britain
-GF French Guyana
-GG Guernsey
-GL Greenland
-GP Guadeloupe
-GT Guatemala
-GU Guam
-GY Guyana
-HR Croatia
-HU Hungary
-IM Isle of Man
-IN India
-IS Iceland
-IT Italy
-JE Jersey
-JP Japan
-LI Liechtenstein
-LK Sri Lanka
-LT Lithuania
-LU Luxembourg
-MC Monaco
-MD Moldavia
-MH Marshall Islands
-MK Macedonia
-MP Northern Mariana Islands
-MQ Martinique
-MX Mexico
-MY Malaysia
-NL Holland
-NO Norway
-NZ New Zealand
-PH Philippines
-PK Pakistan
-PL Poland
-PM Saint Pierre and Miquelon
-PR Puerto Rico
-PT Portugal
-RE French Reunion
-RU Russia
-SE Sweden
-SI Slovenia
-SJ Svalbard & Jan Mayen Islands
-SK Slovak Republic
-SM San Marino
-TH Thailand
-TR Turkey
-US United States
-VA Vatican
-VI Virgin Islands
-YT Mayotte
-ZA South Africa
View
@@ -1 +0,0 @@
-country,post code,place name,state,state abbreviation,ignore1,ignore2,ignore3,ignore4,latitude,longitude
View
@@ -0,0 +1,4 @@
+country,country abbreviation,post code,place name,state,state abbreviation,latitude,longitude
+"Atlantis","AL","ATL01","Atlantis City","Atlantic State","AS","44.5","2.5667"
+"Atlantis","AL","ATL02","Metropolis","District of Comics","DC","29.4667","4.5"
+"Atlantis","AL","ATL04","Gotham City","District of Comics","DC","18.5","14.5"
View
@@ -0,0 +1,59 @@
+import sys
+import os
+import csv
+
+from pymongo import Connection
+from pymongo.database import Database
+
+appendix_file = '../appendix/country.txt'
+header_file = '../appendix/headers.txt'
+
+
+def add(csv_file):
+
+ # Generate N Records
+ with open( csv_file , 'rb') as f:
+ reader = csv.reader( f , delimiter=',', quotechar='"')
+ headers = reader.next()
+ for row in reader:
+ record = dict()
+
+ for ii in range(0, len(headers)):
+ if 'ignore' not in headers[ii]:
+ record[ headers[ii] ] = unicode( row[ii], 'utf-8' )
+ save(record)
+
+ pass
+
+def save( record ):
+ db['global'].save(record)
+
+if __name__ == "__main__" :
+
+ if ( len(sys.argv) != 5 ):
+ print "Usage: %s <username> <password> <url> <csv-files>" % sys.argv[0]
+
+ username = sys.argv[1]
+ password = sys.argv[2]
+ url = sys.argv[3]
+ file_list = sys.argv[4:]
+
+ print username
+ print password
+ print len(file_list)
+
+ if ( len(url) == 0 ):
+ connection = Connection() # Connect to localhost
+ else:
+ connection = Connection( url ) # Connect to remote db
+
+ db = Database(connection,'zip') # Get zip database
+ db.authenticate(username,password) # Authenticate
+
+
+ for csv_file in file_list: # Add all the files
+ add( csv_file )
+
+
+
+
View
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# Check that we have at least n arguments
+
+if [ $# -ne 4 ]
+then
+ echo "Please enter <append> <raw-dump-dir> <user> <pass>"
+ exit -1
+fi
+
+
+# Get directory and name
+appendix=$1
+raw=$2
+user=$3
+pass=$4
+
+echo "Dumping $appendix into $raw.txt"
+
+while read line
+do
+ echo -e "$line \n"
+ name=${line:0:2}
+ mongoexport --db zip --collection global -u $user -p $pass -q "{'country abbreviation':'$name'}" -f 'country','country abbreviation','post code','place name','state','state abbreviation','latitude','longitude' --csv -o $raw$name.txt
+done < $appendix
+
View
@@ -1,114 +0,0 @@
-from __future__ import with_statement
-import codecs
-import csv
-import sys
-import json
-import os
-
-from contextlib import closing
-from zipfile import ZipFile, ZIP_DEFLATED
-import os
-
-
-'''
-ZIP Directory Helper Function
-'''
-def zipdir(basedir, archivename):
- assert os.path.isdir(basedir)
- with closing(ZipFile(archivename, "w", ZIP_DEFLATED)) as z:
-
- # traverse directory recursively
- for root, dirs, files in os.walk(basedir):
- #ignores empty directories
- for fn in files:
- absfn = os.path.join(root, fn)
- zfn = absfn[len(basedir)+len(os.sep):] #XXX: relative path
- z.write(absfn, zfn)
-
-'''
-Picks out all the directories and zips them
-'''
-def make_zip( countries ):
-
- print "Zipping folders"
-
- # for all the country codes
- for cc in countries :
- # make a name for the file
- zipname = cc+".zip"
- directory = os.path.join(os.getcwd(),cc)
-
- # ZIP all the folders into one
- zipdir(directory,zipname)
-
- # Print 10 to a line
- count+=1
- sys.stdout.write(cc+" ")
- if not count%10 :
- print ""
-
- pass
-
-
-'''
-Made specifically for GEONAMES.ORG postal code data parsing
-'''
-def main():
-
- if len(sys.argv) <3:
- print "Usage: "+sys.argv[0]+" <csv-file> <header-file>"
- sys.exit(-1)
-
- headerfile = sys.argv[2]
- csvfile = sys.argv[1]
-
- # get the headers, COMMA delimited
- hfile = csv.reader( open(headerfile, 'rb'), delimiter=',', quotechar='|' )
- headers = hfile.next()
-
- # Print list of valid header terms
- print filter( lambda hh: "ignore" not in hh , headers )
-
- # Read the TAB delimited file
- reader =csv.reader(open(csvfile, 'rb'), delimiter='\t', quotechar='|')
-
- # Keep track of country changes
- countries = set()
-
- print "Generating Files ... "
-
- for row in reader :
-
- # If not empty
- if row[0] != '':
- cc = row[0].lower()
- output_dir = os.path.join(os.getcwd(), cc)
- if not os.path.exists(output_dir): os.makedirs(output_dir);
-
- # Print if we have moved onto a new country country
- if cc not in countries:
- countries.add(cc)
- sys.stdout.write(cc+ " ")
- if not len(countries)%10 :
- print ""
-
- postcode = row[1]
- index = dict();
-
- # Populate information
- for ii in range(0,len(headers)):
- if 'ignore' not in headers[ii]:
- index[ headers[ii] ] = unicode(row[ii], 'utf-8');
-
- raw = json.dumps(index,ensure_ascii=False);
- fout = codecs.open(os.path.join(output_dir,postcode), encoding='utf-8', mode="w+" )
- fout.write( raw )
- fout.close()
-
- make_zip(countries)
-
-
-
-
-main()
-
Oops, something went wrong.

0 comments on commit bad8733

Please sign in to comment.