Skip to content
This repository has been archived by the owner on Sep 9, 2020. It is now read-only.

Commit

Permalink
test coverage for encoding and format detection
Browse files Browse the repository at this point in the history
  • Loading branch information
mckaymatt committed Jun 9, 2016
1 parent 1480c0d commit aaf93aa
Show file tree
Hide file tree
Showing 5 changed files with 94 additions and 9 deletions.
5 changes: 2 additions & 3 deletions datarobot_batch_scoring/batch_scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,6 @@ def _check_for_multiline_input(self, peek_size=100):
' -- dont use flag `--fast` '
'to force CSV parsing. '
'Note that this will slow down scoring.')
sys.exit(0)


class SlowReader(CSVReader):
Expand Down Expand Up @@ -252,7 +251,7 @@ def investigate_encoding_and_dialect(self):
Providing a delimiter may help with smaller datasets.
Running this is costly so run it once per dataset."""
if self.encoding and self.dialect:
return self.encoding, self.dialect
return (self.encoding, self.dialect)
if self.dataset.endswith('.gz'):
opener = gzip.open
else:
Expand Down Expand Up @@ -802,7 +801,7 @@ def run_batch_predictions(base_url, base_headers, user, pwd,

base_headers['content-type'] = 'text/csv; charset=utf8'
endpoint = base_url + '/'.join((pid, lid, 'predict'))
encoding, dialect = BatchGenerator(
(encoding, dialect) = BatchGenerator(
dataset, 1, 1, delimiter, ui, fast_mode
).investigate_encoding_and_dialect()
# Make a sync request to check authentication and fail early
Expand Down
8 changes: 3 additions & 5 deletions datarobot_batch_scoring/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,7 @@ def _configure_logging(self, level, stdout):

# root logger
fs = '%(asctime)-15s [%(levelname)s] %(message)s'
if stdout:
hdlr = logging.StreamHandler(sys.stdout)
else:
hdlr = logging.FileHandler(self.root_logger_filename, 'w+')
hdlr = logging.FileHandler(self.root_logger_filename, 'w+')
dfs = None
fmt = logging.Formatter(fs, dfs)
hdlr.setFormatter(fmt)
Expand Down Expand Up @@ -123,7 +120,8 @@ def fatal(self, msg):
logger.error(msg)
exc_info = sys.exc_info()
root_logger.error(msg, exc_info=exc_info)
sys.exit(1)
self.close()
os._exit(1)

def getpass(self):
if self._prompt is not None:
Expand Down
2 changes: 2 additions & 0 deletions tests/fixtures/unparsable.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
,1 ,2 ,3 ,4
,a,,b,c,d
52 changes: 52 additions & 0 deletions tests/fixtures/windows_encoded.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
polarity,text
0,arhhghgh
4,"@karenrobinovitz yeah that wasn't just for one market - that was for a syndicated network called Daytime TV which airs in 100 markets "
0,"for some reason I can't upload a photo "
0,"Google going down means DoubleClick goes down too means I can't get any work done. "
4,@evilrobert you're in the group
4,@stevenmitchellw awe! he'll be so happy! yes he is a sweetie. I'm the girl with the fuzzy bear ears!
4,@mileycyrus; well done Hope you win
0,Wondering what movies to rent tonight have to work first thing in the morning
4,http://twitpic.com/6f71g - Please comment!
0,"@MathieuTO ditto. My bed is so comfy but my room is cold and loud and not downtown "
0,@RodneyQuarcoo not yet my internet has been a bit spotty these past few days..i'm stuck downloading at 2kBs....
0,@skakid25 Sadly for me I've been preparing for killer robots/zombies instead of nukes. Picked the wrong doomsday scenario apparently
4,you guys rock i love ur songs (especially guilty pleasure) @vickytcobra @GabrielSaporta
4,@StoryofMe it will be great fun - *that* is what life is all about.
0,It would be that Dover race that they played on Classic.
4,Okay - really have to get some housework done - LOL Have a wonderful Sunday afternoon all. I wish you all SAFE passage into the week
0,@iiamsliim yeah I'd do that than some shit like the "Ricky Bobby"
4,"@gs_stark Е�?ли вы — один из ты�?�?чи первых зареги�?трированных на Ruphotography то получите раннее приглашение на 500px "
4,""The business" on NPR. Best entertainment podcast and free to download on itunes "
0,@crystalyssaling i was like oh shiz. mr boettcher told us it might be on the test & he gave us info shts about it
0,@mgmyself hey what's up!! nobody wants to talk to me tonight.
0,"frustrated with the other mini-games in plants vs zombies. im not able to finish zombies eat me. waaaaaah "
0,miguel knows about cars more than I do.
4,is in the car on the way back to tenn. ill be back soon tho
4,@meghunt and you are not imaginary. I told will that you really existed in the real.
0,@mitchelmusso i guess you get loads of emails & people asking you stuff al the time but i seriously wonder if you ever read my comments x
4,"i've been up since like 5 just listening to my ipod and for once im not tired. i can go for an iced coffee from tim's y/y? "
4,@RAWedAwake Where are you moving in MO? I'm an hour north of St. Louis. Just wondering
4,@hello_jodie Are you okay Jodie? Love that acoustic version by the way!
4,it's morningggggg so wake the hell up
4,Big Brothers on now
0,goodnght tweeties who don't pay attention to anything i write! byeeeeeeeeee
4,"@Matz_Enig no! enjoy it while it lasts! happy bday btw hope the day turns out great and the party is even better"
4,"I think I have magical powers. Whenever I randomly download an app at the BOTTOM of the store it's in the top 50 within 48 hours! "
4,is gona lay down with his @Keao awww
4,is cooking eggs and bacon for supper...for my baby boy.
4,"They aren't monks! They are humans." hahaha
0,"It's been a long day at work and my hair is a mess. i don't think Cute Coworker Guy is interested. "
0,@majornelson You are so lucky Will they release them early for a public beta? Like with the NXE? That would be pretty cool.
0,*sigh* even heros need sleep I guess. Even if I don't wanna sleep
4,@DonnieWahlberg did you like my logo? Waffle house of love......
0,No plans 4 today and its raining!! I might read a book.. Ew!!
4,@thekush haha!
4,Just got back from visiting an old friend! I love keeping in touch with people
4,Good Morning Folks! We dodged the severe storms so all is good! They said NO severe storms today!! Hooray
0,I hate missing out- it's miracle Sunday in church and I'm at the hospital but I have my list I'm believing God for! I'm excited!
0,@Emily0309 yeah definitely not the right time if it were I would take him in a heartbeat. He's white but from his collar up is brown
0,@4thirty5 that is the saddest news ever. Zune salute to you.
4,@jdcb42 Swine Flu victims unite http://oinkflu.info
4,"Omgosh he is OH! So cute "
4,@JoeCostello We can officially confirm that the Monarch butterfly is on BOTH water towers in Papillion!
36 changes: 35 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import csv
import logging
import mock
import pytest
from datarobot_batch_scoring.utils import (verify_objectid, UI,
iter_chunks, acquire_api_token)
from datarobot_batch_scoring.batch_scoring import BatchGenerator


def test_invalid_objectid():
Expand Down Expand Up @@ -114,7 +116,7 @@ def test_fatal(self):
with mock.patch(
'datarobot_batch_scoring.utils.root_logger') as m_root:
with mock.patch(
'datarobot_batch_scoring.utils.sys.exit') as m_exit:
'datarobot_batch_scoring.utils.os._exit') as m_exit:
ui.fatal('text')
m_log.error.assert_called_with(msg)
m_root.error.assert_called_with(msg,
Expand Down Expand Up @@ -153,6 +155,38 @@ def test_iter_chunks():
next(it)


def test_investigate_encoding_and_dialect():
    """Check encoding and CSV dialect detection on the Windows fixture.

    The investigation is run twice on the same generator: the first
    result is discarded and the assertions are made against the second.
    NOTE(review): presumably the second call exercises the early return
    taken when encoding and dialect are already set -- confirm against
    ``BatchGenerator.investigate_encoding_and_dialect``.
    """
    fixture_path = 'tests/fixtures/windows_encoded.csv'
    file_ui = UI(None, logging.DEBUG, stdout=False)
    generator = BatchGenerator(
        dataset=fixture_path,
        n_samples=10,
        n_retry=3,
        delimiter=None,
        ui=file_ui,
        fast_mode=False,
    )

    # First pass: return value intentionally ignored.
    generator.investigate_encoding_and_dialect()

    # Second pass: this is the result the test inspects.
    detected = generator.investigate_encoding_and_dialect()
    encoding, dialect = detected

    assert encoding == 'iso-8859-2'
    assert dialect.lineterminator == '\r\n'
    assert dialect.quotechar == '"'
    assert dialect.delimiter == ','


def test_stdout_logging_and_csv_module_fail(capsys):
    """Check the error reported when the CSV dialect cannot be detected.

    Runs the encoding/dialect investigation on the ``unparsable.csv``
    fixture with a ``UI`` created with ``stdout=True``, expects a
    ``csv.Error`` to be raised, and compares the captured standard output
    against the expected support message.

    ``capsys`` is the pytest fixture used to read back what was written
    to stdout/stderr during the test.
    """
    ui = UI(None, logging.DEBUG, stdout=True)
    data = 'tests/fixtures/unparsable.csv'
    bg = BatchGenerator(dataset=data, n_samples=10, n_retry=3,
                        delimiter=None, ui=ui, fast_mode=False)
    # Expected first line of the output: the dialect-detection failure hint.
    exc = str("""[ERROR] The csv module failed to detect the CSV dialect. """ +
              """Try giving hints with the --delimiter argument, E.g """ +
              """--delimiter=','""")
    # Full expected output: the hint followed by the support notice that
    # names the root logger file of this UI instance.
    msg = ('{}\nIf you need assistance please send the log \n'
           'file {} to support@datarobot.com .').format(
               exc, ui.root_logger_filename)
    # os._exit is replaced with a mock so the call can be asserted on
    # instead of terminating the test process.
    with mock.patch('datarobot_batch_scoring.utils.os._exit') as m_exit:
        with pytest.raises(csv.Error):
            bg.investigate_encoding_and_dialect()
        # NOTE(review): nesting reconstructed -- this assertion is placed
        # after the ``pytest.raises`` block so that it is reachable once the
        # expected exception has been raised; confirm against the repository.
        m_exit.assert_called_with(1)
    # Only stdout is compared; stderr is captured but not inspected.
    out, err = capsys.readouterr()
    assert msg == out.strip('\n')


def test_acquire_api_token(live_server):
ui = mock.Mock()
base_url = '{webhost}/api/v1/'.format(webhost=live_server.url())
Expand Down

0 comments on commit aaf93aa

Please sign in to comment.