In [1]:
import numpy as np
import pandas as pd
import datetime
import re
import sys

In [2]:
raw = pd.read_csv('sample.csv', encoding='UTF-8',
                #dtype={'FooDuration', pd.Timedelta}
                  parse_dates=['Timestamp']
                 )

In [3]:
raw

Unnamed: 0,Timestamp,Address,ZIP,FullName,FooDuration,BarDuration,TotalDuration,Notes
0,2011-04-01 11:00:00,"123 4th St, Anywhere, AA",94121,Monkey Alberto,1:23:32.123,1:32:33.123,zzsasdfa,I am the very model of a modern major general
1,2014-03-12 00:00:00,"Somewhere Else, In Another Time, BB",1,Superman übertan,111:23:32.123,1:32:33.123,zzsasdfa,This is some Unicode right here. ü ¡! 😀
2,2016-02-29 12:11:11,111 Ste. #123123123,1101,Résumé Ron,31:23:32.123,1:32:33.123,zzsasdfa,🏳️🏴🏳️🏴
3,2011-01-01 00:00:01,"This Is Not An Address, BusyTown, BT",94121,Mary 1,1:23:32.123,0:00:00.000,zzsasdfa,I like Emoji! 🍏🍎😍
4,2016-12-31 23:59:59,"123 Gangnam Style Lives Here, Gangnam Town",31403,Anticipation of Unicode Failure,1:23:32.123,1:32:33.123,zzsasdfa,I like Math Symbols! ≱≰⨌⊚
5,2011-11-11 11:11:11,überTown,10001,Prompt Negotiator,1:23:32.123,1:32:33.123,zzsasdfa,"I’m just gonna say, this is AMAZING. WHAT NEGO..."
6,2010-05-12 16:48:12,Høøük¡,1231,Sleeper Service,1:23:32.123,1:32:33.123,zzsasdfa,2/1/22
7,2012-10-05 22:31:11,"Test Pattern Town, Test Pattern, TP",121,株式会社スタジオジブリ,1:23:32.123,1:32:33.123,zzsasdfa,1:11:11.123
8,2004-10-02 08:44:11,The Moon,11,HERE WE GO,1:23:32.123,1:32:33.123,zzsasdfa,


In [4]:
# convert timestamp from US/Pacific to US/Eastern ==> add 3 hours
raw.Timestamp = raw.Timestamp.apply(lambda ts: ts.tz_localize('US/Pacific').tz_convert('US/Eastern'))

In [5]:
def convert_interval(intv_str):
    """Assume format is <HHH:MIN:SEC.MSEC>. 
    It's easier to use a regex to parse this which allows a colon or period to separate fields.
    If data was not well formed this could cause errors.
    This converts to a decimalized <seconds.microseconds>
    """
    # check to see if we've already converted it e.g. we're re-running this
    if isinstance(intv_str, float):
        return intv_str
    _i = re.compile('[:.]').split(intv_str)
    _i = list(map(lambda x: int(x), _i))
    _dt = datetime.timedelta(hours=_i[0], minutes=_i[1], seconds=_i[2], milliseconds=_i[3])
    return _dt.seconds + _dt.microseconds/1e6

In [6]:
# Convert string interval into a seconds.microseconds object
raw.FooDuration = raw.FooDuration.apply(convert_interval)
raw.BarDuration = raw.BarDuration.apply(convert_interval)

In [7]:
# calculate new value for TotalDuration as it is filled with garbage
raw.TotalDuration = raw.BarDuration + raw.FooDuration

In [8]:
# wrap x in an int so we can re-run this without errors (idempotent)
raw.ZIP = raw.ZIP.apply(lambda x: f"{int(x):05d}")

In [9]:
# convert names to uppercase
raw['FullName'] = raw['FullName'].apply(str.upper)

In [10]:
raw

Unnamed: 0,Timestamp,Address,ZIP,FullName,FooDuration,BarDuration,TotalDuration,Notes
0,2011-04-01 14:00:00-04:00,"123 4th St, Anywhere, AA",94121,MONKEY ALBERTO,5012.123,5553.123,10565.246,I am the very model of a modern major general
1,2014-03-12 03:00:00-04:00,"Somewhere Else, In Another Time, BB",1,SUPERMAN ÜBERTAN,55412.123,5553.123,60965.246,This is some Unicode right here. ü ¡! 😀
2,2016-02-29 15:11:11-05:00,111 Ste. #123123123,1101,RÉSUMÉ RON,26612.123,5553.123,32165.246,🏳️🏴🏳️🏴
3,2011-01-01 03:00:01-05:00,"This Is Not An Address, BusyTown, BT",94121,MARY 1,5012.123,0.0,5012.123,I like Emoji! 🍏🍎😍
4,2017-01-01 02:59:59-05:00,"123 Gangnam Style Lives Here, Gangnam Town",31403,ANTICIPATION OF UNICODE FAILURE,5012.123,5553.123,10565.246,I like Math Symbols! ≱≰⨌⊚
5,2011-11-11 14:11:11-05:00,überTown,10001,PROMPT NEGOTIATOR,5012.123,5553.123,10565.246,"I’m just gonna say, this is AMAZING. WHAT NEGO..."
6,2010-05-12 19:48:12-04:00,Høøük¡,1231,SLEEPER SERVICE,5012.123,5553.123,10565.246,2/1/22
7,2012-10-06 01:31:11-04:00,"Test Pattern Town, Test Pattern, TP",121,株式会社スタジオジブリ,5012.123,5553.123,10565.246,1:11:11.123
8,2004-10-02 11:44:11-04:00,The Moon,11,HERE WE GO,5012.123,5553.123,10565.246,


In [11]:
raw.dtypes

Timestamp        datetime64[ns, US/Eastern]
Address                              object
ZIP                                  object
FullName                             object
FooDuration                         float64
BarDuration                         float64
TotalDuration                       float64
Notes                                object
dtype: object

## write file to stdout

In [12]:
raw.to_csv(sys.stdout, index=False, date_format='%FT%H:%M:%S%z')

Timestamp,Address,ZIP,FullName,FooDuration,BarDuration,TotalDuration,Notes
2011-04-01T14:00:00-0400,"123 4th St, Anywhere, AA",94121,MONKEY ALBERTO,5012.123,5553.123,10565.246,I am the very model of a modern major general
2014-03-12T03:00:00-0400,"Somewhere Else, In Another Time, BB",00001,SUPERMAN ÜBERTAN,55412.123,5553.123,60965.246,This is some Unicode right here. ü ¡! 😀
2016-02-29T15:11:11-0500,111 Ste. #123123123,01101,RÉSUMÉ RON,26612.123,5553.123,32165.246,🏳️🏴🏳️🏴
2011-01-01T03:00:01-0500,"This Is Not An Address, BusyTown, BT",94121,MARY 1,5012.123,0.0,5012.123,I like Emoji! 🍏🍎😍
2017-01-01T02:59:59-0500,"123 Gangnam Style Lives Here, Gangnam Town",31403,ANTICIPATION OF UNICODE FAILURE,5012.123,5553.123,10565.246,I like Math Symbols! ≱≰⨌⊚
2011-11-11T14:11:11-0500,überTown,10001,PROMPT NEGOTIATOR,5012.123,5553.123,10565.246,"I’m just gonna say, this is AMAZING. WHAT NEGOTIATIONS."
2010-05-12T19:48:12-0400,Høøük¡,01231,SLEEPER SERVICE,5012.123,5553.123,10565.246,2/1/22
2012-10-06T01

## Now read in "bad" file

In [13]:
# an error is thrown using read_csv on raw file, so we open and allow library to convert it
#"UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 28: invalid start byte"
input_fd = open('sample-with-broken-utf8.csv', encoding='UTF-8', errors='replace')
raw_bad = pd.read_csv(input_fd)

In [14]:
raw_bad

Unnamed: 0,Timestamp,Address,ZIP,FullName,FooDuration,BarDuration,TotalDuration,Notes
0,4/1/11 11:00:00 AM,"123 4th St, Anywhere, AA",94121,Monkey Alberto,1:23:32.123,1:32:33.123,zzsasdfa,I am the very model of a modern major general
1,3/12/14 12:00:00 AM,"Somewhere Else, In Another Time, BB",1,Superman übertan,111:23:32.123,1:32:33.123,zzsasdfa,This is some Unicode right h�xxx ü ¡! 😀
2,2/29/16 12:11:11 PM,111 Ste. #123123123,1101,Résumé Ron,31:23:32.123,1:32:33.123,zzsasdfa,🏳️🏴🏳️🏴
3,1/1/11 12:00:01 AM,"This Is Not An Address, BusyTown, BT",94121,Mary 1,1:23:32.123,0:00:00.000,zzsasdfa,I like Emoji! 🍏🍎😍
4,12/31/16 11:59:59 PM,"123 Gangnam Style Lives Here, Gangnam Town",31403,Anticipation of Unicode Failure,1:23:32.123,1:32:33.123,zzsasdfa,I like Math Symbols! ≱≰⨌⊚
5,11/11/11 11:11:11 AM,überTown,10001,Prompt Negotiator,1:23:32.123,1:32:33.123,zzsasdfa,"I’m just gonna say, this is AMAZING. WHAT NEGO..."
6,5/12/10 4:48:12 PM,Høøük¡,1231,Sleeper Service,1:23:32.123,1:32:33.123,zzsasdfa,2/1/22
7,10/5/12 10:31:11 PM,"Test Pattern Town, Test Pattern, TP",121,株式会社スタジオジブリ,1:23:32.123,1:32:33.123,zzsasdfa,1:11:11.123
8,10/2/04 8:44:11 AM,The Moon,11,HERE WE GO,1:23:32.123,1:32:33.123,zzsasdfa,
