* Author: Gordon Erlebacher
* Date: 2022-03-03
* Purpose: convert Sent field to either date (nb seconds from a fixed time) or an empty field.

In [202]:
import pandas as pd
import numpy as np
import regex as rex
from datetime import datetime
from dateutil import parser
from dateutil.tz import gettz
from unidecode import unidecode
import pytz
import date_library as datelib

In [2]:
parser.parse("Sat Oct 11 17:13:46 UTC 2003")

datetime.datetime(2003, 10, 11, 17, 13, 46, tzinfo=tzutc())

In [3]:
filenm = "output_with_stats_columns.csv.gz"
df = pd.read_csv(filenm)

In [332]:

# Standard time :  UTC-5.0 
# Daylight time :  UTC-4.0

# tzinfo = {"EST": 0, # -5, 
#           "EDT": 3600, #-4,
#           "PDT": -7200,  #-7
#           "PST": -10800,  #-8,
#           "HST": -18000,  #-10,jjj
#           "MDT": -3600,  # -6
#           "CDT": 0,   #-5
#           "CST": -3600,   #-6
#          }

tzinfo = {
          "EDT": pytz.timezone('US/Eastern'), #-4,
          "CDT": pytz.timezone('US/Central'),
          "MDT": pytz.timezone('US/Mountain'),
          "PDT": pytz.timezone('US/Pacific'),  #-7
          "HDT": pytz.timezone('US/Hawaii'),  #-7
          "GMT": pytz.timezone('UTC'),
    
          # We will convert all times to UTC, then to seconds since 1970
          # Then we will add/subtrace one hour (3600 sec) if time is in 
          # Savings Time (as opposed to Daylight Time)
          "EST": pytz.timezone('US/Eastern'), #-4,
          "CST": pytz.timezone('US/Central'),
          "MST": pytz.timezone('US/Mountain'),
          "PST": pytz.timezone('US/Pacific'),  #-7
          "HST": pytz.timezone('US/Hawaii'),  #-7
    
          # "EST": 0, # -5, 
          # "PST": -10800,  #-8,
          # "HST": -18000,  #-10,jjj
          # "CST": -3600,   #-6
         }

# For example: when Eastern Daylight Time is used, the international meetings are held 
# between 10 am to 6 pm whereas for Eastern Standard Time the same meeting and conference 
# are held between 9 am to 5 pm.  So, 10 am EDT  <=====> 9 am EST

# for k,v in tzinfo.items():
#     tzinfo[k] -= 5 * 3600


# tzinfo = {"EST": 0, # -5, 
#           "EDT": -3600, #-4,
#           "PDT": 7200,  #-7
#           "PST": 10800,  #-8,
#           "HST": 18000,  #-10,
#           "MDT": 3600,  # -6
#           "CDT": 0,   #-5
#           "CST": 3600,   #-6
#          }

def normalize(name1, exceptions, date_dict, time_zone):
    #name1 = unidecode.unidecode(name1)
    try: 
        if type(name1) != str:
            date_dict.append((name1, '', ''))
            time_zone.append('EDT')
            return
        # print("1, name1: ", name1)
        if rex.match(r'.*GMT', name1):   # missing some matches. WHY?
            print("name1: ", name1)
            name = name1
            name = rex.sub(r'(GMT-05:00)', 'EST', name)
            print("    name: ", name)
            name = rex.sub(r'(\d?\/\d?\/)(\d\d )', r'\120\2', name)  # will fail in the year 21xx <<<< NOT WORKING
            print("    name: ", name)
        else:
            name = name1
        # print("2")
        if not rex.match(r'.*\d{4}', name):
            # date_dict[name1] = ''
            date_dict.append((name1, '', ''))
            time_zone.append('EDT')
            return
        name = rex.sub('(Febnlaiy|Febnlaly|Febiuaiy|Feb1ua1y|Februa ry)', 'February', name1)
        name = rex.sub('(Janualy|J anuary)', 'January', name)
        name = rex.sub(r"(\'iuesday)", 'Tuesday', name) 
        name = rex.sub('Septem ber', 'September', name)
        #name = rex.sub(r'(\d\d?\/\d\d?)\/(\d\d)',r'\1\/20\2', name)  # will fail in the year 21xx 



        name = rex.sub(r',(\d{4})', r', \1', name)
        # name = rex.sub(rii'(\d{4})(\d\d:)', r'\1 \2', name)
        name = rex.sub(r'(\d{4} \d) :', r'\1:', name) 
        name = rex.sub(r'(\d{4} \d+:) (\d)', r'\1\2', name)  # CHECK ORIGINAL TEXT. Why does this error occur?
        name = rex.sub(r'(\d+:\d+:) (\d+)', r'\1\2', name)  # 12:42: 22
        # Tuesday, August 26: 2014 3:07 PM
        name = rex.sub(r'(\w+ \d+):( \d{4})', r'\1, \2', name)
        name = rex.sub(r'\>', ':', name)
        name = rex.sub(r'\?', '', name)
    
        tz = 'EST'
        if rex.match('.*Eastern Daylight Time', name):
            name = rex.sub('Eastern Daylight Time', 'EDT', name)  # difference between Daylight and Standard
            tz = 'EDT'
        elif rex.match('.*Eastern Standard Time', name):
            name = rex.sub('Eastern Standard Time', 'EST', name)
            tz = 'EST'
        elif rex.match('.*PST', name):
            tz = 'PST'
        elif rex.match('.*PDT', name):   # Pacific
            tz = 'PDT'
        elif rex.match('.*HST', name):   # Hawaii
            tz = 'HST'
        elif rex.match('.*MDT', name):   # Mountain
            tz = 'MDT'
        elif rex.match('.*CDT', name):   # Central
            tz = 'CDT'
        elif rex.match('.*CST', name):   # Central
            tz = 'CST'
        elif rex.match('.*GMT', name):
            tz = 'GMT'
        else:
            tz = 'EDT'  # default if nothing else

        date_name = parser.parse(name, fuzzy=True, dayfirst=False, tzinfos=tzinfo)

        # t is now a PDT datetime; convert it to UTC
        date_name = date_name.astimezone(pytz.utc)
        # print("0-")
        # print("0: ", date_name)
        date_dict.append((name1, name, date_name))
        # print("1: ", name1, name)
        time_zone.append(tz)
        # print("2")
    except:
        print("except")
        try:
            # print("trytry: time_zone: ", tz)
            date_name = parser.parse(name, fuzzy=True, dayfirst=False, tzinfos=tzinfo)
            date_name = name.astimezone(pytz.utc)
            # date_dict[name1] = name
            date_dict.append((name1, name, date_name))
            time_zone.append(tz)
        except:
            # print("except: ", name)
            # Ignore any line with more than 50 characters
            # Ignore any line with the word "the"
            # Ignore any line with two dates
            if rex.match(r'\A.*the.*\Z', name): 
                date_dict.append((name1, '', ''))
                time_zone.append('')
                return
            # Ignore line if the year appears twice
            if rex.match('\A.*(\d{4}).*(\d{4})', name): 
                date_dict.append((name1, '', ''))
                time_zone.append('')
                return
            # Ignore line if there is no number
            if not rex.match('.*\d', name): 
                date_dict.append((name1, '', ''))
                time_zone.append('')
                return
            #print("-> ", name)
            if len(name) > 40:
                date_dict.append((name1, '', ''))
                time_zone.append('')
                return
            # if there is year in the string
            if not rex.match(r'.*\d{4}', name):
                date_dict.append((name1, '', ''))
                time_zone.append('')
                return
            #print("=> ", name)                
            exceptions.append(name)
            date_dict.append((name1, '', ''))
            time_zone.append('')
                
# Print the date in normalized form so I can spot check. 
# Then save them to a file. 
# On any date that is not valid, make it empty, tag the row and remove it from the output.csv file. 

In [333]:
tzinfo

{'EDT': <DstTzInfo 'US/Eastern' LMT-1 day, 19:04:00 STD>,
 'CDT': <DstTzInfo 'US/Central' LMT-1 day, 18:09:00 STD>,
 'MDT': <DstTzInfo 'US/Mountain' LMT-1 day, 17:00:00 STD>,
 'PDT': <DstTzInfo 'US/Pacific' LMT-1 day, 16:07:00 STD>,
 'HDT': <DstTzInfo 'US/Hawaii' LMT-1 day, 13:29:00 STD>,
 'GMT': <UTC>,
 'EST': <DstTzInfo 'US/Eastern' LMT-1 day, 19:04:00 STD>,
 'CST': <DstTzInfo 'US/Central' LMT-1 day, 18:09:00 STD>,
 'MST': <DstTzInfo 'US/Mountain' LMT-1 day, 17:00:00 STD>,
 'PST': <DstTzInfo 'US/Pacific' LMT-1 day, 16:07:00 STD>,
 'HST': <DstTzInfo 'US/Hawaii' LMT-1 day, 13:29:00 STD>}

In [334]:
sent_lst = df['Sent'].values
len(sent_lst)

39444

In [335]:
name = 'Janualy'
name = rex.sub('Janualy', 'January', name)
name

'January'

In [336]:
exceptions = []
date_dict = []
time_zone = []
for sent in sent_lst:
    normalize(sent, exceptions, date_dict, time_zone)
    
len(exceptions), len(sent_lst), df.shape

name1:  2/3/17 2:12 PM (GMT-05:00)
    name:  2/3/17 2:12 PM (EST)
    name:  P17 2:12 PM (EST)
except
name1:  2/3/17 1:29 PM (GMT-05:00)
    name:  2/3/17 1:29 PM (EST)
    name:  P17 1:29 PM (EST)
except
name1:  2/3/17 2:12 PM (GMT-05:00)
    name:  2/3/17 2:12 PM (EST)
    name:  P17 2:12 PM (EST)
name1:  02/15/2013 4:30 PM (GMT-05:00)
    name:  02/15/2013 4:30 PM (EST)
    name:  02/15/2013 4:30 PM (EST)
name1:  11/23/2015 9:57 AM (GMT-05:00)
    name:  11/23/2015 9:57 AM (EST)
    name:  11/23/2015 9:57 AM (EST)
name1:  12/05/2013 8:05 AM (GMT-05:00)
    name:  12/05/2013 8:05 AM (EST)
    name:  12/05/2013 8:05 AM (EST)
except
except
name1:  2/16/17 3:25 PM (GMT-05:00)
    name:  2/16/17 3:25 PM (EST)
    name:  2/16/17 3:25 PM (EST)
except
name1:  2017-02-12 13:59 GMT-05:00
    name:  2017-02-12 13:59 EST
    name:  2017-02-12 13:59 EST
except
except
name1:  1/20/17 10:28 AM (GMT-05:00)
    name:  1/20/17 10:28 AM (EST)
    name:  1/20/17 10:28 AM (EST)
name1:  2/3/17 2:12 PM (

(0, 39444, (39444, 30))

In [337]:
len(date_dict)

39444

* 'Friday, May 13, 201611:18 AM': datetime.datetime(2022, 5, 13, 18, 16, 11),
* 

In [200]:
dates_orig = []
dates_new = []
dates_date = []
date_adj = []
timestamp = []
for i, (tz_el, el) in enumerate(zip(time_zone, date_dict)):
    dates_orig.append(el[0])
    dates_new.append(el[1])
    dates_date.append(el[2])
    try:
        # subtract 5 hours to convert back to Tallahassee time
 
        if False:  # take daylight savings properly into account
            if tz_el[1] == 'S':  # adjust time if Savings
                timestp -= 3600
        else:  
            timestp = el[2].timestamp() - 5 * 3600 
            
        timestamp.append(timestp)
        # Transform time from timestamp back to UTC
        dtime = datelib.timestampToDateTimeUTC(timestp)
        date_adj.append(dtime)
    except:
        timestamp.append(-1)
        date_adj.append(('',''))
    
df1 = pd.DataFrame({'orig':dates_orig, 'new':dates_new, 'date':dates_date, 'TZ': time_zone, 'date_adj': date_adj, 'timestamp': timestamp})
print(len(date_dict), len(dates_orig), df.shape)

df1.to_csv("dates.csv", index=0)

39444 39444 (39444, 30)


In [120]:
df.shape, df1.shape

((39444, 30), (39444, 6))

In [78]:
# Add new columns: new date and time, and number of seconds since 1970
# timestamp: seconds since 1970
# dates_orig: original send column
# date_adj[0]: adjusted date
# date_adj[1]: adjusted time
adj_date = []
adj_time = []
for dat_tim in date_adj:
    adj_date.append(dat_tim[0])
    adj_time.append(dat_tim[1])
    
df['timestamp'] = timestamp
df['date_sent'] = adj_date
df['time_sent'] = adj_time
df.columns

df.to_csv("output_stats_timestamps.csv.gz", index=0)

In [79]:
timestamp;
df.columns

Index(['filenm', 'From', 'Sent', 'To', 'CC', 'Bcc', 'Subject', 'Attachments',
       'Importance', 'isThread', 'isAutoMessage', 'isDisplacement',
       'hasAllCapLine', 'hasBadDate', 'Body', 'nb_words', 'nb_chars',
       'body_len', 'body', 'Error_from', 'Error_sent', 'mn_nb_words',
       'std_nb_words', 'mn_nb_chars', 'std_nb_chars', 'count', 'email_count',
       'timestamp', 'date_sent', 'time_sent'],
      dtype='object')

In [31]:
df.head()

Unnamed: 0,filenm,From,Sent,To,CC,Bcc,Subject,Attachments,Importance,isThread,...,Error_sent,mn_nb_words,std_nb_words,mn_nb_chars,std_nb_chars,count,email_count,timestamp,date_sent,time_sent
0,29142_fn_10-4-Cascade-2015-1-0_ln_42056.txt,"('shane a. moniz', ' ', 'smoniz@connandassocia...","Friday, May 08, 2015 11:38 AM","[('brooks', 'hayes', 'brooks@culpeppercc.com')]","[('ryan', 'grindler', 'rgrindler@101tally.com'...",,Edison RFI's,RFI 008 Response.pdf; RFI 042 Response.pdf; RF...,,False,...,,207.191781,575.220722,1203.369863,3298.465917,73,73,1431081000.0,2015-05-08,10:38
1,41353_fn_10-3-Cascade-2014-2-0_ln_16811.txt,"('mark', 'beaudoin', 'mark.beaudoin@talgov.com')","Tuesday, August 19, 2014 5:21 PM","[('kristen', 'coons', 'kristen.coons@talgov.co...","[('patrick', 'hurley', 'patrick_hurley')]",,RE: Revised lease,,,False,...,,216.734406,954.868984,1136.110664,5063.717627,497,497,1408465000.0,2014-08-19,16:21
2,01216_fn_17-2-IB2013-1-0_ln_44062.txt,"('ken', 'morris', 'morrisk@leoncountyfl.gov')","Tuesday, May 28, 2013 1:22 PM","[('kim', 'rivers', 'kim@inkbridge.com')]","[('jon', 'brown', 'brownjon@leoncountyfl.gov')...",,Proposed Imagine Schedule,,,False,...,,789.666667,1367.963784,4514.0,8106.663649,117,117,1369744000.0,2013-05-28,12:22
3,32414_fn_32-1-PaigeCS-1-1_ln_18843.txt,"('jennifer', 'naff', 'director@springtimetalla...","Wednesday, March 15, 2017 5:32 PM","[('alison', 'faris', 'alison.faris@talgov.com')]","[('paige', 'carter', 'paige.tallahasseedowntow...",,RE: Springtime Parade,,,False,...,,448.125,684.897547,2302.75,3555.282145,16,16,1489596000.0,2017-03-15,16:32
4,56710_fn_31-2-GaryYordon2-3_ln_41027.txt,"('paul', 'lamaster', 'paullamaster250@comcast....","Sunday, June 7, 2015 at 1:33:39 PM Eastern Day...","[('andrew', 'gillum', 'andrew.gillum@talgov.co...","[('f72764', 'l72764', 'alan.williams@my')]",,Tony Carvajal: Develop strategy to get ahead o...,,,False,...,,178.145418,131.073968,1212.14484,686.257253,3459,3459,1433684000.0,2015-06-07,13:29


* Tranform UTC time into the number of seconds since a fixed time.
* Adjust the time using ADJ_data column
* Tranform back to time

In [18]:
for e in exceptions:
    print("==> ", e)

In [99]:
def normalize_sent(name, exceptions):
    """ 
        transform `name` into a proper date for postprocessing
    """
    try:
        name = rex.sub(r'\A(Mon|Tue|Wed|Thur|Fri|Sat|Sun)\w+, ?(.*\Z)', r'\2', name)
        name = rex.sub(' Eastern Daylight Time.*', '', name)  # difference between Daylight and Standard
        name = rex.sub(' Eastern Standard Time.*', '', name)
        name = rex.sub('(GMT-05:00)', 'EST', name)
        name = rex.sub('(\d:\d\d):\d\d', r'\1', name)
        name = rex.sub('(\d, \d{4}) at (\d?\d:\d?)', r'\1 \2', name)
        name = rex.sub('(\d:\d\d \w\w) (EDT|EST|HST|EDT|PDT)', r'\1', name)
        name = rex.sub(r',', ' ', name)
        name = rex.sub('\s+', ' ', name)
        name = rex.sub('\A(\s?(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday|Mon|Tue|Wed|Thu|Fri|Sat|Sun))\s(.*\Z)', r'\3', name)
        new_date = datetime.strptime(name.strip(), "%A %B %d %Y %I:%M %p") 
    except:
        try:
            new_date = datetime.strptime(name.strip(), "%B %d %Y %I:%M %p")
        except:
            try:
                new_date = datetime.strptime(name.strip(), "%b %d %Y %I:%M %p")
            except:
                if type(name) == str:  # exclude nan
                    exceptions.append(name)
                pass

In [100]:
name = 'Dec 29 2013 9:14 AM'
new_date = datetime.strptime(name.strip(), "%b %d %Y %I:%M %p")

In [101]:
exceptions = []
for sent in sent_lst:
    normalize_sent(sent, exceptions)

In [102]:
len(exceptions), len(sent_lst)

(806, 39444)

In [103]:
name = 'April 24, 2013 8:46 AM EDT'
name = rex.sub('(\d:\d\d \w\w) EDT', r'\1', name)

print(name)

April 24, 2013 8:46 AM


In [104]:
exceptions

['9 May 2017 21:08 +0000',
 'November 18 2016 4:33 PM Ride Yellow',
 '2/3/17 2:12 PM (EST)',
 'May 13 201611:18 AM',
 '?January? ?28? ?2014 ?10?:?10? ?AM',
 'May 19 2016 at 1>15>45 PM EDT',
 '12/31/2012 8:28 AM',
 ' 4/3/2013 5:32 PM',
 '- May 5 2013 2:26 PM',
 '2/3/17 1:29 PM (EST)',
 ' Whom It May Concern',
 ' January 11 2013 9:43 AM Newspaper clips 1-11-13 clips 1-11-13.docx Kelley Lizzy <Lizzy.Kelley@talgov.com Friday January 11 2013 9:43 AM',
 'January 4 ',
 '2/3/17 2:12 PM (EST)',
 '?January? ?13? ?2014 ?4?:?49? ?PM',
 '02/15/2013 4:30 PM (EST)',
 '18 July 2013 14:14',
 ' 4/3/2013 12:43 PM',
 '11/23/2015 9:57 AM (EST)',
 '12/05/2013 8:05 AM (EST)',
 '2/16/17 3:25 PM (EST)',
 'May 25 2015 4:35 PM Monday February 12 2018 7:42 PM',
 '2/8/2016 5:40 PM',
 ' paige.tallahasseedowntown@gmail.com',
 '20 Mar 2013 17:40 -0400',
 '2017-02-12 13:59 EST',
 ' high poverty and crime areas.',
 'February 20 2013 from 8am to 5pm',
 'February 24 6:47 PM',
 ' 3/16/2013 2:26 PM',
 '1/20/17 10:28 AM (ES