# Lecture 10 Cleaning Data - Part 2 - Formatting Values
__Math 3080: Fundamentals of Data Science__

Reading:
* [McKinney, *Python for Data Science*, Chapter 6](https://wesmckinney.com/book/accessing-data)
* [McKinney, *Python for Data Science*, Chapter 11](https://wesmckinney.com/book/time-series)

Class notes are found through GitHub. As changes are made, they will automatically be uploaded to GitHub. A link to the repository is on Canvas.

-----
## Outline
* String Splits
* Data Types
* Datetime format

In [3]:
import pandas as pd

data = pd.DataFrame(['2024-01-01 12:00:00',
                     '2024-01-02 12:04:06',
                     '2024-01-03 11:24:56',
                     '2024-01-04 11:42:07',
                     '2024-01-05 12:15:33',
                     '2024-01-06 11:59:53'], columns=['Date/Time'])

data

Unnamed: 0,Date/Time
0,2024-01-01 12:00:00
1,2024-01-02 12:04:06
2,2024-01-03 11:24:56
3,2024-01-04 11:42:07
4,2024-01-05 12:15:33
5,2024-01-06 11:59:53


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Date/Time  6 non-null      object
dtypes: object(1)
memory usage: 176.0+ bytes


In [8]:
data['Date/Time'].str.split(' ')

0    [2024-01-01, 12:00:00]
1    [2024-01-02, 12:04:06]
2    [2024-01-03, 11:24:56]
3    [2024-01-04, 11:42:07]
4    [2024-01-05, 12:15:33]
5    [2024-01-06, 11:59:53]
Name: Date/Time, dtype: object

In [10]:
data['Date'], data['Time'] = zip(*data['Date/Time'].str.split(' '))
data

Unnamed: 0,Date/Time,Date,Time
0,2024-01-01 12:00:00,2024-01-01,12:00:00
1,2024-01-02 12:04:06,2024-01-02,12:04:06
2,2024-01-03 11:24:56,2024-01-03,11:24:56
3,2024-01-04 11:42:07,2024-01-04,11:42:07
4,2024-01-05 12:15:33,2024-01-05,12:15:33
5,2024-01-06 11:59:53,2024-01-06,11:59:53


In [11]:
data['Year'], data['Month'], data['Day'] = zip(*data['Date'].str.split('-'))
data['Hour'], data['Minute'], data['Second'] = zip(*data['Time'].str.split(':'))
data

Unnamed: 0,Date/Time,Date,Time,Year,Month,Day,Hour,Minute,Second
0,2024-01-01 12:00:00,2024-01-01,12:00:00,2024,1,1,12,0,0
1,2024-01-02 12:04:06,2024-01-02,12:04:06,2024,1,2,12,4,6
2,2024-01-03 11:24:56,2024-01-03,11:24:56,2024,1,3,11,24,56
3,2024-01-04 11:42:07,2024-01-04,11:42:07,2024,1,4,11,42,7
4,2024-01-05 12:15:33,2024-01-05,12:15:33,2024,1,5,12,15,33
5,2024-01-06 11:59:53,2024-01-06,11:59:53,2024,1,6,11,59,53


In [12]:
data['Minute'].sum()

'000424421559'

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Date/Time  6 non-null      object
 1   Date       6 non-null      object
 2   Time       6 non-null      object
 3   Year       6 non-null      object
 4   Month      6 non-null      object
 5   Day        6 non-null      object
 6   Hour       6 non-null      object
 7   Minute     6 non-null      object
 8   Second     6 non-null      object
dtypes: object(9)
memory usage: 560.0+ bytes


In [15]:
data['Hour'] = data['Hour'].apply(int)
data['Minute'] = data['Minute'].apply(int)
data['Second'] = data['Second'].apply(int)
data

Unnamed: 0,Date/Time,Date,Time,Year,Month,Day,Hour,Minute,Second
0,2024-01-01 12:00:00,2024-01-01,12:00:00,2024,1,1,12,0,0
1,2024-01-02 12:04:06,2024-01-02,12:04:06,2024,1,2,12,4,6
2,2024-01-03 11:24:56,2024-01-03,11:24:56,2024,1,3,11,24,56
3,2024-01-04 11:42:07,2024-01-04,11:42:07,2024,1,4,11,42,7
4,2024-01-05 12:15:33,2024-01-05,12:15:33,2024,1,5,12,15,33
5,2024-01-06 11:59:53,2024-01-06,11:59:53,2024,1,6,11,59,53


In [16]:
data['Minute'].sum()

144

In [18]:
#data.apply(lambda x: x['Year'] + x['Month']/12 + x['Day']/(12*30), axis=1)
data.apply(lambda x: x['Year'][3] + x['Month'][1], axis=1)

0    41
1    41
2    41
3    41
4    41
5    41
dtype: object

In [19]:
data['Date/Time']

0    2024-01-01 12:00:00
1    2024-01-02 12:04:06
2    2024-01-03 11:24:56
3    2024-01-04 11:42:07
4    2024-01-05 12:15:33
5    2024-01-06 11:59:53
Name: Date/Time, dtype: object

In [23]:
data['Date/Time'].apply(lambda x: x[8:10])

0    01
1    02
2    03
3    04
4    05
5    06
Name: Date/Time, dtype: object

In [25]:
# Datetime Format
from datetime import datetime

datetime.now()

datetime.datetime(2024, 1, 29, 17, 54, 49, 142959)

In [27]:
now = datetime.now()
print(now.month, now.day, ", ", now.year)
print(now.hour, ":", now.minute + now.second/60)

1 29 ,  2024
17 : 57.7


In [28]:
# Convert a date into datetime format
timestamp = datetime(2018, 5, 5, 8, 15, 54)
timestamp

datetime.datetime(2018, 5, 5, 8, 15, 54)

In [29]:
print(timestamp.hour, ":", timestamp.minute + timestamp.second/60)

8 : 15.9


In [31]:
timestamp.date()

datetime.date(2018, 5, 5)

In [32]:
timestamp.time()

datetime.time(8, 15, 54)

In [34]:
# Timezones
import pytz

tz = pytz.timezone('UTC')
tz2 = pytz.timezone('US/Eastern')

print(datetime.now())
print(datetime.now(tz))
print(datetime.now(tz2))

2024-01-29 18:04:08.821157
2024-01-30 01:04:08.821477+00:00
2024-01-29 20:04:08.821624-05:00


In [35]:
# Changes in Time
datetime(2024, 1, 31) - datetime(2001, 5, 9)

datetime.timedelta(days=8302)

In [36]:
datetime(2024, 1, 31, 9, 15) - datetime(2001, 5, 9, 7, 0)

datetime.timedelta(days=8302, seconds=8100)

In [38]:
from datetime import timedelta
datetime(2024,1,31) - timedelta(days=8302)

datetime.datetime(2001, 5, 9, 0, 0)

In [39]:
# Datetime to string
str(datetime.now())

'2024-01-29 18:12:05.195113'

To see the *datetime* format specification list, see tables 11.2 and 11.3 in your textbook.

In [43]:
datetime.now().strftime("%H:%M:%s %d %h %Y")

'18:14:1706577248 29 Jan 2024'

In [44]:
# String to Datetime
the_date = "May 9, 2001"
datetime.strptime(the_date, "%b %d, %Y")

datetime.datetime(2001, 5, 9, 0, 0)

In [46]:
display(data)
print(data.info())

data['Date/Time'] = pd.to_datetime(data['Date/Time'])

display(data)
print(data.info())

Unnamed: 0,Date/Time,Date,Time,Year,Month,Day,Hour,Minute,Second
0,2024-01-01 12:00:00,2024-01-01,12:00:00,2024,1,1,12,0,0
1,2024-01-02 12:04:06,2024-01-02,12:04:06,2024,1,2,12,4,6
2,2024-01-03 11:24:56,2024-01-03,11:24:56,2024,1,3,11,24,56
3,2024-01-04 11:42:07,2024-01-04,11:42:07,2024,1,4,11,42,7
4,2024-01-05 12:15:33,2024-01-05,12:15:33,2024,1,5,12,15,33
5,2024-01-06 11:59:53,2024-01-06,11:59:53,2024,1,6,11,59,53


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date/Time  6 non-null      datetime64[ns]
 1   Date       6 non-null      object        
 2   Time       6 non-null      object        
 3   Year       6 non-null      object        
 4   Month      6 non-null      object        
 5   Day        6 non-null      object        
 6   Hour       6 non-null      int64         
 7   Minute     6 non-null      int64         
 8   Second     6 non-null      int64         
dtypes: datetime64[ns](1), int64(3), object(5)
memory usage: 560.0+ bytes
None


Unnamed: 0,Date/Time,Date,Time,Year,Month,Day,Hour,Minute,Second
0,2024-01-01 12:00:00,2024-01-01,12:00:00,2024,1,1,12,0,0
1,2024-01-02 12:04:06,2024-01-02,12:04:06,2024,1,2,12,4,6
2,2024-01-03 11:24:56,2024-01-03,11:24:56,2024,1,3,11,24,56
3,2024-01-04 11:42:07,2024-01-04,11:42:07,2024,1,4,11,42,7
4,2024-01-05 12:15:33,2024-01-05,12:15:33,2024,1,5,12,15,33
5,2024-01-06 11:59:53,2024-01-06,11:59:53,2024,1,6,11,59,53


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date/Time  6 non-null      datetime64[ns]
 1   Date       6 non-null      object        
 2   Time       6 non-null      object        
 3   Year       6 non-null      object        
 4   Month      6 non-null      object        
 5   Day        6 non-null      object        
 6   Hour       6 non-null      int64         
 7   Minute     6 non-null      int64         
 8   Second     6 non-null      int64         
dtypes: datetime64[ns](1), int64(3), object(5)
memory usage: 560.0+ bytes
None


In [60]:
data = pd.DataFrame(['2024-01-01 12:00:00',
                     '2024-01-02 12:04:06',
                     '2024-01-03 11:24:56',
                     '2024-01-04 11:42:07',
                     '2024-01-05 12:15:33',
                     '2024-01-06 11:59:53'], columns=['Date/Time'])

data['Date/Time'] = pd.to_datetime(data['Date/Time'])

data['Date'] = data['Date/Time'].apply(lambda x: x.date())
data['Time'] = data['Date/Time'].apply(lambda x: x.time())
data['Year'] = data['Date/Time'].apply(lambda x: x.year)
data['Month'] = data['Date/Time'].apply(lambda x: x.month)
data['Day'] = data['Date/Time'].apply(lambda x: x.day)
data['Hour'] = data['Date/Time'].apply(lambda x: x.hour)
data['Minute'] = data['Date/Time'].apply(lambda x: x.minute)
data['Second'] = data['Date/Time'].apply(lambda x: x.second)
display(data)
data.info()

Unnamed: 0,Date/Time,Date,Time,Year,Month,Day,Hour,Minute,Second
0,2024-01-01 12:00:00,2024-01-01,12:00:00,2024,1,1,12,0,0
1,2024-01-02 12:04:06,2024-01-02,12:04:06,2024,1,2,12,4,6
2,2024-01-03 11:24:56,2024-01-03,11:24:56,2024,1,3,11,24,56
3,2024-01-04 11:42:07,2024-01-04,11:42:07,2024,1,4,11,42,7
4,2024-01-05 12:15:33,2024-01-05,12:15:33,2024,1,5,12,15,33
5,2024-01-06 11:59:53,2024-01-06,11:59:53,2024,1,6,11,59,53


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date/Time  6 non-null      datetime64[ns]
 1   Date       6 non-null      object        
 2   Time       6 non-null      object        
 3   Year       6 non-null      int64         
 4   Month      6 non-null      int64         
 5   Day        6 non-null      int64         
 6   Hour       6 non-null      int64         
 7   Minute     6 non-null      int64         
 8   Second     6 non-null      int64         
dtypes: datetime64[ns](1), int64(6), object(2)
memory usage: 560.0+ bytes


In [49]:
data['Day'] = data['Date/Time'].apply(datetime.day)
data.info()

TypeError: 'getset_descriptor' object is not callable