# Preprocessing

In niimpy, **preprocessing** refers to common processing done after loading a dataframe but before computing summary statistics or doing analysis.  Most of this is done using normal pandas operations.

In [1]:
import niimpy

# Read the sample CSV referenced by niimpy.sampledata.MULTIUSER_AWAREBATTERY_CSV,
# passing 'Europe/Helsinki' as the time zone. The result is the dataframe used
# by every cell below.
df = niimpy.read_csv(
    niimpy.sampledata.MULTIUSER_AWAREBATTERY_CSV,
    tz='Europe/Helsinki',
)

## Extracting rows for only certain users

In [2]:
# Boolean-mask selection: keep only the rows whose 'user' column equals this ID.
df.loc[df['user'] == 'jd9INuQ5BBlW']

Unnamed: 0,user,device,time,battery_level,battery_status,battery_health,battery_adaptor,datetime
2020-01-09 02:20:02.924999936+02:00,jd9INuQ5BBlW,3p83yASkOb_B,1.578529e+09,74,3,2,0,2020-01-09 02:20:02.924999936+02:00
2020-01-09 02:21:30.405999872+02:00,jd9INuQ5BBlW,3p83yASkOb_B,1.578529e+09,73,3,2,0,2020-01-09 02:21:30.405999872+02:00
2020-01-09 02:24:12.805999872+02:00,jd9INuQ5BBlW,3p83yASkOb_B,1.578529e+09,72,3,2,0,2020-01-09 02:24:12.805999872+02:00
2020-01-09 02:35:38.561000192+02:00,jd9INuQ5BBlW,3p83yASkOb_B,1.578530e+09,72,2,2,0,2020-01-09 02:35:38.561000192+02:00
2020-01-09 02:35:38.953000192+02:00,jd9INuQ5BBlW,3p83yASkOb_B,1.578530e+09,72,2,2,2,2020-01-09 02:35:38.953000192+02:00
...,...,...,...,...,...,...,...,...
2020-01-09 23:02:13.938999808+02:00,jd9INuQ5BBlW,OWd1Uau8POix,1.578604e+09,73,3,2,0,2020-01-09 23:02:13.938999808+02:00
2020-01-09 23:10:37.262000128+02:00,jd9INuQ5BBlW,OWd1Uau8POix,1.578604e+09,73,3,2,0,2020-01-09 23:10:37.262000128+02:00
2020-01-09 23:22:13.966000128+02:00,jd9INuQ5BBlW,OWd1Uau8POix,1.578605e+09,72,3,2,0,2020-01-09 23:22:13.966000128+02:00
2020-01-09 23:32:13.959000064+02:00,jd9INuQ5BBlW,OWd1Uau8POix,1.578606e+09,71,3,2,0,2020-01-09 23:32:13.959000064+02:00


## Extracting rows for a certain time range

In [3]:
# Keep only the rows whose index timestamp is strictly later than the given time.
df.loc[df.index > '2020-01-09 10:00']

Unnamed: 0,user,device,time,battery_level,battery_status,battery_health,battery_adaptor,datetime
2020-01-09 10:03:23.898000128+02:00,jd9INuQ5BBlW,3p83yASkOb_B,1.578557e+09,100,2,2,2,2020-01-09 10:03:23.898000128+02:00
2020-01-09 10:03:33.844999936+02:00,jd9INuQ5BBlW,3p83yASkOb_B,1.578557e+09,100,5,2,2,2020-01-09 10:03:33.844999936+02:00
2020-01-09 10:13:34.990000128+02:00,jd9INuQ5BBlW,OWd1Uau8POix,1.578558e+09,100,5,2,1,2020-01-09 10:13:34.990000128+02:00
2020-01-09 10:27:35.035000064+02:00,jd9INuQ5BBlW,OWd1Uau8POix,1.578558e+09,100,5,2,1,2020-01-09 10:27:35.035000064+02:00
2020-01-09 10:42:35.010999808+02:00,jd9INuQ5BBlW,OWd1Uau8POix,1.578559e+09,100,5,2,1,2020-01-09 10:42:35.010999808+02:00
...,...,...,...,...,...,...,...,...
2020-01-09 23:02:13.938999808+02:00,jd9INuQ5BBlW,OWd1Uau8POix,1.578604e+09,73,3,2,0,2020-01-09 23:02:13.938999808+02:00
2020-01-09 23:10:37.262000128+02:00,jd9INuQ5BBlW,OWd1Uau8POix,1.578604e+09,73,3,2,0,2020-01-09 23:10:37.262000128+02:00
2020-01-09 23:22:13.966000128+02:00,jd9INuQ5BBlW,OWd1Uau8POix,1.578605e+09,72,3,2,0,2020-01-09 23:22:13.966000128+02:00
2020-01-09 23:32:13.959000064+02:00,jd9INuQ5BBlW,OWd1Uau8POix,1.578606e+09,71,3,2,0,2020-01-09 23:32:13.959000064+02:00


In [4]:
# Combine two masks with & to keep rows strictly between the two bounds
# (both ends are exclusive).
df.loc[(df.index > '2020-01-09 10:00') & (df.index < '2020-01-09 12:00')]

Unnamed: 0,user,device,time,battery_level,battery_status,battery_health,battery_adaptor,datetime
2020-01-09 10:03:23.898000128+02:00,jd9INuQ5BBlW,3p83yASkOb_B,1578557000.0,100,2,2,2,2020-01-09 10:03:23.898000128+02:00
2020-01-09 10:03:33.844999936+02:00,jd9INuQ5BBlW,3p83yASkOb_B,1578557000.0,100,5,2,2,2020-01-09 10:03:33.844999936+02:00
2020-01-09 10:13:34.990000128+02:00,jd9INuQ5BBlW,OWd1Uau8POix,1578558000.0,100,5,2,1,2020-01-09 10:13:34.990000128+02:00
2020-01-09 10:27:35.035000064+02:00,jd9INuQ5BBlW,OWd1Uau8POix,1578558000.0,100,5,2,1,2020-01-09 10:27:35.035000064+02:00
2020-01-09 10:42:35.010999808+02:00,jd9INuQ5BBlW,OWd1Uau8POix,1578559000.0,100,5,2,1,2020-01-09 10:42:35.010999808+02:00
2020-01-09 10:57:35.044000+02:00,jd9INuQ5BBlW,OWd1Uau8POix,1578560000.0,100,5,2,1,2020-01-09 10:57:35.044000+02:00
2020-01-09 10:58:25.058000128+02:00,jd9INuQ5BBlW,OWd1Uau8POix,1578560000.0,100,3,2,0,2020-01-09 10:58:25.058000128+02:00
2020-01-09 11:07:43.846999808+02:00,jd9INuQ5BBlW,OWd1Uau8POix,1578561000.0,99,3,2,0,2020-01-09 11:07:43.846999808+02:00
2020-01-09 11:28:24.970000128+02:00,jd9INuQ5BBlW,OWd1Uau8POix,1578562000.0,98,3,2,0,2020-01-09 11:28:24.970000128+02:00
2020-01-09 11:48:24.966000128+02:00,jd9INuQ5BBlW,OWd1Uau8POix,1578563000.0,97,3,2,0,2020-01-09 11:48:24.966000128+02:00


## Timestamps of all data

The timestamps of the data are contained in the index:

In [5]:
# The dataframe's index holds one timestamp per row; display it.
df.index

DatetimeIndex(['2020-01-09 02:20:02.924999936+02:00',
               '2020-01-09 02:21:30.405999872+02:00',
               '2020-01-09 02:24:12.805999872+02:00',
               '2020-01-09 02:35:38.561000192+02:00',
               '2020-01-09 02:35:38.953000192+02:00',
               '2020-01-09 02:37:13.282000128+02:00',
               '2020-01-09 02:37:28.365999872+02:00',
               '2020-01-09 02:37:33.743000064+02:00',
               '2020-01-09 02:41:02.887000064+02:00',
               '2020-01-09 02:41:45.263000064+02:00',
               ...
                  '2020-01-09 21:45:01.372000+02:00',
               '2020-01-09 21:52:43.584999936+02:00',
               '2020-01-09 22:12:13.954999808+02:00',
               '2020-01-09 22:21:04.062000128+02:00',
               '2020-01-09 22:42:13.961999872+02:00',
               '2020-01-09 23:02:13.938999808+02:00',
               '2020-01-09 23:10:37.262000128+02:00',
               '2020-01-09 23:22:13.966000128+02:00',
         

## Get the first or last timestamps

In [6]:
# Earliest timestamp in the data. Use the vectorised Index.min() rather than
# the builtin min(), which iterates over every element in Python.
# Note: the rows are not sorted by time (the first index entry shown above is
# later than this minimum), so df.index[0] would not give the same answer.
df.index.min()

Timestamp('2020-01-09 02:02:26.104999936+0200', tz='Europe/Helsinki')

In [7]:
# Latest timestamp in the data. Use the vectorised Index.max() rather than
# the builtin max(), which iterates over every element in Python.
# Note: the rows are not sorted by time, so df.index[-1] is not guaranteed
# to be the maximum.
df.index.max()

Timestamp('2020-01-09 23:39:06.800000+0200', tz='Europe/Helsinki')