# Tremor dataframe prep

In [1]:
import pandas as pd

## Step 1 - read pickle file into pandas
Because the pickle file is actually a list with one element (the dataframe), we can select the first element of the list with standard Python list indexing `[0]`

In [2]:
# The pickle stores a one-element list, so [0] pulls out the DataFrame itself.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
tremors = pd.read_pickle('../data/tremor/aghosh.pkl')[0]

## Step 2 - make column headings lowercase
This is a personal stylistic choice, but I think it helps reduce typos. It also means you don't have to guess whether or not a column has capital letters.

In [3]:
# Lowercase every column label, then display the resulting labels as a check.
tremors = tremors.rename(columns=str.lower)
tremors.columns

Index(['year', 'month', 'day', 'hour', 'minute', 'latitude', 'longitude',
       'depth', 'errlat', 'errlon', 'errdepth'],
      dtype='object')

In [4]:
# Preview the first rows to confirm the lowercase column names were applied.
tremors.head()

Unnamed: 0,year,month,day,hour,minute,latitude,longitude,depth,errlat,errlon,errdepth
0,2009,6,20,1,26,48.2325,-123.065,28.0,10.423439,7.125217,17.262677
1,2009,6,20,1,28,48.1725,-123.05,23.75,2.451065,1.355398,3.774917
2,2009,6,20,9,20,48.453333,-123.066667,55.0,9.907575,2.795187,7.0
3,2009,6,20,21,3,48.403333,-123.726667,53.333333,8.934842,11.118311,31.659648
4,2009,6,21,0,42,48.03,-122.875,21.0,4.328503,7.15337,12.909944


## Step 3 - create single datetime column from date and time columns
Pandas is very helpful here. You can use `pd.to_datetime()` and give it all of the columns that make up the datetime. Pandas identifies each component by its column name (`year`, `month`, `day`, `hour`, `minute`, ...), so the order of the columns does not matter, and it puts the values together into one datetime.

In [5]:
# Build one timestamp per row from the date/time component columns.
# No seconds column is supplied, so the seconds come out as :00.
tremors['datetime'] = pd.to_datetime(tremors[['year','month','day','hour','minute']])

Note that I used some shortcut notation to specify the columns: 
```python
tremors[['year','month','day','hour','minute']]
```

I could have also used the more verbose but probably a little more technically correct:
```python
tremors.loc[:,['year','month','day','hour','minute']]
```

In [6]:
# Confirm the new 'datetime' column exists; it is appended as the last column.
tremors.head()

Unnamed: 0,year,month,day,hour,minute,latitude,longitude,depth,errlat,errlon,errdepth,datetime
0,2009,6,20,1,26,48.2325,-123.065,28.0,10.423439,7.125217,17.262677,2009-06-20 01:26:00
1,2009,6,20,1,28,48.1725,-123.05,23.75,2.451065,1.355398,3.774917,2009-06-20 01:28:00
2,2009,6,20,9,20,48.453333,-123.066667,55.0,9.907575,2.795187,7.0,2009-06-20 09:20:00
3,2009,6,20,21,3,48.403333,-123.726667,53.333333,8.934842,11.118311,31.659648,2009-06-20 21:03:00
4,2009,6,21,0,42,48.03,-122.875,21.0,4.328503,7.15337,12.909944,2009-06-21 00:42:00


## Step 4 - move `'datetime'` to front of the dataframe
This is a totally unnecessary step, but I just kind of like having the datetime in the first data column.

In [7]:
# Put 'datetime' first and keep every other column in its existing order.
# The remaining columns are picked by name rather than by slicing off the last
# position, so this cell is safe to re-run: a positional [:-1] slice would
# duplicate 'datetime' and drop 'errdepth' on a second execution.
tremors = tremors[
    ['datetime'] + [col for col in tremors.columns if col != 'datetime']
]

## Step 5 - check data types of the columns
Just looking for potential weirdness. When you read a file into pandas, you can tell it explicitly what data types to give the columns. However, pandas is pretty smart and usually can guess what data types make sense.

In [8]:
# List each column's dtype to spot anything pandas inferred unexpectedly.
tremors.dtypes

datetime     datetime64[ns]
year                  int64
month                 int64
day                   int64
hour                  int64
minute                int64
latitude            float64
longitude           float64
depth               float64
errlat              float64
errlon              float64
errdepth            float64
dtype: object

Everything looks good. If we wanted to be really pedantic, we might say that the month should be a category not an integer because 1 (Jan) + 2 (Feb) != 3 (March), but we aren't likely to use that column anyway.

## Step 6 - save new complete df to pickle file

In [9]:
# Write the full prepared DataFrame (with 'datetime' as the first column) to disk.
tremors.to_pickle('../data/tremor/tremors.pkl')

## Step 7 - save subset of 3000 rows to pickle file

In [10]:
# Save only the first 3000 rows (selected by position) as a smaller sample file.
tremors.iloc[:3000].to_pickle('../data/tremor/tremors_n3000.pkl')

## Step 8 - test reading pkl files
Make sure those pickle files can be read back in and contain what we expect.

In [11]:
# Read the full pickle back into a separate variable so it can be checked
# independently of the in-memory `tremors` frame.
test_df = pd.read_pickle('../data/tremor/tremors.pkl')

In [12]:
# Fail loudly if the reloaded frame does not match the in-memory one
# (dtypes, column order, index and values), instead of relying on eyeballing.
pd.testing.assert_frame_equal(test_df, tremors)
# 12 columns expected: the 11 original columns plus 'datetime'.
test_df.shape

(28902, 12)

In [13]:
# Spot-check the first rows of the reloaded frame; 'datetime' should be the first column.
test_df.head()

Unnamed: 0,datetime,year,month,day,hour,minute,latitude,longitude,depth,errlat,errlon,errdepth
0,2009-06-20 01:26:00,2009,6,20,1,26,48.2325,-123.065,28.0,10.423439,7.125217,17.262677
1,2009-06-20 01:28:00,2009,6,20,1,28,48.1725,-123.05,23.75,2.451065,1.355398,3.774917
2,2009-06-20 09:20:00,2009,6,20,9,20,48.453333,-123.066667,55.0,9.907575,2.795187,7.0
3,2009-06-20 21:03:00,2009,6,20,21,3,48.403333,-123.726667,53.333333,8.934842,11.118311,31.659648
4,2009-06-21 00:42:00,2009,6,21,0,42,48.03,-122.875,21.0,4.328503,7.15337,12.909944


In [14]:
# The dtypes should be the same as those shown for `tremors` before saving.
test_df.dtypes

datetime     datetime64[ns]
year                  int64
month                 int64
day                   int64
hour                  int64
minute                int64
latitude            float64
longitude           float64
depth               float64
errlat              float64
errlon              float64
errdepth            float64
dtype: object

In [15]:
# Read the 3000-row subset back in so it can be checked the same way as the full file.
test_3k = pd.read_pickle('../data/tremor/tremors_n3000.pkl')

In [16]:
# Fail loudly if the reloaded subset does not match the first 3000 rows of the
# in-memory frame (dtypes, column order, index and values).
pd.testing.assert_frame_equal(test_3k, tremors.iloc[:3000])
# 3000 rows expected, with the same 12 columns as the full frame.
test_3k.shape

(3000, 12)

In [17]:
# Spot-check the first rows of the reloaded subset.
test_3k.head()

Unnamed: 0,datetime,year,month,day,hour,minute,latitude,longitude,depth,errlat,errlon,errdepth
0,2009-06-20 01:26:00,2009,6,20,1,26,48.2325,-123.065,28.0,10.423439,7.125217,17.262677
1,2009-06-20 01:28:00,2009,6,20,1,28,48.1725,-123.05,23.75,2.451065,1.355398,3.774917
2,2009-06-20 09:20:00,2009,6,20,9,20,48.453333,-123.066667,55.0,9.907575,2.795187,7.0
3,2009-06-20 21:03:00,2009,6,20,21,3,48.403333,-123.726667,53.333333,8.934842,11.118311,31.659648
4,2009-06-21 00:42:00,2009,6,21,0,42,48.03,-122.875,21.0,4.328503,7.15337,12.909944


In [18]:
# The dtypes should be the same as those shown for `tremors` before saving.
test_3k.dtypes

datetime     datetime64[ns]
year                  int64
month                 int64
day                   int64
hour                  int64
minute                int64
latitude            float64
longitude           float64
depth               float64
errlat              float64
errlon              float64
errdepth            float64
dtype: object