# Cleaning MagnaProbe Data
## This notebook demonstrates how the `magnaprobe` module uses the pandas and geopandas Python libraries to clean up raw MagnaProbe data.
### Notice how the DataFrame changes in each cell. New columns are appended to the end.

In [33]:
# Import the MagnaProbe cleaning module and print its built-in help text,
# which lists the functions the module provides.
import magnaprobe
help(magnaprobe)

Help on module magnaprobe:

NAME
    magnaprobe - # A Module to Clean MagnaProbe Data

FUNCTIONS
    consolidate_coords(df)
        Consolidate coordinate information that is split across many columns
    
    convert_depth_cm_to_m(df)
        Convert snow depth in cm to snow depth in m. cm column is retained.
    
    convert_wgs_to_utm(gdf, epsg_code)
        Convert WGS 84 GeoDataFrame to UTM GeoDataFrame
    
    create_geodataframe(df)
        Create GeoDataFrame with WGS 84 Spatial Reference
    
    create_geometry(df)
        Add Geometry column to specify lat and lon are special, i.e. point vector data
    
    drop_calibration_points(df, calibration_prefix, c1, c2)
        Drop calibration points.
        
        Calibration points are typically sequences of repeating or alternating ~0 cm and 
        ~120 cm depth measurements. Ideally these are keyed with a different counter 
        number when collected in the field, e.g. they all start with 99.
        But we can also c

In [34]:
# Load the raw MagnaProbe logger file and preview the first rows.
# NOTE(review): the meaning of the second argument (1) is not visible in this
# notebook; presumably it identifies the header row -- confirm against
# magnaprobe.read_tabular.
raw_data_path = 'example_data/Geo2_4_raw.dat'
raw = magnaprobe.read_tabular(raw_data_path, 1)
raw.head()

Unnamed: 0,TIMESTAMP,RECORD,Counter,DepthCm,BattVolts,latitude_a,latitude_b,Longitude_a,Longitude_b,fix_quality,...,altitudeB,DepthVolts,LatitudeDDDDD,LongitudeDDDDD,month,dayofmonth,hourofday,minutes,seconds,microseconds
0,TS,RN,,,,degrees,minutes,degrees,minutes,unitless,...,,,,,,,,,,
1,,,Smp,Smp,Smp,Smp,Smp,Smp,Smp,Smp,...,Smp,Smp,Smp,Smp,Smp,Smp,Smp,Smp,Smp,Smp
2,2017-11-14 11:27:00.75,5721,100001,7.283,13.75,65,2.4724,-147,-25.0191,1,...,723.9,0.48,0.04120666,-0.416985,11,14,11,27,0,750000
3,2017-11-14 15:14:14.75,5722,100002,7.732,13.78,65,2.4724,-147,-25.0163,1,...,723.1,0.51,0.04120666,-0.4169383,11,14,15,14,14,750000
4,2017-11-14 15:14:15,5723,100003,4.138,13.75,65,2.4721,-147,-25.0147,1,...,729.4,0.28,0.04120166,-0.4169117,11,14,15,14,15,0


In [35]:
# Remove the non-data rows at the top of the table (the units row and the
# 'Smp' row, index 0 and 1 in the raw preview).
# The return value is not captured; the preview of `raw` after this call no
# longer contains those rows, so `raw` is modified in place.
# NOTE(review): because `raw` is changed in place, re-running this cell on the
# same kernel may strip further rows or fail -- re-run the load cell first;
# confirm against magnaprobe.strip_junk_rows.
magnaprobe.strip_junk_rows(raw, 2)
raw.head()

Unnamed: 0,TIMESTAMP,RECORD,Counter,DepthCm,BattVolts,latitude_a,latitude_b,Longitude_a,Longitude_b,fix_quality,...,altitudeB,DepthVolts,LatitudeDDDDD,LongitudeDDDDD,month,dayofmonth,hourofday,minutes,seconds,microseconds
2,2017-11-14 11:27:00.75,5721,100001,7.283,13.75,65,2.4724,-147,-25.0191,1,...,723.9,0.48,0.04120666,-0.416985,11,14,11,27,0,750000
3,2017-11-14 15:14:14.75,5722,100002,7.732,13.78,65,2.4724,-147,-25.0163,1,...,723.1,0.51,0.04120666,-0.4169383,11,14,15,14,14,750000
4,2017-11-14 15:14:15,5723,100003,4.138,13.75,65,2.4721,-147,-25.0147,1,...,729.4,0.28,0.04120166,-0.4169117,11,14,15,14,15,0
5,2017-11-14 15:14:15.25,5724,999001,-0.19,13.75,65,2.4704,-147,-25.0147,1,...,729.0,-0.0,0.04117334,-0.4169117,11,14,15,14,15,250000
6,2017-11-14 15:14:15.75,5725,999002,-0.13,13.74,65,2.4704,-147,-25.0147,2,...,728.9,0.0,0.04117334,-0.4169117,11,14,15,14,15,750000


In [36]:
# Consolidate the coordinate information that is split across several columns
# into single 'Latitude' and 'Longitude' columns, appended at the end.
# NOTE(review): the column names are also lowercased in the output of this
# step; presumably consolidate_coords does that -- confirm.
df = magnaprobe.consolidate_coords(raw)
df.head()

Unnamed: 0,timestamp,record,counter,depthcm,battvolts,latitude_a,latitude_b,longitude_a,longitude_b,fix_quality,...,latitudeddddd,longitudeddddd,month,dayofmonth,hourofday,minutes,seconds,microseconds,Latitude,Longitude
2,2017-11-14 11:27:00.75,5721,100001,7.283,13.75,65,2.4724,-147,-25.0191,1,...,0.04120666,-0.416985,11,14,11,27,0,750000,65.041207,-147.416985
3,2017-11-14 15:14:14.75,5722,100002,7.732,13.78,65,2.4724,-147,-25.0163,1,...,0.04120666,-0.4169383,11,14,15,14,14,750000,65.041207,-147.416938
4,2017-11-14 15:14:15,5723,100003,4.138,13.75,65,2.4721,-147,-25.0147,1,...,0.04120166,-0.4169117,11,14,15,14,15,0,65.041202,-147.416912
5,2017-11-14 15:14:15.25,5724,999001,-0.19,13.75,65,2.4704,-147,-25.0147,1,...,0.04117334,-0.4169117,11,14,15,14,15,250000,65.041173,-147.416912
6,2017-11-14 15:14:15.75,5725,999002,-0.13,13.74,65,2.4704,-147,-25.0147,2,...,0.04117334,-0.4169117,11,14,15,14,15,750000,65.041173,-147.416912


In [37]:
# Add a 'Snow Depth m' column with the snow depth converted from cm to m.
# The original centimetre column is retained.
depth_df = magnaprobe.convert_depth_cm_to_m(df)
depth_df.head()

Unnamed: 0,timestamp,record,counter,depthcm,battvolts,latitude_a,latitude_b,longitude_a,longitude_b,fix_quality,...,longitudeddddd,month,dayofmonth,hourofday,minutes,seconds,microseconds,Latitude,Longitude,Snow Depth m
2,2017-11-14 11:27:00.75,5721,100001,7.283,13.75,65,2.4724,-147,-25.0191,1,...,-0.416985,11,14,11,27,0,750000,65.041207,-147.416985,0.07283
3,2017-11-14 15:14:14.75,5722,100002,7.732,13.78,65,2.4724,-147,-25.0163,1,...,-0.4169383,11,14,15,14,14,750000,65.041207,-147.416938,0.07732
4,2017-11-14 15:14:15,5723,100003,4.138,13.75,65,2.4721,-147,-25.0147,1,...,-0.4169117,11,14,15,14,15,0,65.041202,-147.416912,0.04138
5,2017-11-14 15:14:15.25,5724,999001,-0.19,13.75,65,2.4704,-147,-25.0147,1,...,-0.4169117,11,14,15,14,15,250000,65.041173,-147.416912,-0.0019
6,2017-11-14 15:14:15.75,5725,999002,-0.13,13.74,65,2.4704,-147,-25.0147,2,...,-0.4169117,11,14,15,14,15,750000,65.041173,-147.416912,-0.0013


In [38]:
# Choose the columns to carry forward. Extend this list if you also want,
# say, the voltage, or adjust the names if your MagnaProbe internals are
# programmed with different column names.
columns_to_keep = [
    'timestamp',
    'counter',
    'Latitude',
    'Longitude',
    'Snow Depth m',
]
clean_df = magnaprobe.trim_cols(depth_df, columns_to_keep)
clean_df.head()

Unnamed: 0,timestamp,counter,Latitude,Longitude,Snow Depth m
2,2017-11-14 11:27:00.75,100001,65.041207,-147.416985,0.07283
3,2017-11-14 15:14:14.75,100002,65.041207,-147.416938,0.07732
4,2017-11-14 15:14:15,100003,65.041202,-147.416912,0.04138
5,2017-11-14 15:14:15.25,999001,65.041173,-147.416912,-0.0019
6,2017-11-14 15:14:15.75,999002,65.041173,-147.416912,-0.0013


In [39]:
# Drop calibration points from the trimmed frame. The function reports how
# many rows remain after culling by counter prefix and by calibration pattern.
# Note that `clean_df` is overwritten: the pre-culling frame is only
# recoverable by re-running the trim_cols cell.
calibration_counter_prefix = 99  # counters starting with 99 key calibration points
# NOTE(review): presumably the low/high depth thresholds (in m) that bracket
# the ~0 cm and ~120 cm calibration readings -- confirm against the
# c1 / c2 parameters of magnaprobe.drop_calibration_points.
calibration_c1 = 0.02
calibration_c2 = 1.18
clean_df = magnaprobe.drop_calibration_points(
    clean_df,
    calibration_counter_prefix,
    calibration_c1,
    calibration_c2,
)

Starting number of rows: 251
Rows left after culling by counter prefix: 241
Rows left after culling by calibration patterns: 237


In [40]:
# Add a 'geometry' column so that each row's longitude/latitude pair is
# treated as point vector data.
geom_df = magnaprobe.create_geometry(clean_df)
geom_df.head()

Unnamed: 0,timestamp,counter,Latitude,Longitude,Snow Depth m,geometry
2,2017-11-14 11:27:00.75,100001,65.041207,-147.416985,0.07283,POINT (-147.416985 65.04120666)
3,2017-11-14 15:14:14.75,100002,65.041207,-147.416938,0.07732,POINT (-147.4169383 65.04120666)
4,2017-11-14 15:14:15,100003,65.041202,-147.416912,0.04138,POINT (-147.4169117 65.04120166)
15,2017-11-14 15:24:38.25,100001,65.04058,-147.417213,-0.0013,POINT (-147.4172134 65.04058000000001)
16,2017-11-14 15:24:38.5,100002,65.040293,-147.41795,0.7892,POINT (-147.41795 65.04029333)


In [41]:
# Turn the frame into a GeoDataFrame with a WGS 84 spatial reference.
geo_df = magnaprobe.create_geodataframe(geom_df)
geo_df.head()

Unnamed: 0,timestamp,counter,Latitude,Longitude,Snow Depth m,geometry
2,2017-11-14 11:27:00.75,100001,65.041207,-147.416985,0.07283,POINT (-147.41699 65.04121)
3,2017-11-14 15:14:14.75,100002,65.041207,-147.416938,0.07732,POINT (-147.41694 65.04121)
4,2017-11-14 15:14:15,100003,65.041202,-147.416912,0.04138,POINT (-147.41691 65.04120)
15,2017-11-14 15:24:38.25,100001,65.04058,-147.417213,-0.0013,POINT (-147.41721 65.04058)
16,2017-11-14 15:24:38.5,100002,65.040293,-147.41795,0.7892,POINT (-147.41795 65.04029)


In [42]:
# Reproject the WGS 84 GeoDataFrame to UTM coordinates.
# 32606 is the EPSG code for UTM Zone 6 N
utm_zone_6n_epsg = 32606
utm_geo_df = magnaprobe.convert_wgs_to_utm(geo_df, utm_zone_6n_epsg)
utm_geo_df.head()

Unnamed: 0,timestamp,counter,Latitude,Longitude,Snow Depth m,geometry
2,2017-11-14 11:27:00.75,100001,65.041207,-147.416985,0.07283,POINT (480366.787 7213111.766)
3,2017-11-14 15:14:14.75,100002,65.041207,-147.416938,0.07732,POINT (480368.986 7213111.752)
4,2017-11-14 15:14:15,100003,65.041202,-147.416912,0.04138,POINT (480370.235 7213111.186)
15,2017-11-14 15:24:38.25,100001,65.04058,-147.417213,-0.0013,POINT (480355.573 7213041.998)
16,2017-11-14 15:24:38.5,100002,65.040293,-147.41795,0.7892,POINT (480320.679 7213010.279)
