# Getting started with ELA

We will be using data publicly available from the Bureau of Meteorology (BoM) and Geoscience Australia (GA). **TODO**: include acknowledgements, copyright notices and the like.

The aim is to produce a 3D grid of primary lithologies.


In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import rasterio
from rasterio.plot import show

import ela

In [None]:
# Root folder holding the input data sets. Set the ELA_DATA_PATH environment
# variable to point at your own copy; the original hard-coded location is kept
# as the fallback so existing set-ups keep working unchanged.
data_path = os.environ.get('ELA_DATA_PATH', '/home/xxxyyy/data')
# Open the GeoTIFF stored at ela/CLIP.tif under data_path. The dataset handle
# is deliberately left open: the following cell plots it.
bungendore_raster = rasterio.open(os.path.join(data_path, 'ela/CLIP.tif'))

In [None]:
# Quick visual check of the raster using rasterio's plotting helper.
show(
    bungendore_raster,
    title='East of Bungendore, AU',
    cmap='terrain',
)

In [None]:
# petl is an extra dependency (install it first, e.g. `conda install petl`).
import petl as etl

# Folder, under data_path, that contains the lithology log CSV.
bidgee_path = os.path.join(
    data_path,
    'gw_shp_murrumbidgee_river/shp_murrumbidgee_river',
)
# petl table over the lithology log CSV.
lithology_logs = etl.fromcsv(
    os.path.join(bidgee_path, 'NGIS_LithologyLog.csv')
)

In [None]:
# Show the field names of the petl table. On a petl table `columns` is a
# method (it builds a dict of column values), so the bare attribute
# `lithology_logs.columns` only echoes a bound method; `etl.header` returns
# the header row, which is what we want to inspect here.
etl.header(lithology_logs)

In [None]:
# Load the same lithology log CSV as a pandas DataFrame. This rebinds
# `lithology_logs`, replacing the petl table created earlier.
lithology_logs = pd.read_csv(
    os.path.join(bidgee_path, 'NGIS_LithologyLog.csv')
)

In [None]:
# Preview the first rows of the lithology log table.
lithology_logs.head()

In [None]:
# Name of the lithology log column that is extracted and tokenised in the
# cells that follow.
LITHO_DESC_COL = 'Description'

In [None]:
# Pull out the description column as a Series re-indexed from 0 to n-1
# (the old index is discarded rather than kept as a column).
descs = lithology_logs[LITHO_DESC_COL].reset_index(drop=True)
descs.head()

In [None]:
def clean_desc(x):
    """Return ``x`` unless it stands for a missing description.

    ``None`` and any ``float`` value are replaced by an empty string; every
    other value is handed back unchanged. (Floats are presumably how missing
    CSV cells show up here, as NaN -- confirm against the loaded data.)
    """
    is_missing = x is None or isinstance(x, float)
    return '' if is_missing else x

In [None]:
# Clean every description, yielding a plain Python list.
y = list(map(clean_desc, descs))

In [None]:
from striplog import Lexicon
# Default striplog lexicon. In the cells shown here it is only referenced by
# the disabled abbreviation-expansion step below.
lex = Lexicon.default()
# Expanding abbreviations with the lexicon is quite slow and may not be
# necessary, so the step is left disabled:
# d = [lex.expand_abbreviations(x) for x in d]

We build a flat list of all the "tokens", then remove stop words ('s', 'the' and the like) and punctuation.

In [None]:
from ela.textproc import *
from ela.utils import *
from ela.classification import *

In [None]:
# Lower-case the descriptions, tokenise each one, then join all the per-entry
# token sequences into one flat array. v_lower / v_word_tokenize come from the
# ela wildcard imports -- presumably element-wise helpers; confirm in ela.textproc.
y = v_lower(y)
vt = v_word_tokenize(y)
flat = np.concatenate(vt)

In [None]:
import nltk
from nltk.corpus import stopwords

In [None]:
# English stop words plus the punctuation tokens we do not want to count.
stoplist = stopwords.words('english')
exclude = stoplist + ['.',',',';',':','(',')','-']
# Testing membership against a list rescans it for every token. A frozenset
# gives the same answer with a hash lookup, which matters when `flat` holds
# every token of every log entry. `exclude` itself stays a list.
_exclude_lookup = frozenset(exclude)
flat = [word for word in flat if word not in _exclude_lookup]

In [None]:
# Number of distinct tokens in `flat`.
len(set(flat))

In [None]:
# Token frequency table (token_freq comes from ela). The second argument, 50,
# is presumably the number of most frequent tokens to keep -- confirm in ela.
df_most_common = token_freq(flat, 50)

In [None]:
# Plot the token frequency table computed above.
plot_freq(df_most_common)

There are terms such as 'sandy', 'clayey', 'silty' and so on. Let's use functions that detect the terms derived from a lithology class and plot their frequency. Because the frequencies are likely to be heavily skewed, the y-axis uses a log scale.

In [None]:
# Candidate roots to explore: ['sand','sandstone','clay','limestone','shale','basalt','coffee']
# Frequency of the tokens derived from the root 'sand'.
plot_freq_for_root(flat, 'sand')

In [None]:
# Same plot for the tokens derived from the root 'clay'.
plot_freq_for_root(flat, 'clay')

In [None]:
# Display the frequency table itself, to help choose the lithology classes below.
df_most_common

In [None]:
# Primary lithology classes. 'basalt' is deliberately left out: it was
# mentioned that it may be a mistake in the raw log data.
lithologies = [
    'clay',
    'sand',
    'gravel',
    'granite',
    'shale',
    'silt',
    'soil',
    'loam',
    'sandstone',
]

In [None]:
# `re` is not imported explicitly anywhere above; it only resolved if one of
# the `from ela... import *` lines happened to leak it. Import it here so this
# cell runs on a fresh kernel regardless of what ela exports.
import re

# Alternation of substrings that mark a lithology term. Stems such as 'ston'
# and 'granit' also match longer words (e.g. 'sandstone', 'granite').
any_litho_markers_re = r'sand|clay|ston|shale|silt|granit|soil|gravel|loam'
regex = re.compile(any_litho_markers_re)

In [None]:
# Map each recognised term to its primary lithology class. Every class first
# maps to itself; plurals and compound/related terms are then added.
# Entries tagged '# ??' are mappings the author was unsure about.
# NOTE(review): 'limestone' is used as a target class here but is not listed
# in `lithologies` -- confirm whether that is intended.
# NOTE(review): 'calcitareous' may be a misspelling of 'calcareous'; check
# which spelling actually occurs in the logs.
lithologies_dict = {name: name for name in lithologies}
lithologies_dict.update({
    'sands': 'sand',
    'clays': 'clay',
    'shales': 'shale',
    'claystone': 'clay',
    'siltstone': 'silt',
    'limesand': 'sand',  # ??
    'calcarenite': 'limestone',  # ??
    'calcitareous': 'limestone',  # ??
    'mudstone': 'silt',  # ??
    'capstone': 'limestone',  # ??
    'ironstone': 'sandstone',  # ??
    'topsoil': 'soil',  # ??
})

In [None]:
# Adjectival forms mapped to the lithology class they derive from.
lithologies_adjective_dict = dict([
    ('sandy', 'sand'),
    ('clayey', 'clay'),
    ('clayish', 'clay'),
    ('shaley', 'shale'),
    ('silty', 'silt'),
    ('gravelly', 'gravel'),
])

In [None]:
# Spot-check a single processed description (index 11; the notebook gives no
# particular reason for this pick).
y[11]

In [None]:
# Tokenise every description, then look for lithology markers in each entry's
# tokens using `regex`. v_find_litho_markers comes from ela; its result is
# consumed below as one sized collection of hits per entry.
v_tokens = v_word_tokenize(y)
litho_terms_detected = v_find_litho_markers(v_tokens, regex=regex)

Let's see whether we detect these lithology markers in each bore log entry.

In [None]:
# Split the per-entry detection results by how many markers were found.
zero_mark = [hits for hits in litho_terms_detected if len(hits) == 0]
at_least_one_mark = [hits for hits in litho_terms_detected if len(hits) >= 1]
at_least_two_mark = [hits for hits in litho_terms_detected if len(hits) >= 2]
# Report the size of each group.
print(
    'There are %s entries with no marker, %s entries with at least one, %s with at least two'
    % (len(zero_mark), len(at_least_one_mark), len(at_least_two_mark))
)

Note: we probably need ready-made facilities in `ela` to assess the detection rate during this kind of exploratory data analysis (EDA). A word cloud may not be such a bad idea either.

In [None]:
# Descriptions for which no lithology marker was detected, picked by position.
descs_zero_mark = [
    y[i]
    for i, hits in enumerate(litho_terms_detected)
    if len(hits) == 0
]

In [None]:
# Preview the first 50 descriptions without a detected marker. Python slices
# are 0-based, so `[1:50]` silently skipped the first entry; `[:50]` shows it.
descs_zero_mark[:50]

In [None]:
# Sanity check: `zero_mark` was built from entries with len(x) == 0, so this
# list is expected to be empty.
[x for x in zero_mark if len(x) > 0]