# String utils

## Word matching

In [None]:
import string
import random
import re 

from typing import Sequence, Iterable, List
import numpy as np

In [None]:
sentences = ['the Big cat went to the bar', 
             'the cat liked the big piano', 
             'the big fish never eats the bigger fish']

In [None]:
reg = re.compile(r'(?i)\bbig\b')


In [None]:
reg.findall(sentences[0])

In [12]:
import numpy as np 
%timeit x = np.ones(5_000_000, dtype=bool)

128 µs ± 5.54 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [7]:
%timeit x * x

171 µs ± 1.37 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [9]:
import pyarrow as pa
x_pa = pa.array(x)

In [11]:
%timeit pa.compute.and_(x_pa,x_pa)

37.9 µs ± 467 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [None]:
%%timeit
get_indices_sentences_matching_words_regex(words=['big', 'cat'], sentences=sentences)

Now we can run the parallel version

In [None]:
%%timeit
get_indices_sentences_matching_words_regex_parallel(words=['big', 'cat'], sentences=sentences, n_jobs=3)

Now let us try a big example

In [None]:
pool_characters = string.ascii_lowercase

def create_random_word(min_chars=2, max_chars=15, pool_characters=None):
    """Return a random word built from ``pool_characters``.

    Parameters
    ----------
    min_chars, max_chars : int
        Inclusive bounds for the word length (passed to ``random.randint``).
    pool_characters : sequence of str, optional
        Characters to sample from, with replacement. When omitted, the
        lowercase ASCII alphabet is used.

    Returns
    -------
    str
        A word whose length lies between ``min_chars`` and ``max_chars``.
    """
    if pool_characters is None:
        # The bare default used to reach random.choice(None) and raise TypeError.
        pool_characters = string.ascii_lowercase
    n_chars = random.randint(min_chars, max_chars)
    return ''.join(random.choice(pool_characters) for _ in range(n_chars))

def create_random_sentence(n_words,pool_characters):
    """Build a sentence of ``n_words`` random words joined by single spaces.

    Each word comes from ``create_random_word`` called with the given
    ``pool_characters`` and that function's default length bounds.
    """
    generated = []
    for _ in range(n_words):
        generated.append(create_random_word(pool_characters=pool_characters))
    return ' '.join(generated)


In [None]:
create_random_word(pool_characters= pool_characters)

In [None]:
create_random_sentence(10,pool_characters)

In [None]:
words = 'hi' 

In [None]:
%%time
n_sentences = 2_000_000
n_words = 7
corpus = [create_random_sentence(7, pool_characters) for x in range(n_sentences)]

In [None]:
corpus[0:10]

In [None]:
corpus_14million = 7*[*corpus]

In [None]:
%%time
get_indices_sentences_matching_words_regex(words=['big'], sentences=corpus_14million);

In [None]:
res = get_indices_sentences_matching_words_regex(words=['big'], sentences=corpus_14million);

In [None]:
res_par = get_indices_sentences_matching_words_regex_parallel(words=['big'], sentences=corpus_14million);

In [None]:
%%time
get_indices_sentences_matching_words_regex_parallel(words=['big'], sentences=corpus_14million);

In [None]:
res == res_par

### Testing pyarrow

In [None]:
import pyarrow

In [None]:
pyarrow.set_cpu_count(1)

In [None]:
from pyarrow.compute import extract_regex

In [None]:
import re
pattern = re.compile(r'\bbig\b', re.IGNORECASE)

In [None]:
res = extract_regex(corpus_14million, pattern=r'\bbig\b')

In [None]:
res.is_valid().sum()

In [None]:
res.is_valid() * res.is_valid()

In [None]:
# `and` is a reserved word in Python, so the compute kernel is exposed as `and_`.
pyarrow.compute.and_(res.is_valid(),res.is_valid())

In [None]:
import pyarrow as pa

x = pa.array([True]*1000)
y = pa.array([False]*1000)


In [None]:
%timeit x = pa.array([True]*1000_000)

In [None]:
x = pa.array([True]*1000_000)

In [None]:
import numpy as np

In [None]:
%timeit x = pa.array(np.ones(1_000_000, dtype=bool))

In [None]:
np.ones(1_000_000, dtype='bool')

In [None]:
%timeit x = pa.array(np.ones(1_000_000))

In [None]:
x = pyarrow.compute.and_(x,y)

In [None]:
x

In [None]:
pyarrow.compute.and_(res.is_valid(),res.is_valid())

In [None]:
pyarrow.compute.and_(res.is_valid(),res.is_valid())

In [None]:
np.where(np.array(res.is_valid()))[0].tolist()

In [None]:
%%time
res = extract_regex(corpus_14million, pattern=r'\bbig\b/i')

In [None]:
%%time
res_arrow = extract_regex(corpus_14million, pattern=r'(?i)\bbig\b')
res_arrow_np = np.array(res_arrow.is_valid())
indices = np.where(res_arrow_np)[0]

In [None]:
res_arrow_np.sum()

In [None]:
res_arrow_np

In [None]:
res

In [None]:
S = ["Big said the man", 'big boy','bigger than I want']
res_arrow = extract_regex(S,
                          pattern=r'(?i)\bbig\b')

In [None]:
res_arrow

In [None]:
get_indices_sentences_matching_words_regex(words=['big'], sentences=S )

In [None]:
#np.where(res_arrow_np)[0]

## Test loading + regex 

In [None]:
corpus_14million

In [None]:
%timeit pyarrow.compute.sum(res_arrow.is_valid())

In [None]:
res_par = get_indices_sentences_matching_words_regex_parallel(words=['big'], sentences=corpus_14million);

In [None]:
len(res_par)

In [None]:
#corpus_14million

In [None]:
res = extract_regex(corpus_14million, pattern=r'big')

In [None]:
len(res)

In [None]:
import numpy as np
res_arrow = np.array(res_arrow.is_valid())

In [None]:
res_arrow == np.array(res_par)

In [None]:
len(np.where(res_arrow)[0])

In [None]:
len(res_par)

In [None]:
res_arrow.shape

In [None]:
!open .

Adding more data

In [None]:
corpus_40million = 20*[*corpus]

In [None]:
%%time
res_arrow = extract_regex(corpus_40million,
                          pattern=r'(?i)\bbig\b')

In [None]:
%%time
get_indices_sentences_matching_words_regex(words=['big'], sentences=corpus_40million);

In [None]:
%%time
get_indices_sentences_matching_words_regex_parallel(words=['big'],
                                                    sentences=corpus_40million,
                                                    n_jobs=3);

In [None]:
%%time
get_indices_sentences_matching_words_regex_parallel(words=['big'],
                                                    sentences=corpus_40million,
                                                    n_jobs=6);

In [None]:
%%time
get_indices_sentences_matching_words_regex_parallel(words=['big'],
                                                    sentences=corpus_40million,
                                                    n_jobs=10);

In [None]:
from gaia_science.common_base.dataframe_tools.utils import get_batches

In [None]:
%%time
n_jobs = 8
n_sentences = len(corpus_40million)
n_batch = int(n_sentences/n_jobs)

partial_start_positions = [0] + list(np.cumsum([len(x) for x in get_batches(range(n_sentences), n_batch)]))

### Edit distance

In [None]:
from gaia_science.common_base.string_utils.string_distances.edit_distance import EditDistanceInt

In [None]:
d = EditDistanceInt(1,1,1)
d.evaluate('hi','ho')

### BKtree

In [None]:
from gaia_science.common_base.string_utils.data_structures.bktree import BKTree

In [None]:
import random
import string

def build_vocab(n_words, n_min_per_word, n_max_per_word, seed=10):
    """Generate a reproducible list of random lowercase words.

    Parameters
    ----------
    n_words : int
        Number of words to generate.
    n_min_per_word, n_max_per_word : int
        Inclusive bounds for the length of each word.
    seed : optional
        Value passed to ``random.seed`` before generating. Defaults to 10,
        the value that was previously hard-coded.

    Returns
    -------
    list of str
        ``n_words`` words made of ASCII lowercase letters. Duplicates are possible.

    Notes
    -----
    This re-seeds the module-level ``random`` generator on every call.
    """
    letters = string.ascii_lowercase
    random.seed(seed)
    all_words = []
    for _ in range(n_words):
        n = random.randint(n_min_per_word, n_max_per_word)
        all_words.append(''.join(random.choice(letters) for _ in range(n)))
    return all_words

In [None]:
vocabulary = build_vocab(500_000, 2, 18)

In [None]:
query = 'help'
max_dist = 1

In [None]:
t1 = %timeit -o  candidates = [w for w in vocabulary if d.evaluate(query,w) <=max_dist]

In [None]:
# Names assigned inside a %timeit statement are not kept in the notebook
# namespace, so compute `candidates` here before sorting it.
candidates = [w for w in vocabulary if d.evaluate(query, w) <= max_dist]
candidates.sort()
candidates

In [None]:
bktree = BKTree(d.evaluate)

In [None]:
%%time
bktree.fit(vocabulary)

In [None]:
t2 = %timeit -o candidates = bktree.query(query, max_dist=max_dist)

In [None]:
# Names assigned inside a %timeit statement are not kept in the notebook
# namespace, so run the BK-tree query here before sorting its result.
candidates = bktree.query(query, max_dist=max_dist)
candidates.sort()
candidates

In [None]:
t1.average/t2.average

In [None]:
import pandas as pd

In [None]:
results = pd.DataFrame()

In [None]:
partial = pd.DataFrame({'a':[1,2,3],'b':['a','c','d']})

In [None]:
partial

In [None]:
results = pd.concat((results,partial))

In [None]:
results

In [None]:
results = pd.concat((results,partial))

In [None]:
results.reset_index(drop=True)

In [None]:
def append_many_df(initial_df, n=10_000):
    """Stack ``n`` copies of ``initial_df``, concatenating one copy per iteration.

    The accumulator starts as an empty DataFrame and is replaced by a new
    concatenated frame on every pass. The returned frame has a fresh
    0..N-1 index.
    """
    accumulated = pd.DataFrame()
    for _ in range(n):
        pair = (accumulated, initial_df)
        accumulated = pd.concat(pair)
    stacked = accumulated.reset_index(drop=True)
    return stacked

In [None]:
initial_df = pd.DataFrame({'a':[1,2,3],'b':['a','c','d']})

In [None]:
%%time
append_many_df(initial_df)

In [None]:
def append_many_df_as_list(initial_df, n=10_000):
    """Stack ``n`` copies of ``initial_df`` with a single ``pd.concat`` call.

    The copies are gathered in a list and concatenated once, instead of
    re-concatenating a growing accumulator on every iteration.

    Parameters
    ----------
    initial_df : pandas.DataFrame
        Frame to repeat.
    n : int
        Number of copies to stack.

    Returns
    -------
    pandas.DataFrame
        The stacked frame with a fresh 0..N-1 index; an empty frame when
        ``n`` is zero or negative.
    """
    if n <= 0:
        # pd.concat raises on an empty sequence; return an empty frame instead.
        return pd.DataFrame().reset_index(drop=True)
    pieces = [initial_df for _ in range(n)]
    return pd.concat(pieces, ignore_index=True)

In [None]:
%%
append_many_df(initial_df)