### Python Cookbook
Matching and searching for terms



matching and searching for text patterns.

for simple searches we can get away with 
- str.find()
- str.endswith()
- str.startswith()

In [15]:
text = 'yeah, but no, but yeah, but no, but yeah'

text == 'yeah'

False

In [16]:
text.startswith('yeah')

True

In [17]:
text.endswith('yeah')

True

In [19]:
# for matching more complex stuff we fall back to regular expressions
text1 = '11/27/2012'
text2 = 'Nov 27 2012'

import re
if re.match(r'\d+/\d+/\d+', text1):
    print('yes')
else:
    print('no')

yes


In [36]:
phone_num_regex = re.compile(r'(d+)(-d+)(-d+)')

In [37]:
if phone_num_regex.match('780-203-6089'):
    print('yes')
else:
    print('no')

no


In [2]:
# questions for testing
from typing import Iterable, Set
import re
terms = ["Apple", "Google", "Facebook", "FedEx", "Tesla","Waymo"]


# def strip_excess_spaces(raw_quote):
#     '''
#     removes newlines and tabs, and replaces with single
#     '''
    
#     return re.sub('\s+', ' ', raw_quote)


# def extract_quoted_terms(text: str, terms: Iterable[str]) -> Set[str]:
#     '''
#     Extracts company names from passage.
#     '''
    
#     quotes_list = set()
#     companies_found = set()
    
#     # sanitize text and remove \n and \t
#     sanitized_raw_text = strip_excess_spaces(text)
    
#     # search for expected: double quotes
#     double_quote_ptn = re.compile(r'\"(.*?)\"')
#     quotes_list.update(double_quote_ptn.findall(sanitized_raw_text))
    
#     # search for rounded quotes
#     rounded_quote_ptn = re.compile(r'\“(.*?)\”')
#     quotes_list.update(rounded_quote_ptn.findall(sanitized_raw_text))
    
#     def sanitize_quoted_result(mystring):
#         '''
#         Remove special chars
#         '''
#         sanitized = re.sub('[^A-Za-z0-9]+', ' ', mystring).strip()
#         return sanitized
        
#     # step 2 search terms
#     for quote in quotes_list:
#         _sanitized = sanitize_quote(quote)
#         matches = list(filter(lambda term: term in _sanitized, terms))
#         companies_found.update(matches)

#     return companies_found

In [3]:
# FINAL VERSION

def strip_excess_spaces(raw_quote):
    '''
    removes newlines and tabs, and replaces with single
    '''
    
    return re.sub('\s+', ' ', raw_quote)


def sanitize_quoted_result(quote):
    '''
    Removes special chars from quoted text\n
    replacing with single space
    '''

    return re.sub('[^A-Za-z0-9]+', ' ', quote).strip()


def extract_quoted_terms(text: str, terms: Iterable[str]) -> Set[str]:
    '''
    Extracts company names from passage.
    '''

    quotes_list = set()
    companies_found = set()

    # remove \n \t
    _raw_text = strip_excess_spaces(text)
    
    # search for expected: double quotes
    double_quote_ptn = re.compile(r'\"(.*?)\"')
    quotes_list.update(double_quote_ptn.findall(_raw_text))
    
    # search for rounded quotes
    rounded_quote_ptn = re.compile(r'\“(.*?)\”')
    quotes_list.update(rounded_quote_ptn.findall(_raw_text))

    # search for missing closing quote
    missing_end_quote_ptn = re.compile(r'\"(.*)')
    quotes_list.update(missing_end_quote_ptn.findall(_raw_text))

    print('quotes list --- ',quotes_list)
        
    # step 2 search terms
    for quote in quotes_list:
        _sanitized = sanitize_quoted_result(quote)
        # matches = list(filter(lambda term: term in _sanitized, terms))
        matches = list(filter(lambda term: re.search(r'\b%s\b' % term , _sanitized), terms))
        companies_found.update(matches)

    return companies_found

In [26]:
_pattern = r'|'.join([
        r'\"(.*?)\"$',
        r'(\“(.*?)\”)$',
        r'^\"(.*)?'
        ]
    )


In [42]:
search_regx = re.findall(_pattern, '"Apple"')

In [44]:
results = set()
as_list = [ results.update(r_tup) for r_tup in search_regx]

results

{'', 'Apple'}

In [5]:
assert extract_quoted_terms('"Apple', terms) == {"Apple"}

quotes list ---  {'Apple'}


In [20]:
missing_end_quote_ptn = re.compile(r'^\"(.*)?')
print('missing ', missing_end_quote_ptn)
missing_end_quote_ptn.findall('"Apple')

missing  re.compile('^\\"(.*)?')


['Apple']

In [129]:
# test_case 1:empty 
extract_quoted_terms("", terms)

set()

In [64]:
# test case 2: empty
extract_quoted_terms(" ", terms)

found --  []


set()

In [65]:
# test case 3: empty
extract_quoted_terms(" ", terms)

found --  []


set()

In [66]:
# test case 4: empty terms
extract_quoted_terms('"Apple"', [])

found --  ['', '']


set()

In [67]:
# 5
assert extract_quoted_terms("apple", terms) == set()

found --  []


In [68]:
assert extract_quoted_terms("Apple", terms) == set()

found --  []


In [69]:

assert extract_quoted_terms('"Apple"', terms) == {"Apple"}

found --  ['', '']


AssertionError: 

In [70]:
assert extract_quoted_terms('"Ap ple"', terms) == set()


found --  ['', '']


In [71]:
assert extract_quoted_terms('"Google Apple"', terms) == {"Google", "Apple"}

found --  ['', '']


AssertionError: 

In [54]:
assert extract_quoted_terms('"Google Apple Facebook"', terms) == {
        "Google",
        "Apple",
        "Facebook",
    }

In [55]:
assert extract_quoted_terms('"Google. Apple! FedEx?"', terms) == {
        "Google",
        "Apple",
        "FedEx",
    }

In [56]:
extract_quoted_terms(
        "\"Apple's CEO said, Google is our primary competitor", terms
    ) == {"Apple", "Google"}

True

In [57]:
assert extract_quoted_terms('"FedEx"', terms) == {"FedEx"}

In [1]:
# Keeping the last N Items

In [2]:
from collections import deque

def search(lines, pattern, history=5):
    previous_lines = deque(maxlen=history)
    for line in lines:
        if pattern in line:
            yield line, previous_lines
        previous_lines.append(line)


In [6]:
for line, previous_lines in search(['This is python', 'This is something', 'yellow', ''], 'python', 3):
    for pline in previous_lines:
        print(pline)