# Chapter 1: Basic methods

### string.find(substring, start, end)

In [10]:
my_string = "Where's Waldo?"

In [11]:
my_string.find("Waldo")

8

In [12]:
my_string.find("Weldo")

-1

In [19]:
try:
    my_string.find("Weldo")
except ValueError:
    print("Not found")

### string.count(substring, start, end)

In [24]:
my_string = "How many fruits do you have in your fruit basket?"

In [26]:
my_string.count("fruit", 0, 16)

1

In [27]:
my_string.count("Bayern")

0

### string.replace(old, new, count)

# Chapter 2: Formatting strings

### Methods for formatting:
    
- Positional formatting ( 'text{}'.format() )
- Formatted string laterals ( f"literal string {expression}" )
- Template method ( from string import Template my_string = Template("Data science has been called $identifier")

# Chapter 3: Regular expressions in Python

###### r'st\d\s\w{3,10}'

r indicates a raw string 

st here indicates normal characters that match themselves

\d digit

\s white space

\w word

{3,10} character to the left of \w should appear between 3 and 10 times


###### import re

re.findall(r"regex", string)

re.split(r"regex", string)

re.sub(r"regex", new, string)

###### \D non-digit

\W non-word

###### Nota bene: \s for white spaces is desirable cause white space may be space, tab, new line

## Repeated characters

######  Quantifiers

Zero or once: ?

Zero or more: *

Once or more: +

n times at least, m times at mos: {n, m}


###### Note: 

Quantifiers are applied to the characted immediately to the left

r"apple+" : + applies to "e", not to "apple"

### Regex metacharacters

###### Two different operations:

re.search(r"regex", string)

re.match(r"regex", string)

###### Special characters:
    
Match any character (except newline): .

Start of the string: ^

End of the string: $

Escape special characters: \

OR operator: |

Set of characters: []                 ^ here transforms to the negative

## Greedy vs. non-greedy matching

###### Two different ways

Greedy: match as many characters as possible; return the longest match

Non-greedy (lazy): match as few characters as needed; return the shortest match

Append ? to get lazy

# Chapter 4: Advanced regular expression

###### Groups: ()

In [4]:
import re

In [7]:
my_string = "My lucky numbers are 8755 and 33"

In [8]:
re.findall(r"(\d+)", my_string)

['8755', '33']

In [9]:
re.findall("(\d)+", my_string)

['5', '3']

#### Alternation and non-capturing groups

###### Match but not capture group: 
    
Add ?:   (?:regex)

In [10]:
my_string = "John Smith: 34-34-34-042-980, Rebeca Smith: 10-10-10-434-425"

In [11]:
re.findall(r"(?:\d{2}-){3}(\d{3}-\d{3})", my_string)

['042-980', '434-425']

In [12]:
my_date = "Today is 23rd May 2019. Tomorrow is 24th May 19"

In [13]:
re.findall(r"(\d+)(?:th|rd)", my_date)

['23', '24']

#### Backreferences

In [14]:
text = "Python 3.0 was released on 12-03-2008"

In [17]:
information = re.search("(\d{1,2})-(\d{2})-(\d{4})", text)

In [18]:
information.group(3)

'2008'

In [19]:
information.group(0)

'12-03-2008'

##### We can give names to groups

(?Pnameregex)

In [20]:
text = "Austin, 78701"

In [22]:
cities = re.search(r"(?P<city>[A-Za-z]+).*?(?P<zipcode>\d{5})", text)

In [24]:
cities.group('city')

'Austin'

#### Backreferencing by \1

In [25]:
sentence = "I wish you a happy happy birthday!"

In [27]:
re.findall(r"(\w+)\s\1", sentence)

['happy']

In [28]:
re.sub(r"(\w+)\s\1", r"\1", sentence)

'I wish you a happy birthday!'

In [29]:
sentence2 = "Your new code number is 23434. Please, enter 23434 to open the door"

In [31]:
re.findall(r"(?P<code>\d{5}).*?(?P=code)", sentence2)

['23434']

###### Use \g to backtrack for replacing

In [32]:
sentence3 = "This app is not working! It's repeating the last word word"

In [34]:
re.sub(r"(?P<word>\w+)\s(?P=word)", r"\g<word>", sentence3)

"This app is not working! It's repeating the last word"

### Lookaround

###### Look-ahead for the word "cat": 


"the white cat sat on the chair"

positive (?=sat) 

negative (?!run)

In [1]:
my_text = "tweets.txt transferred, mypass.txt transferred, keywords.txt error"

In [4]:
re.findall(r"\w+\.txt(?=\stransferred)", my_text)

['tweets.txt', 'mypass.txt']

In [6]:
re.findall(r"\w+\.txt(?!\stransferred)", my_text)

['keywords.txt']

###### Look-behind for the word "cat"

"the white cat sat on the chair"

positive (?<=white)

negative (?<!brown)

In [7]:
my_text = "Member: Angus Young, Member: Chris Slade, Past: Malcolm Young, Past: Cliff Williams"

In [8]:
re.findall(r"(?<=Member:\s)\w+\s\w+",my_text)

['Angus Young', 'Chris Slade']

In [11]:
my_text = "My white cat sat at the table. However, my brown dog was lying on the couch."

In [12]:
re.findall(r"(?<!brown\s)(cat|dog)", my_text)

['cat']

# Experiments with schools

In [11]:
school_text1 = "Лот №58 крупи та борошно, НВК №8"
school_text2 = "Капітальний ремонт будівлі ДНЗ 126"
filter_keywords_strict = ['зош', 'знз', 'сзш', '[ун]вк', 'днз']

In [2]:
import pandas as pd

In [3]:
school_text1_s = pd.Series(school_text1)
school_text2_s = pd.Series(school_text2)

In [4]:
school_text1_s.astype(str).str.contains(r'{}(\s)+[№\w](\d)+'.format('|'.join(filter_keywords_strict)), case = False, na = False, regex = True)

  """Entry point for launching an IPython kernel.


0    True
dtype: bool

In [5]:
school_text2_s.astype(str).str.contains(r'{}(\s)+[№\w](\d)+'.format('|'.join(filter_keywords_strict)), case = False, na = False, regex = True)

  """Entry point for launching an IPython kernel.


0    True
dtype: bool

In [12]:
import re
k = '' 
if school_text1_s.astype(str).str.contains(r'{}(\s)+[№\w](\d)+'.format('|'.join(filter_keywords_strict)), case = False, na = False, regex = True).any():
    test = str(school_text1_s.astype(str))
    print('Це так виглядає стрінга: ', end='')
    print(test)
    m = re.search(r'{}(\s)+[№\w](\d)+'.format('|'.join(filter_keywords_strict)), test, re.IGNORECASE)
    idx1, idx2 = m.span()[0], m.span()[1]
    return_pattern = test[idx1:idx2] + " "
    running_idx = idx2+2
    while True:
        try: 
            b = int(test[running_idx])
            running_idx += 1
        except ValueError:
            break
    return_pattern += test[idx2:running_idx]
    if not "№" in return_pattern:
        return_pattern = return_pattern[:-1]
    print("А ось що я витягнув: ", end='')
    print(return_pattern.rstrip())
    

Це так виглядає стрінга: 0    Лот №58 крупи та борошно, НВК №8
dtype: object
А ось що я витягнув: НВК  №8


  This is separate from the ipykernel package so we can avoid doing imports until
