## Regex Practice

In [23]:
import pandas as pd

# Five example sentences, one per weekday, each mentioning at least one clock time.
time_sentences = [
    "Monday: The doctor's appointment is at 2:45pm.",
    "Tuesday: The dentist's appointment is at 11:30 am.",
    "Wednesday: At 7:00pm, there is a basketball game!",
    "Thursday: Be back home by 11:15 pm at the latest.",
    "Friday: Take the train at 08:10 am, arrive at 09:00am.",
]

# Single-column frame; the sentences live in the 'text' column.
df = pd.DataFrame({'text': time_sentences})

In [24]:
# Plain-text view of the frame; long sentences are shown truncated with "...".
print(df)

                                                text
0     Monday: The doctor's appointment is at 2:45pm.
1  Tuesday: The dentist's appointment is at 11:30...
2  Wednesday: At 7:00pm, there is a basketball game!
3  Thursday: Be back home by 11:15 pm at the latest.
4  Friday: Take the train at 08:10 am, arrive at ...


In [22]:
# Per-sentence sizes, printed back to back:
#   1) number of characters in each string
#   2) number of whitespace-separated tokens in each string
print(df['text'].str.len(),
      df['text'].str.split().str.len(),
      sep='\n')

0    46
1    50
2    49
3    49
4    54
Name: text, dtype: int64
0     7
1     8
2     8
3    10
4    10
Name: text, dtype: int64


In [31]:
# Find every clock time in each sentence: 1-2 digits, ':', 1-2 digits,
# optional whitespace, then "am" or "pm". Each row yields a list of matches.
df['text'].str.findall(r'\d{1,2}:\d{1,2}\s*[ap]m')

0               [2:45pm]
1             [11:30 am]
2               [7:00pm]
3             [11:15 pm]
4    [08:10 am, 09:00am]
Name: text, dtype: object

In [32]:
# Same pattern, but with capture groups around the hour and minute digits,
# so each match comes back as an (hour, minute) tuple instead of the full text.
df['text'].str.findall(r'(\d{1,2}):(\d{1,2})\s*[ap]m')

0               [(2, 45)]
1              [(11, 30)]
2               [(7, 00)]
3              [(11, 15)]
4    [(08, 10), (09, 00)]
Name: text, dtype: object

In [42]:
# Replace the leading weekday name with its 3-letter abbreviation.
# regex=True must be passed explicitly: the pattern is a regular expression and
# the replacement is a callable, which pandas only accepts in regex mode
# (the default is regex=False in pandas >= 2.0, where this call would raise).
df['text'].str.replace(r'^[A-Z][a-z]*ay', lambda m: m.group(0)[:3], regex=True)

0          Mon: The doctor's appointment is at 2:45pm.
1       Tue: The dentist's appointment is at 11:30 am.
2          Wed: At 7:00pm, there is a basketball game!
3         Thu: Be back home by 11:15 pm at the latest.
4    Fri: Take the train at 08:10 am, arrive at 09:...
Name: text, dtype: object

In [45]:
# Extract the whole time plus its hour, minute and am/pm period using named groups.
# extractall returns one row per match (indexed by original row and match number)
# and one column per named group.
df['text'].str.extractall(r'(?P<time>(?P<hour>\d?\d):(?P<minute>\d\d) ?(?P<period>[ap]m))')

Unnamed: 0_level_0,Unnamed: 1_level_0,time,hour,minute,period
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2:45pm,2,45,pm
1,0,11:30 am,11,30,am
2,0,7:00pm,7,0,pm
3,0,11:15 pm,11,15,pm
4,0,08:10 am,8,10,am
4,1,09:00am,9,0,am


#### A simple quiz

In [46]:
def date_sorter(df):
    """Find m/d/y dates in a Series of strings and rank the rows chronologically.

    Parameters
    ----------
    df : pandas.Series of str
        Texts that may contain dates written as ``month/day/year`` with a
        1-2 digit month, 1-2 digit day and 2-4 digit year.

    Returns
    -------
    pandas.Series
        Index labels of the input rows, ordered from the earliest date to the
        latest. A row appears once per date found in it; rows without a date
        are absent.

    Notes
    -----
    Years below 100 are treated as 19xx.
    """
    # A zero-width lookbehind replaces the original leading `[^.]`, which had to
    # consume a real character: dates at the very start of a string were either
    # missed ("1/2/1985") or lost their first month digit ("12/3/1990" -> month 2).
    # Rejecting a preceding '.' or digit keeps the match from starting mid-number.
    pattern = r'(?<![.\d])(?P<month>\d{1,2})/(?P<day>\d{1,2})/(?P<year>\d{2,4})'

    # Every captured group is digits only, so a direct integer cast is safe.
    dates = df.str.extractall(pattern).astype('int64')

    # Expand two-digit years into the 1900s; leave longer years untouched.
    dates['year'] = dates['year'].where(dates['year'] >= 100, dates['year'] + 1900)
    dates = dates.sort_values(by=['year', 'month', 'day'])

    # Level 0 of the extractall index is the label of the originating row.
    return pd.Series(dates.index.get_level_values(0))

## NLTK Basic

In [62]:
import nltk
from nltk.book import text7

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


#### Word Dist

In [73]:
# vocabulary size: number of distinct tokens in the corpus
print(len(set(text7)))

# token -> occurrence count table
dist = nltk.FreqDist(text7)
print(dist['the'])
# frequent long words: more than 5 characters and seen more than 100 times
print([word for word, count in dist.items() if len(word) > 5 and count > 100])

12408
4045
['billion', 'company', 'president', 'because', 'market', 'million', 'shares', 'trading', 'program']


#### Tokenization

In [78]:
# Sample text containing an abbreviation ("U.S.") and a price ("$2.99"), so not
# every period marks the end of a sentence.
test_text = "This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!"

# Word-level tokens
words = nltk.word_tokenize(test_text)
print(len(words))

# Sentence-level split
sentences = nltk.sent_tokenize(test_text)
print(len(sentences))

28
4


#### Part-Of-Speech Tagger (POS Tagger)

In [82]:
# Part-of-speech tag the tokens in `words`; show the first 10 (token, tag) pairs.
nltk.pos_tag(words)[:10]

[('This', 'DT'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('first', 'JJ'),
 ('sentence', 'NN'),
 ('.', '.'),
 ('A', 'DT'),
 ('gallon', 'NN'),
 ('of', 'IN'),
 ('milk', 'NN')]

In [3]:
import re
import pandas as pd
import numpy as np 

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xinghao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
# Raw strings are used for the patterns so that backslash escapes such as \[ and \|
# reach the regex engine untouched instead of being parsed (and warned about) as
# invalid Python string escapes. The compiled patterns are unchanged.

# Separator-like characters that text_clean turns into a space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character other than a digit, lowercase letter, space, '#', '+' or '_'.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stopword list from NLTK, as a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))

def text_clean(text):
    """Return a normalized copy of ``text``.

    The string is lowercased, characters matched by REPLACE_BY_SPACE_RE become
    spaces, characters matched by BAD_SYMBOLS_RE are removed, and the remaining
    whitespace-separated tokens that are not in STOPWORDS are joined with
    single spaces.
    """
    lowered = text.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    kept_tokens = (token for token in stripped.split() if token not in STOPWORDS)
    return ' '.join(kept_tokens)

In [24]:
def test_text_prepare():
    """Check text_clean against two known input/output pairs.

    Returns a message naming the first input whose cleaned form differs from
    the expected string, or a success message when every pair matches.
    """
    cases = [
        ("SQL Server - any equivalent of Excel's CHOOSE function?",
         "sql server equivalent excels choose function"),
        ("How to free c++ memory vector<int> * arr?",
         "free c++ memory vectorint arr"),
    ]

    for raw, expected in cases:
        if text_clean(raw) != expected:
            return "Wrong answer for the case: '%s'" % raw
    return 'Basic tests are passed.'

In [25]:
# Run the sanity check for text_clean; the returned string reports pass or failure.
test_text_prepare()

sql server equivalent excels choose function sql server equivalent excels choose function
free c++ memory vectorint arr free c++ memory vectorint arr


'Basic tests are passed.'

In [43]:
from collections import Counter
# Count whitespace-separated tokens in one sentence; punctuation stays attached
# to its token (e.g. 'dictionary.').
Counter('Implement the described encoding in the function my_bag_of_words with size of the dictionary.'.split())

Counter({'Implement': 1,
         'the': 3,
         'described': 1,
         'encoding': 1,
         'in': 1,
         'function': 1,
         'my_bag_of_words': 1,
         'with': 1,
         'size': 1,
         'of': 1,
         'dictionary.': 1})

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

# Tiny four-document corpus used to compare the two vectorizers.
text = ['This is the first document.', 
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?']

# NOTE(review): min_df is the float 1.0, not the int 1. The (4, 3) matrix this
# produces, versus the 9-term vocabulary from CountVectorizer, suggests the float
# is read as a proportion (a term must occur in every document) - confirm that
# this is intended rather than min_df=1.
tf_vectorizer = TfidfVectorizer(min_df = 1.0)
tt = tf_vectorizer.fit_transform(text)
print(tt.shape)
print(tt)

# Default CountVectorizer: learn the vocabulary and show the term -> column index map.
c_vectorizer = CountVectorizer()
c_vectorizer.fit(text)
print(c_vectorizer.vocabulary_)




(4, 3)
  (0, 2)	0.5773502691896258
  (0, 0)	0.5773502691896258
  (0, 1)	0.5773502691896258
  (1, 2)	0.5773502691896258
  (1, 0)	0.5773502691896258
  (1, 1)	0.5773502691896258
  (2, 2)	0.5773502691896258
  (2, 0)	0.5773502691896258
  (2, 1)	0.5773502691896258
  (3, 2)	0.5773502691896258
  (3, 0)	0.5773502691896258
  (3, 1)	0.5773502691896258
{'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}
