In [2]:
import re
from datetime import datetime

date_str_1 = "Today's date is 02/20/2019"
date_str_2 = "Today the date is 3/13/2019"
date_str_3 = "Today is 10/1/3019 and it's cold outside"
lst_pre = [date_str_1,date_str_2,date_str_3]

# One- or two-digit month and day followed by a four-digit year; each of the
# two separators may independently be "/" or "-".
pattern = re.compile(r"\d{1,2}[-/]\d{1,2}[-/]\d{4}")


def parse_us_date(date_text):
    """Turn a month/day/year string matched by ``pattern`` into a datetime.

    The regex accepts both "/" and "-" as separators, while the strptime
    format below only knows "/", so dashes are normalised to slashes first.

    Args:
        date_text: A string such as "3/13/2019", "02-20-2019" or "10-1/3019".

    Returns:
        The corresponding ``datetime.datetime`` (time part is midnight).

    Raises:
        ValueError: If the text is not a valid calendar date in
            month/day/year order (for example month 13).
    """
    return datetime.strptime(date_text.replace("-", "/"), "%m/%d/%Y")


# Every sample string contains a date. A string without one would make
# search() return None and the .group(0) call raise AttributeError.
lst_pos = [pattern.search(string).group(0) for string in lst_pre]
for x,search in enumerate(lst_pos):
    dt_obj = parse_us_date(search)
    print(f"Match {x}:",search, f"\tType: {type(search)}")
    print(f"DateT {x}:",dt_obj.strftime("%m/%d/%Y"), f"\tType: {type(dt_obj)}")
    print("")

Match 0: 02/20/2019 	Type: <class 'str'>
DateT 0: 02/20/2019 	Type: <class 'datetime.datetime'>

Match 1: 3/13/2019 	Type: <class 'str'>
DateT 1: 03/13/2019 	Type: <class 'datetime.datetime'>

Match 2: 10/1/3019 	Type: <class 'str'>
DateT 2: 10/01/3019 	Type: <class 'datetime.datetime'>



In [3]:
date_str_1 = "Today’s date is 12/5/2o0o"
date_str_2 = "Today’s date is 6/o5/2017"
date_str_3 = "Today’s date is 01/1/20o7"
date_str_4 = "Today’s date is 1o/1/2o09"
date_str_5 = "Today’s date is o1/1o/2oo7"

lst_pre = [date_str_1,date_str_2,date_str_3,date_str_4,date_str_5]

# Dates in which a lowercase "o" may have been typed instead of a zero.
# Every position is either a digit or "o": one or two characters for month and
# day, exactly four for the year (same year width as the clean-date pattern).
# The previous hand-enumerated year alternatives lost a trailing "o" after a
# non-zero digit (e.g. "201o" matched only "201"). The \b anchors stop the
# match from starting or ending in the middle of an ordinary word.
pattern = re.compile(r"\b[\do]{1,2}[-/][\do]{1,2}[-/][\do]{4}\b")

# Every sample string contains a date. A string without one would make
# search() return None and the .group(0) call raise AttributeError.
lst_pos = [pattern.search(string).group(0) for string in lst_pre]
for x,search in enumerate(lst_pos):
    # Undo the typo: every "o" inside the matched date stands for a zero.
    cleaned = search.replace("o","0")
    print(f"Match {x}:",search, f"\tType: {type(search)}")
    print(f"Clean {x}:",cleaned, f"\tType: {type(cleaned)}")
    print("")

Match 0: 12/5/2o0o 	Type: <class 'str'>
Clean 0: 12/5/2000 	Type: <class 'str'>

Match 1: 6/o5/2017 	Type: <class 'str'>
Clean 1: 6/05/2017 	Type: <class 'str'>

Match 2: 01/1/20o7 	Type: <class 'str'>
Clean 2: 01/1/2007 	Type: <class 'str'>

Match 3: 1o/1/2o09 	Type: <class 'str'>
Clean 3: 10/1/2009 	Type: <class 'str'>

Match 4: o1/1o/2oo7 	Type: <class 'str'>
Clean 4: 01/10/2007 	Type: <class 'str'>



In [4]:
# Sample text: real-looking tickers plus two decoys that must be rejected
# ("FAAKE" has five letters, "S" has one).
stock_sentence = "The stocks AAPL, BAC, and GE rallied in the market last week, but FAAKE or S but TSLA"

# A ticker is a standalone run of two to four capital letters.
pattern = re.compile(r"\b[A-Z]{2,4}\b")

# Calling the compiled pattern's own method gives the same list as re.findall.
pattern.findall(stock_sentence)

['AAPL', 'BAC', 'GE', 'TSLA']

In [5]:
two_wrd_seq = "The markets listened to what Jerome Powell was going to say following the press conference with Mario Draghi, but JoHn smiTh"

# Two consecutive capitalised words (one capital followed by lowercase letters),
# captured separately as (first, second).
# The closing \b forces the second word to end where the match ends; without it
# a mixed-case word such as "DraGhi" was reported as the truncated name "Dra".
# The opening \b sits outside the group since it is an anchor, not captured text.
pattern = re.compile(r"\b([A-Z][a-z]+)\s([A-Z][a-z]+)\b")
re.findall(pattern,two_wrd_seq)

[('Jerome', 'Powell'), ('Mario', 'Draghi')]

In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

path = "FOMC_minutes.csv"
data = pd.read_csv(path)

# Replace every run of digits and every non-alphanumeric character with a
# space so that only alphabetic text reaches the vectorizer.
# The pattern is a raw string: "\d" inside a plain literal is an invalid escape
# sequence, which newer Python versions warn about.
non_word = re.compile(r"\d+|[^a-zA-Z0-9]")
corp = data.statements.apply(lambda row: non_word.sub(" ", row))

vectorizer = TfidfVectorizer()
Y = vectorizer.fit_transform(corp)

# get_feature_names() no longer exists in current scikit-learn releases;
# get_feature_names_out() is its replacement. Fall back to the old name so the
# cell still runs on installations that predate the new method.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense document-term frame: one row per statement, one column per token.
df_tfidf = pd.DataFrame(columns = feature_names,
                        data = Y.toarray())

# NOTE(review): DataFrame.info() prints its own report and returns None, so this
# print also emits the word "None". Kept as-is so the recorded output stays valid.
print(df_tfidf.info(),"\n\n")

# Read the dimensions from the frame instead of hard-coding them, so the
# arithmetic check below stays correct if the input file changes.
n_rows, n_cols = df_tfidf.shape
print("Shape :",df_tfidf.shape,"\n\n")
print("Size  :",df_tfidf.size,"Elements")
print(f"{n_rows} Rows * {n_cols} Cols =",n_rows*n_cols,"\n\n")

# argmax/argmin work on the flattened (row-major) array; unravel_index turns the
# flat position back into (row, column) without stacking the whole frame into a
# MultiIndex twice. Ties resolve to the first occurrence, as before.
tfidf_values = df_tfidf.values
row_max, col_max = np.unravel_index(np.argmax(tfidf_values), tfidf_values.shape)
row_min, col_min = np.unravel_index(np.argmin(tfidf_values), tfidf_values.shape)
x_max,y_max = df_tfidf.index[row_max], df_tfidf.columns[col_max]
x_min,y_min = df_tfidf.index[row_min], df_tfidf.columns[col_min]
print("Max Word:",y_max,"\tValue:",df_tfidf.loc[x_max,y_max])
print("Min Word:",y_min,"\tValue:",df_tfidf.loc[x_min,y_min])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165 entries, 0 to 164
Columns: 1664 entries, aaa to york
dtypes: float64(1664)
memory usage: 2.1 MB
None 


Shape : (165, 1664) 


Size  : 274560 Elements
165 Rows * 1664 Cols = 274560 


Max Word: bank 	Value: 0.5398033313643824
Min Word: aaa 	Value: 0.0


Interpreting the Resulting Matrix:

The TfidfVectorizer builds on the same idea as the CountVectorizer, which creates a document-term matrix. To explain the concept in layman's terms: a document-term matrix gives us a bag-of-words model, in which scikit-learn breaks up the corpus (made up of individual documents, here each row of the original dataframe) into individual words/strings/numbers. By default a token is a run of two or more alphanumeric characters, with whitespace and punctuation acting as separators rather than a single delimiter such as " ". These strings are the tokens, and the distinct tokens make up the columns of the vectorized dataframe, in which each row represents a different document. So the document-term matrix holds individual documents as rows and tokens as columns, which makes it very useful for describing the frequency of tokens across all documents.

Now the Term Frequency–Inverse Document Frequency (tf-idf) vectorizer fills in the same document-term matrix in a different way. Instead of raw counts, each value comes from a weighting scheme that measures how important a word is within a document relative to the entire corpus: the weight grows with how often the term appears in that document and shrinks with how many documents in the corpus contain the term. So higher weights go to terms that are frequent in a document but comparatively rare in the corpus, and vice versa. This is what differentiates tf-idf from a regular CountVectorizer: it applies a weighting formula so that a word's value is judged relative to how the word is used everywhere else. Just because the word "the" appears many times in a document doesn't make it special; since "the" is seen very often across all documents, tf-idf treats it as carrying little importance and gives it a lower score. So a word like "bank" (the max value seen above) has a high score because it appears often in that one document while being comparatively rare in the rest of the corpus. Meanwhile "aaa" is reported as the minimum with a value of 0.0, but a zero only means the word does not occur in that particular document; many cells of the matrix are zero and `np.argmin` simply returns the first one, so this result does not show that "aaa" is used very often.

In [7]:
# Preview the first five documents; head() returns five rows when called without an argument.
df_tfidf.head()

Unnamed: 0,aaa,abate,abating,ability,about,above,abroad,absence,absent,accelerated,...,workers,working,works,would,written,year,years,yellen,yet,york
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.054593,0.0,0.0,0.0,0.060695
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070806


In [None]:
import timeit
# Wall-clock timing scaffold: nothing is being measured yet, so the printed
# value is only the overhead between the two clock reads.
clock = timeit.default_timer
start_time = clock()

# (the code to be timed goes between the two clock reads)

elapsed = clock() - start_time

# Elapsed time in seconds.
print(elapsed)