## Text Processing Code

In [None]:
import string
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer # BOW
from sklearn.feature_extraction.text import TfidfVectorizer # TF-IDF
from sklearn.metrics.pairwise import cosine_similarity # Cosine Similarity


### Run code below once to download NLTK resources.

In [None]:
# Fetch the NLTK data packages this notebook uses (per the heading above,
# this only needs to be run once).
nltk.download('stopwords')  # stop-word lists read via stopwords.words(...)
nltk.download('wordnet')    # WordNet data; presumably what WordNetLemmatizer needs - confirm

### CODE TINKERING: Bag of Words (BOW)

In our example, there are only 3 documents in our corpus.

In [None]:
# Toy corpus: three short sentences, one string per document.
docs = [
    "John has some cats.",
    "Cats, being cats, eat fish.",
    "I ate a big fish.",
]

#### Prepare Lemmatizer and Stop-Words.

In [None]:
# Shared text-cleaning resources: the English stop-word list and a
# WordNet-based lemmatizer.
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

### Perform data cleansing.

In [None]:
def preprocess(docs):
    """Normalise raw documents so they can be fed to a vectorizer.

    For each document: strip punctuation, lower-case, split on whitespace,
    drop stop-words, lemmatize every remaining token as a verb (``pos='v'``)
    and re-join the tokens with single spaces.

    Relies on the notebook-level ``lemmatizer`` and ``stop_words`` objects.

    Args:
        docs: An iterable of strings. A single bare string is treated as a
            one-document corpus instead of being iterated character by
            character.

    Returns:
        A list with one cleaned string per input document, in input order.
    """
    if isinstance(docs, str):
        docs = [docs]

    punc = str.maketrans('', '', string.punctuation)
    # Set membership avoids scanning the whole stop-word list for every token.
    stop_set = set(stop_words)

    docs_clean = []
    for doc in docs:
        # NOTE(review): punctuation is removed *before* the stop-word check, so
        # a contraction such as "don't" becomes "dont" and would not match a
        # stop-word entry that still carries its apostrophe - confirm whether
        # that matters for your corpus.
        words = doc.translate(punc).lower().split()
        # Keep only non-stop-words, reduced to their verb lemma.
        words = [lemmatizer.lemmatize(word, 'v')
                 for word in words if word not in stop_set]
        docs_clean.append(' '.join(words))

    return docs_clean


docs_clean = preprocess(docs)
docs_clean  # cleaned corpus: no punctuation, no stop-words, lemmatized tokens

### Generate our Feature Vectors using Bag of Words.

In [None]:
# Learn the vocabulary from the cleaned corpus first ...
bow = CountVectorizer().fit(docs_clean)

# ... then count each vocabulary term per document. The result is converted
# to a dense numpy array so the counts can be read directly.
feature_vectors = bow.transform(docs_clean).toarray()
feature_vectors

### View our vocabulary (every unique word in our corpus is a feature)

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when available and fall back on older releases,
# keeping `vocab` a plain Python list either way.
if hasattr(bow, 'get_feature_names_out'):
    vocab = bow.get_feature_names_out().tolist()
else:
    vocab = bow.get_feature_names()

vocab

### Pretty-print our BOW results by combining our vocabulary and feature vectors into a pandas DataFrame.

In [None]:
# Derive the row labels from the number of feature vectors so the frame still
# builds if documents are added to or removed from the corpus.
doc_labels = [f'doc{i}' for i in range(1, len(feature_vectors) + 1)]

df = pd.DataFrame(data=feature_vectors,
                  index=doc_labels,
                  columns=vocab)

# Display the table (rows = documents, columns = vocabulary terms).
df

## CODE TINKERING: TF-IDF

### Generate feature vectors using TF-IDF.

In [None]:
# Fit the TF-IDF vectorizer on the preprocessed corpus ...
tfidf = TfidfVectorizer().fit(docs_clean)

# ... then turn each cleaned document into its TF-IDF weight vector,
# converted to a dense numpy array for display.
feature_vectors = tfidf.transform(docs_clean).toarray()
feature_vectors

### View our vocabulary.

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when available and fall back on older releases,
# keeping `vocab` a plain Python list either way.
if hasattr(tfidf, 'get_feature_names_out'):
    vocab = tfidf.get_feature_names_out().tolist()
else:
    vocab = tfidf.get_feature_names()

vocab

Pretty-print our TF-IDF results.

In [None]:
# Derive the row labels from the number of feature vectors so the frame still
# builds if documents are added to or removed from the corpus.
doc_labels = [f'doc{i}' for i in range(1, len(feature_vectors) + 1)]

df = pd.DataFrame(data=feature_vectors,
                  index=doc_labels,
                  columns=vocab)

# Display the table (rows = documents, columns = vocabulary terms).
df

**CODE TINKERING: Cosine Similarity**

Given a query string, compare it with the corpus for similarity.

In [None]:
# The search phrase, wrapped in a list so it can be passed through the same
# document-list pipeline as the corpus.
query = [
    "cats and fish",
]

Preprocess our query string.

In [None]:
# Clean the query with the same preprocess() used for the corpus, so its
# tokens are comparable with the vocabulary learned from docs_clean.
query_clean = preprocess(query)
query_clean

Use the TF-IDF feature vectors in our Cosine Similarity computation.

In [None]:
# transform() (not fit_transform()) reuses the already-fitted `tfidf`
# vectorizer, so the query is projected onto the corpus vocabulary.
query_feature_vector = tfidf.transform(query_clean).toarray()
query_feature_vector

Pretty-print our query's feature vector.

In [None]:
# One-row table: the query's TF-IDF weights, labelled by vocabulary term.
query_df = pd.DataFrame(
    query_feature_vector,
    columns=vocab,
    index=['query string'],
)

query_df

Compute Cosine Similarity between the feature vectors.

In [None]:
# NOTE: `feature_vectors` must hold the TF-IDF vectors at this point. The BOW
# section assigns the same name earlier, so run the TF-IDF cells before this one.
similarity = cosine_similarity(query_feature_vector, feature_vectors)

# One column per corpus document; labels are derived from the result's width
# so the frame still builds if the corpus size changes.
cs = pd.DataFrame(data=similarity,
                  index=['cosine similarity'],
                  columns=[f'doc{i}' for i in range(1, similarity.shape[1] + 1)])

cs