# Text representation

Code notebook for TAHLR Working Group (Spring 2024) based on:  

- Vajjala, S., Majumder, B., Gupta, A., and Surana, H. 2020. *Practical Natural Language Processing: A Comprehensive Guide to Building Real-World NLP Systems*. Sebastopol, CA: O’Reilly Media.

More info on book here: https://www.oreilly.com/library/view/practical-natural-language/9781492054047/

In [None]:
# Imports

import string
from pprint import pprint

import pandas as pd

## One-hot encoding

In [None]:
# Normalise the toy corpus: lowercase every sentence and drop the full stops

documents = [
    "Dog bites man.",
    "Man bites dog.",
    "Dog eats meat.",
    "Man eats food.",
]
processed_docs = [sentence.replace(".", "").lower() for sentence in documents]
processed_docs

In [None]:
# Assign each distinct word a 1-based integer ID, in order of first appearance

vocab = {}
for doc in processed_docs:
    for word in doc.split():
        # setdefault leaves the ID of an already-seen word untouched;
        # a new word gets the next free ID (current size + 1)
        vocab.setdefault(word, len(vocab) + 1)
count = len(vocab)  # number of distinct words seen
print(vocab)

In [None]:
# Get one-hot encoded vector for a doc

def get_onehot_vector(somestring, vocabulary=None):
    """Return one one-hot vector per whitespace-separated word of `somestring`.

    Parameters
    ----------
    somestring : str
        Text to encode; it is split with ``str.split()``.
    vocabulary : dict, optional
        Mapping of word -> 1-based integer ID. When omitted, the
        notebook-level ``vocab`` is used, which must then still be that
        dict (later cells may rebind the name).

    Returns
    -------
    list[list[int]]
        One list per word, each of length ``len(vocabulary)``. The position
        ``ID - 1`` is set to 1 for a known word; a word that is not in the
        vocabulary yields an all-zero vector.
    """
    if vocabulary is None:
        vocabulary = vocab
    width = len(vocabulary)
    onehot_encoded = []
    for word in somestring.split():
        temp = [0] * width
        if word in vocabulary:
            # IDs start at 1, list positions at 0
            temp[vocabulary[word] - 1] = 1
        onehot_encoded.append(temp)
    return onehot_encoded

print(get_onehot_vector(processed_docs[1]))

In [None]:
# Get one-hot representation for a random text, using the above vocabulary.
# Words that are not keys of the vocabulary ("and", "are", "good") come back
# as all-zero vectors.

get_onehot_vector("man and dog are good") 

In [None]:
# Another example: the repeated word "man" yields the same one-hot vector twice

get_onehot_vector("man and man are good") 

## Bag of words

In [None]:
# Make a count vectorizer with scikit-learn

from sklearn.feature_extraction.text import CountVectorizer
CV = CountVectorizer()

# Build a BOW representation for the corpus (fit and transform in one call)
bow_rep = CV.fit_transform(processed_docs)

# Look at the vocabulary mapping, printed as a sorted list of (term, value) pairs
print("Our vocabulary: ", sorted(CV.vocabulary_.items()))

In [None]:
# See the BOW rep for the first 2 documents (rows 0 and 1 of bow_rep)
print("BoW representation for 'dog bites man': ", bow_rep[0].toarray())
print("BoW representation for 'man bites dog': ", bow_rep[1].toarray())

In [None]:
# See the BOW rep for all documents with pandas:
# one row per document, one column per feature name reported by CV

df = pd.DataFrame(bow_rep.toarray(), columns = CV.get_feature_names_out())
df

In [None]:
# Encode a new text with the existing CV object: `transform` is called here,
# not `fit_transform`, so the vectorizer is not fitted again

temp = CV.transform(["dog and dog are friends"])
print("Bow representation for 'dog and dog are friends':", temp.toarray())

In [None]:
# Count vectorizer with binarizer (binary=True).
# Same corpus and same new text as the plain CountVectorizer cells, so the
# two printed arrays can be compared for the repeated word "dog".

CVB = CountVectorizer(binary=True)
bow_rep_bin = CVB.fit_transform(processed_docs)
temp = CVB.transform(["dog and dog are friends"])
print("Bow representation for 'dog and dog are friends':", temp.toarray())

In [None]:
# Bag of n-grams

# n-gram vectorization example with count vectorizer and uni-, bi-, and trigrams
CVNG = CountVectorizer(ngram_range=(1,3))

# Build the n-gram BOW representation for the corpus. It gets its own name so
# that the unigram `bow_rep` (and the `df` built from it) from the earlier
# cells are not overwritten and those cells stay re-runnable.
bow_rep_ngram = CVNG.fit_transform(processed_docs)

# Look at the vocabulary mapping
print("Our vocabulary: ", CVNG.vocabulary_)

df_ngram = pd.DataFrame(bow_rep_ngram.toarray(), columns = CVNG.get_feature_names_out())
df_ngram

In [None]:
# Get the representation using this vocabulary for a new text
# (`transform` only: CVNG is not fitted again here)

temp = CVNG.transform(["dog and dog are friends"])
print("Bow representation for 'dog and dog are friends':", temp.toarray())

## TF-IDF

In [None]:
# Make tf-idf matrix: a TfidfVectorizer with default arguments, fitted on the
# same processed_docs as the count vectorizers above

from sklearn.feature_extraction.text import TfidfVectorizer

TV = TfidfVectorizer()
bow_rep_tfidf = TV.fit_transform(processed_docs)

In [None]:
# Get the idf values

# Deliberately not named `vocab`: that name holds the word -> ID dict that
# get_onehot_vector reads, and rebinding it to an array would break that function.
tfidf_terms = TV.get_feature_names_out()
idfs = TV.idf_

# Make dictionary mapping each term to its idf value

word_idfs = dict(zip(tfidf_terms, idfs))
pprint(word_idfs)

In [None]:
# Show tf-idf DTM (document-term matrix) as a DataFrame,
# one column per feature name reported by TV

tv_df = pd.DataFrame(bow_rep_tfidf.toarray(), columns = TV.get_feature_names_out())
tv_df

## Vectorization with Latin example

In [None]:
sents = """Alienum est omne, quicquid optando evenit.
Ab alio expectes, alteri quod feceris.
Animus vereri qui scit, scit tuta ingredi.
Auxilia humilia firma consensus facit.
Amor animi arbitrio sumitur, non ponitur.
Aut amat aut odit mulier, nil est tertium.
Ad tristem partem strenua est suspicio.
Ames parentem, si aequus est: si aliter, feras.
Aspicere oportet, quidquid possis perdere.
Amici vitia si feras, facias tua.
Alienum aes homini ingenuo acerba est servitus.
Absentem laedit, cum ebrio qui litigat.
Amans iratus muta mentitur sibi.
Avarus ipse miseriae causa est suae.
Amans quid cupiat scit, quid sapiat non vidit.
Amans quod suspicatur, vigilans somniat.
Ad calamitatem quilibet rumor valet.
Amor extorqueri non pote, elabi pote.
Ab amante lacrimis redimas iracundiam.
Aperte mala cum est mulier, tum demum est bona.
Avarum facile capias, ubi non sis item.
Amare et sapere vix deo conceditur.
Avarus nisi cum moritur, nil recte facit.
Astus cinaedum celat, aetas indicat.
Avarus damno potius quam sapiens dolet.
Avaro quid mali optes nisi Vivat diu?
Animo dolenti nil oportet credere.
Aliena nobis, nostra plus aliis placent.
Amare iuveni fructus est, crimen seni.
Anus cum ludit, morti delicias facit.
Amoris vulnus idem, qui sanat, facit.
Ad paenitendum properat cito qui iudicat.
Aleator quanto in arte est melior, tanto est nequior.
Amor otiosae causa sollicitudinis.
Avidum esse oportet neminem, minime senem.
Animo virum pudicae, non oculo eligunt.
Amantium ira amoris integratio est.
Amantis ius iurandum poenam non habet.
Amans, sicut fax, agitando ardescit magis.
Amor, ut lacrima, ab oculo oritur, in pectus cadit.
Animo imperabit sapiens, stultus serviet.
Amicum an nomen habeas, aperit calamitas.
Amori finem tempus, non animus facit.""".split("\n")

print(sents[1])

In [None]:
# Preprocess the sentences

# Translation table that deletes every character in string.punctuation; built
# once here instead of on every preprocess() call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(text: str) -> str:
    """Normalize a string before vectorization.

    Steps, in order: lowercase; delete the characters listed in
    ``string.punctuation``; replace 'v' with 'u' and 'j' with 'i';
    strip leading and trailing whitespace.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The normalized text. Interior whitespace is left unchanged.
    """
    text = text.lower()
    text = text.translate(_PUNCTUATION_TABLE)
    # Lowercasing happened first, so only lowercase 'v' and 'j' can occur here
    text = text.replace('v', 'u').replace('j', 'i')
    return text.strip()

sents = [preprocess(sent) for sent in sents]

print(sents[1])

In [None]:
# Make CV for the Latin sentences
#
# Separate names are used so that the toy-corpus `CV` and the `vocab` dict
# from the earlier sections are not overwritten by this cell.

CV_latin = CountVectorizer()
cv_dtm = CV_latin.fit_transform(sents).toarray()
cv_terms = CV_latin.get_feature_names_out()
cv_df = pd.DataFrame(cv_dtm, columns = cv_terms)
cv_df


In [None]:
# Make tf-idf for the Latin sentences
#
# Separate names are used so that the toy-corpus `TV` / `tv_df` and the
# `vocab` dict from the earlier sections are not overwritten by this cell.

TV_latin = TfidfVectorizer()
tv_dtm = TV_latin.fit_transform(sents).toarray()
tv_terms = TV_latin.get_feature_names_out()
tv_df_latin = pd.DataFrame(tv_dtm, columns = tv_terms)
tv_df_latin


## Relevance ranking for a Latin poem using a line-based TF-IDF document-term matrix (DTM)

In [None]:
# Get all lines in Lucretius' De Rerum Natura

from cltkreaders.lat import LatinTesseraeCorpusReader

CR = LatinTesseraeCorpusReader()

# Select the corpus files whose IDs match 'de_rerum_natura'
lucretius_files = CR.fileids(match='de_rerum_natura')
# Materialize the reader's lines into a list so they can be indexed and reused
lucretius_lines = list(CR.lines(fileids=lucretius_files))
# Keep each line's citation, in the same order as lucretius_lines
lucretius_cits = [line._.citation for line in lucretius_lines]

In [None]:
# Show CLTK Readers line citations

lucretius_cits[:5]

In [None]:
# Preprocess lines
#
# NOTE: this rebinds `lucretius_lines` from the reader's line objects to plain
# strings. The isinstance check keeps the cell safe to re-run: on a second run
# the elements are already strings, which have no `.text` attribute.

lucretius_lines = [
    preprocess(line if isinstance(line, str) else line.text)
    for line in lucretius_lines
]

In [None]:
lucretius_lines[:5]

In [None]:
# Create TFIDF matrix by line

TV = TfidfVectorizer()
# toarray() yields a plain ndarray (todense() yields the legacy np.matrix
# type), matching the .toarray() calls used in the rest of this notebook
dtm = TV.fit_transform(lucretius_lines).toarray()
# Not named `vocab`, so the word -> ID dict of the one-hot section is left intact
lucretius_terms = TV.get_feature_names_out()
df = pd.DataFrame(dtm, index=lucretius_cits, columns=lucretius_terms)
# Display only the first 10 term columns
df.iloc[:, :10]

In [None]:
# Create query vector; note the `transform` method: the query is encoded with
# the TV object already fitted on the poem's lines, not with a newly fitted one

import numpy as np  # also used by the ranking cell that follows

query = "hunc igitur terrorem animi tenebrasque necessest"
query = preprocess(query)
# toarray() returns a (1, n_features) ndarray directly, so no np.matrix -> array
# conversion or reshape is needed to line it up with the rows of df
query_vec = TV.transform([query]).toarray()

In [None]:
# Get cosine similarity scores for every row against the query vector and
# report the TOP_N most similar lines

from sklearn.metrics.pairwise import cosine_similarity
from tabulate import tabulate

# Number of best-matching lines to report. The cell previously took 15 lines
# while announcing "top ten"; the count and the message now share this value.
TOP_N = 15

df_array = df.to_numpy()
similarity_scores = cosine_similarity(query_vec, df_array)

# argsort is ascending, so reverse it and keep the first TOP_N positions
top_indices = np.argsort(similarity_scores[0])[::-1][:TOP_N]

print(f'Here are the top {TOP_N} most similar lines to the query by TFIDF:')

# One table row per hit: citation, preprocessed line text, similarity score
data = [
    [lucretius_cits[idx], lucretius_lines[idx], similarity_scores[0][idx]]
    for idx in top_indices
]

print(tabulate(data, headers=['Citation', 'Text', 'Similarity Score']))
