In [2]:
# Attach Google Drive to this Colab session. The article files read later in
# the notebook are addressed through this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Sample Code**
Convert words to vectors using Count Vectorizer

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# A one-document corpus from which the vocabulary is learned.
text = ["The quick brown fox jumped over the lazy dog. "]

# Create the vectoriser and learn the vocabulary in one chained call;
# vocabulary_ maps each token to its column index in the count matrix.
vectorizer = CountVectorizer().fit(text)
print("vectorizer.vocabulary_ :",vectorizer.vocabulary_)

# Encode the training document as a sparse row of per-token counts.
vector1 = vectorizer.transform(text)
print("vector1.shape: ",vector1.shape)
print("vector1.toarray(): ", vector1.toarray())
print("vectorizer.vocabulary_: ", vectorizer.vocabulary_)

# Encode a second sentence WITHOUT refitting. The vocabulary stays the one
# learned from `text`, so "puppy" (never seen during fit) has no column and is
# ignored; only "the" and "quick" contribute counts to vector2.
text2 = ["the the quick puppy"]
vector2 = vectorizer.transform(text2)
print("vector2.toarray(): ",vector2.toarray())




vectorizer.vocabulary_ : {'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}
vector1.shape:  (1, 8)
vector1.toarray():  [[1 1 1 1 1 1 1 2]]
vectorizer.vocabulary_:  {'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}
vector2.toarray():  [[0 0 0 0 0 0 1 2]]


TFIDF Example

In [4]:
import pandas as pd
import numpy as np

# Three short sample documents used by the TF-IDF cells that follow.
corpus = [
    "The quick brown fox jumped over the lazy dog.",
    "The dog jumped over the fox.",
    "The fox",
]

# Naive vocabulary: split on single spaces only. Case and punctuation are kept,
# so e.g. "dog" and "dog." (and "The" / "the") count as different words.
words_set = set()
for doc in corpus:
    words_set.update(doc.split(' '))

print('Number of words in the corpus:',len(words_set))
print('The words in the corpus: \n', words_set)


Number of words in the corpus: 11
The words in the corpus: 
 {'dog.', 'dog', 'over', 'jumped', 'quick', 'the', 'brown', 'The', 'fox', 'fox.', 'lazy'}


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the sample corpus and encode it in
# a single step; the result is a sparse matrix with one row per document.
tr_idf_model = TfidfVectorizer()
tf_idf_vector = tr_idf_model.fit_transform(corpus)

matrix_type = type(tf_idf_vector)
matrix_shape = tf_idf_vector.shape
print(matrix_type, matrix_shape)



<class 'scipy.sparse._csr.csr_matrix'> (3, 8)


In [6]:
# Dense copy of the sparse TF-IDF matrix so the weights can be printed directly.
tf_idf_array = tf_idf_vector.toarray()
print("tf_idf_array:",tf_idf_array)

# Column labels of the matrix, in column order. NOTE: this rebinds `words_set`
# from the naive Python set built earlier to the vectoriser's feature-name array.
feature_names = tr_idf_model.get_feature_names_out()
words_set = feature_names
print("words_set:",words_set)

tf_idf_array: [[0.39285725 0.29877806 0.23202782 0.29877806 0.39285725 0.29877806
  0.39285725 0.46405564]
 [0.         0.40772355 0.31663371 0.40772355 0.         0.40772355
  0.         0.63326741]
 [0.         0.         0.70710678 0.         0.         0.
  0.         0.70710678]]
words_set: ['brown' 'dog' 'fox' 'jumped' 'lazy' 'over' 'quick' 'the']


In [7]:
# Tabular view of the TF-IDF weights: one row per document, one labelled
# column per vocabulary term.
df_tf_idf = pd.DataFrame(data=tf_idf_array, columns=words_set)
print(df_tf_idf)

      brown       dog       fox    jumped      lazy      over     quick  \
0  0.392857  0.298778  0.232028  0.298778  0.392857  0.298778  0.392857   
1  0.000000  0.407724  0.316634  0.407724  0.000000  0.407724  0.000000   
2  0.000000  0.000000  0.707107  0.000000  0.000000  0.000000  0.000000   

        the  
0  0.464056  
1  0.633267  
2  0.707107  


Distance metrics

In [8]:
from nltk.metrics import *

# The two sample sentences being compared.
first_doc, second_doc = corpus[0], corpus[1]
print("corpus[0]:",first_doc)
print("corpus[1]:",second_doc)

# Character-level edit distance (nltk) on the raw strings; comparing a string
# with itself is the sanity check that should give 0.
print("Edit Distnance same string: ",edit_distance(first_doc,first_doc))
print("Edit Distnance: ",edit_distance(first_doc,second_doc))

# Set-based distances (nltk). set(<str>) is the set of distinct CHARACTERS in
# the string, so these compare character sets rather than word sets.
# NOTE(review): if word-level overlap was intended, the inputs should be
# tokenised first - confirm with the lab brief.
first_chars, second_chars = set(first_doc), set(second_doc)
print("Binary Distnance: ",binary_distance(first_chars,second_chars))
print("Jaccard Distnance: ",jaccard_distance(first_chars,second_chars))
print("Masi Distnance: ",masi_distance(first_chars,second_chars))

# The distances above come from nltk; the Euclidean distance below comes from
# scikit-learn and is computed between the count vectors vector1 and vector2.
from sklearn.metrics.pairwise import euclidean_distances
print("Euclidean Distnance: ",euclidean_distances(vector1,vector2))

corpus[0]: The quick brown fox jumped over the lazy dog.
corpus[1]: The dog jumped over the fox.
Edit Distnance same string:  0
Edit Distnance:  21
Binary Distnance:  1.0
Jaccard Distnance:  0.39285714285714285
Masi Distnance:  0.5932142857142857
Euclidean Distnance:  [[2.44948974]]


In [9]:

import os

from sklearn.feature_extraction.text import CountVectorizer

# Directory containing news articles
directory = "/content/drive/MyDrive/Colab Notebooks/TextMining/lab04-res"

# Load every regular file in the directory as one article (sub-directories are
# skipped). os.listdir() returns names in arbitrary order, so the names are
# sorted: articles[0] and articles[1] are compared below and must refer to the
# same files on every run for the distances to be reproducible.
articles = []
for filename in sorted(os.listdir(directory)):
    filepath = os.path.join(directory, filename)
    if os.path.isfile(filepath):
        # Explicit encoding so decoding does not depend on the platform default.
        with open(filepath, 'r', encoding='utf-8') as file:
            articles.append(file.read())

#3. vectorise them using the two types of vectorisers as given in
#the lecture code, They are :
#a. Count Vectoriser
#b. TFIDF Vectoriser

# Count Vectorizer: use a fresh instance so this cell does not depend on a
# vectoriser left in the kernel by an earlier cell.
vectorizer = CountVectorizer()
vectorizer.fit(articles)
# Learned vocabulary: token -> column index.
print(vectorizer.vocabulary_)

# Encode the articles: one row per article, one column per vocabulary term.
vector1 = vectorizer.transform(articles)
print(vector1.shape)

# Densify once and reuse the array for both the print and the distance.
count_array = vector1.toarray()
print(count_array)

# Squared Euclidean distance between the count vectors of the first two
# articles: sum over terms of (v1 - v2)^2. Left as the last expression so the
# notebook displays it.
np.sum(np.square(np.subtract(count_array[0], count_array[1])))



(4, 877)
[[0 0 1 ... 3 0 4]
 [0 1 0 ... 2 1 0]
 [0 1 0 ... 1 0 4]
 [1 0 1 ... 1 0 0]]


4710

In [10]:
#TFIDF
# Naive vocabulary of the articles: split on single spaces only, keeping case
# and punctuation. Used only for the two summary prints below.
words_set = set()
for doc in articles:
    words_set.update(doc.split(' '))

print('Number of words in the articles:',len(words_set))
print('The words in the articles: \n', words_set)

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF model on the articles and encode them as a sparse matrix.
tr_idf_model = TfidfVectorizer()
tf_idf_vector = tr_idf_model.fit_transform(articles)
print(type(tf_idf_vector), tf_idf_vector.shape)

# Dense weights plus their column labels. `words_set` is rebound here from the
# naive set above to the vectoriser's feature-name array.
tf_idf_array = tf_idf_vector.toarray()
words_set = tr_idf_model.get_feature_names_out()
df_tf_idf = pd.DataFrame(data=tf_idf_array, columns=words_set)
print(tf_idf_array)

# Squared Euclidean distance between the TF-IDF rows of the first two articles:
#   Distance(v1, v2) = sum over n of (v1[n] - v2[n])^2
row_difference = tf_idf_array[0] - tf_idf_array[1]
np.sum(np.square(row_difference))

Number of words in the articles: 1069
The words in the articles: 
<class 'scipy.sparse._csr.csr_matrix'> (4, 877)
[[0.         0.         0.01964465 ... 0.03900778 0.         0.0785786 ]
 [0.         0.01382483 0.         ... 0.01830103 0.01753505 0.        ]
 [0.         0.01355811 0.         ... 0.00897397 0.         0.05423242]
 [0.03815386 0.         0.03008095 ... 0.01991027 0.         0.        ]]


1.2032868966947783

In [16]:


from sklearn.metrics.pairwise import euclidean_distances

# Count matrix of the articles: one row per article.
print("vector1.shape: ",vector1.shape)
print("vector1.toarray(): ", vector1.toarray())

# Row slices of the sparse matrix keep a 2-D (1, n_terms) shape, which is the
# form euclidean_distances is given below.
first_row = vector1[0]
second_row = vector1[1]
print("vector1[0].shape: ",first_row.shape)
print("vector1[0].toarray(): ", first_row.toarray())

print("Euclidean Distnance: ",euclidean_distances(first_row,second_row))

#===================================================

from nltk.metrics import *

# From here on `corpus` refers to the loaded articles, replacing the
# three-sentence sample corpus defined earlier in the notebook.
corpus = articles
doc_a, doc_b = corpus[0], corpus[1]

# set(<str>) is the set of distinct characters of each article, so the three
# set-based distances compare character sets, not word sets.
chars_a, chars_b = set(doc_a), set(doc_b)

print("Edit Distnance same string: ",edit_distance(doc_a,doc_a))
print("Edit Distnance: ",edit_distance(doc_a,doc_b))
print("Binary Distnance: ",binary_distance(chars_a,chars_b))
print("Jaccard Distnance: ",jaccard_distance(chars_a,chars_b))
print("Masi Distnance: ",masi_distance(chars_a,chars_b))


vector1.shape:  (4, 877)
vector1.toarray():  [[0 0 1 ... 3 0 4]
 [0 1 0 ... 2 1 0]
 [0 1 0 ... 1 0 4]
 [1 0 1 ... 1 0 0]]
vector1[0].shape:  (1, 877)
vector1[0].toarray():  [[ 0  0  1  0  3  1  1  5  0  0  1  0  2  0  1  0  0  0  0  0  0  0  3  1
   1  1  0  0  1  0  0  1  0  2  2  0  4  0  0  0  0  0  0  1  0  0  0  1
   0  1  4  0  0 14  0  1  0  0  0  0  1  0  0  1  0  0  0  0  0  1  1  0
   3  0  0  0  8  0  1  0  4  0  1  0  0  0  0  1  0  0  1  1  4  0  0  1
   0  0  0  0  1  2  1  0  0  0  0  0  0  1  1  0  0  0  1  1 14  0  0  1
   0  0  0  2  1  0  2  3  0  0  1  0  0  0  0  1  0  0  0  0  0  0  0  0
   0  0  0  0  2  0  0  1  0  8  0  0  0  0  0  0  0  0  1  0  0  0  2  1
   0  2  0  2  0  0  0  0  1  0  0  1  0  1  0  0  0  7  0  0  0  0  0  0
   1  0  0  1  0  0  0  0  1  0  0  1  0  0  0  6  0  0  3  0  0  0  0  0
   0  0  0  0  6  0  0  0  0  0  0  0  0  0  0  1  0  0  0  1  0  1  0  1
   0  0  2  0  0  1  0  0  1  1  0  0  1  1  1  0  0  0  0  0  0  0  0  0
   0  0  0  0