# IR Assignment

## Part 3 - Vectorization



### Step 1 - Word vectorization for each word in each document in each group of documents

In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
# Raw-download URLs of the cleaned word-level CSVs, one per news source.
# NOTE(review): `bbc_wrod_link` is misspelled ("wrod"); the name is left unchanged here.

aj_word_link = "https://github.com/dattali18/IR_Assignments/blob/main/Assignment.01/data/word1/A_J_word.csv?raw=true"
bbc_wrod_link = "https://github.com/dattali18/IR_Assignments/blob/main/Assignment.01/data/word1/BBC_word.csv?raw=true"
jp_word_link = "https://github.com/dattali18/IR_Assignments/blob/main/Assignment.01/data/word1/J_P_word.csv?raw=true"
nyt_word_link = "https://github.com/dattali18/IR_Assignments/blob/main/Assignment.01/data/word1/NYT_word.csv?raw=true"

In [3]:
# Raw-download URLs of the cleaned lemma-level CSVs, one per news source.

aj_lemma_link = "https://github.com/dattali18/IR_Assignments/blob/main/Assignment.01/data/lemma1/A_J_lemma.csv?raw=true"
bbc_lemma_link = "https://github.com/dattali18/IR_Assignments/blob/main/Assignment.01/data/lemma1/BBC_lemma.csv?raw=true"
jp_lemma_link = "https://github.com/dattali18/IR_Assignments/blob/main/Assignment.01/data/lemma1/J_P_lemma.csv?raw=true"
nyt_lemma_link = "https://github.com/dattali18/IR_Assignments/blob/main/Assignment.01/data/lemma1/NYT_lemma.csv?raw=true"

In [4]:
aj_word_file = "A_J_word.csv"
bbc_word_file = "BBC_word.csv"
jp_word_file = "J_P_word.csv"
nyt_word_file = "NYT_word.csv"

In [5]:
aj_lemma_file = "A_J_lemma.csv"
bbc_lemma_file = "BBC_lemma.csv"
jp_lemma_file = "J_P_lemma.csv"
nyt_lemma_file = "NYT_lemma.csv"

In [6]:
# we will do this only for one file to demonstrate and then add a loop for all the others

import requests

# Download the A_J word-level CSV and store it under `aj_word_file`.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# makes an HTTP error fail loudly instead of saving an error page as the CSV.
respose = requests.get(aj_word_link, timeout=30)
respose.raise_for_status()
with open(aj_word_file, 'wb') as f:
    f.write(respose.content)
# Report success only after the file has been written and closed.
print(f"Downloaded the file into {aj_word_file}")

Downloaded the file into A_J_word.csv


In [7]:
# read the file using pandas
import pandas as pd

# the df is id, document
df = pd.read_csv(aj_word_file)

df.head()

Unnamed: 0,id,document
0,aj_1,"pope renews call for gaza ceasefire , release ..."
1,aj_2,biden is still the best us president israel co...
2,aj_3,israeli air strikes continue across gaza as tr...
3,aj_4,police remove pro - palestinian students from ...
4,aj_5,mass graves found at southern gaza hospital ra...


In [8]:
# 1. tokenize the words
# 2.1 remove all stopword
# 2.2 remove all punctuation marks
# 3. use Word2Vec to create a vector for each word in the document (convert list to set to remove duplicate)


import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [9]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def clean_text(text):
  """Tokenize ``text`` and return its lowercase, stopword-free word tokens.

  Steps, in order:
    1. tokenize with NLTK's ``word_tokenize``;
    2. lowercase every token;
    3. drop English stopwords;
    4. drop tokens that do not start with an ASCII letter (punctuation, digits).

  Duplicates are kept and the original token order is preserved.

  Args:
    text: the raw document string.

  Returns:
    A list of cleaned token strings.
  """
  # Lowercase *before* the stopword test: the stopword list is lowercase, so
  # filtering first would let capitalised stopwords ("The", "And") slip through.
  tokens = [word.lower() for word in word_tokenize(text)]

  # A set gives O(1) membership tests instead of scanning the list per token.
  stop_words = set(stopwords.words('english'))
  tokens = [word for word in tokens if word not in stop_words]

  # re.match only anchors at the start, so this keeps any token whose first
  # character is a letter and discards pure punctuation / numeric tokens.
  tokens = [word for word in tokens if re.match(r'[a-zA-Z]+', word)]

  return tokens


In [10]:
# 'tokens' holds the cleaned token list of each document;
# 'vocab' is the number of distinct tokens in that list.
df['tokens'] = df['document'].apply(clean_text)
df['vocab'] = df['tokens'].apply(lambda x: len(set(x)))

In [11]:
df[['vocab','tokens']].head()

Unnamed: 0,vocab,tokens
0,31,"[pope, renews, call, gaza, ceasefire, release,..."
1,39,"[biden, still, best, us, president, israel, co..."
2,32,"[israeli, air, strikes, continue, across, gaza..."
3,32,"[police, remove, pro, palestinian, students, p..."
4,26,"[mass, graves, found, southern, gaza, hospital..."


In [12]:
from gensim.models import Word2Vec

# Train Word2Vec Model and create a list of vectors for each word for each document

# Train Word2Vec on the tokenized documents. min_count=1 keeps every word in
# the vocabulary. workers=1 trades speed for repeatable results: multi-threaded
# training orders updates non-deterministically between runs.
model = Word2Vec(df['tokens'], min_count=1, workers=1)

def vectorize_words(tokens, wv=model.wv):
  """Return the Word2Vec vector of every token that is in the vocabulary.

  Args:
    tokens: list of token strings for one document.
    wv: keyed-vector lookup to use. It is bound to this cell's Word2Vec
      vectors at definition time, so the function keeps working even if the
      global name ``model`` is later rebound to a different kind of model.

  Returns:
    A list of vectors, one per in-vocabulary token, in token order
    (duplicates included; out-of-vocabulary tokens are skipped).
  """
  return [wv[word] for word in tokens if word in wv]

df['vectors'] = df['tokens'].apply(vectorize_words)

In [13]:
df['vectors'].head()

Unnamed: 0,vectors
0,"[[-0.008739821, 0.009628636, -0.005591511, 0.0..."
1,"[[-0.19034554, 0.24129803, 0.016133577, 0.0250..."
2,"[[-0.35843077, 0.43929887, 0.008117397, 0.0197..."
3,"[[-0.15680563, 0.20990284, -0.0010627547, 0.00..."
4,"[[-0.07560836, 0.09097688, 0.010101626, 0.0046..."


In [14]:
# create a matrix per document with all the vectors as column

import numpy as np

def create_matrix(vectors):
  """Pack a document's word vectors into one NumPy array.

  Each entry of ``vectors`` becomes one row of the result; an empty list
  yields an empty 1-D array.
  """
  return np.array(vectors)

# Build one array per document; .tolist() converts the resulting Series into a
# plain Python list of arrays before it is assigned as the 'matrix' column.
df['matrix'] = df['vectors'].apply(create_matrix).tolist()

#### Saving the data for the assignment

Getting the original data.

In [15]:
# save the matrices into csv files

# for each matrix in the 'matrix' row save the matrix into a file called "{id}_word.csv"

# Write each document's word-vector matrix to "<id>_word.csv".
# Only the first 2 documents are written for now (demo); drop both `.head(2)`
# calls to save every document.
# The loop variable is `doc_id`, not `id`, so the builtin `id()` is not
# shadowed in the notebook's global namespace.
for doc_id, matrix in zip(df['id'].head(2), df['matrix'].head(2)):
  np.savetxt(f"{doc_id}_word.csv", matrix, delimiter=",")

In [16]:
# cleaning the data
!rm data.xlsx

In [17]:
# downloading the data
import requests
import os
import pandas as pd

url = "https://github.com/dattali18/IR_Assignments/blob/main/Assignment.01/data/data.xlsx?raw=true"
output_filename = "data.xlsx"

# Fail loudly on an HTTP error: previously a non-200 response skipped the
# write but still printed the success message, so the next cell crashed on a
# missing file with no hint about the real cause.
response = requests.get(url, timeout=60)
response.raise_for_status()
with open(output_filename, "wb") as file:
    file.write(response.content)

# Reached only when the download succeeded and the file was written.
print(f"The file was downloaded, and it is in {output_filename}.")

The file was downloaded, and it is in data.xlsx.


In [18]:
# Read one worksheet per news source from the downloaded workbook, in the
# order A-J, BBC, J-P, NY-T.
data_aj, data_bbc, data_jp, data_nyt = (
    pd.read_excel(output_filename, sheet_name=sheet, engine="openpyxl")
    for sheet in ("A-J", "BBC", "J-P", "NY-T")
)

In [19]:
# Build the A-J document frame.
# Each document is the title, sub-title and body text joined by single spaces
# (missing cells count as ""), and each row gets an id "aj_<i>" with i starting at 1.

col_names_aj = ['title', 'sub_title', 'Body Text']

# keep only the three text columns and blank out missing values
data_aj = data_aj[col_names_aj].fillna("")

df_aj = pd.DataFrame(
    {"id": ["aj_" + str(i) for i in range(1, len(data_aj) + 1)]}
)
df_aj["document"] = (
    data_aj["title"] + " " + data_aj["sub_title"] + " " + data_aj["Body Text"]
)

print(df_aj.head())

     id                                           document
0  aj_1  pope renews call for gaza ceasefire, release o...
1  aj_2  biden is still the best us president israel co...
2  aj_3  israeli air strikes continue across gaza as tr...
3  aj_4  police remove pro-palestinian students from pa...
4  aj_5  mass graves found at southern gaza hospital ra...


### Step 2 - document matrix

Create a document matrix on the original data.

In [20]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')  # Download punkt tokenizer if not already downloaded

def preprocess_text(text):
    """Lowercase ``text`` and split it into word tokens; no other cleaning is applied."""
    lowered = text.lower()
    return word_tokenize(lowered)

# token lists of the original (uncleaned) documents
df_aj['tokens_uncleaned'] = df_aj['document'].apply(preprocess_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Tag every token list with its positional index so each document gets its own tag.
documents_uncleaned = [TaggedDocument(doc, [i]) for i, doc in enumerate(df_aj['tokens_uncleaned'])]

# Train Doc2Vec with 100-dimensional vectors and a context window of 5;
# min_count=1 keeps every token, and training uses 4 worker threads.
model_uncleaned = Doc2Vec(documents_uncleaned, vector_size=100, window=5, min_count=1, workers=4)

In [22]:
# Infer one vector per document from its token list using the trained model.
# NOTE(review): this re-infers vectors rather than reading the ones learned
# under each tag during training; inference is presumably not bit-for-bit
# repeatable between runs — confirm against the gensim docs.
df_aj['doc_vectors_uncleaned'] = [model_uncleaned.infer_vector(doc) for doc in df_aj['tokens_uncleaned']]

# print an example
print(df_aj['doc_vectors_uncleaned'][0])

[-0.00164655  0.07490247  0.01553972 -0.01797744 -0.01613013 -0.2737807
 -0.00511761  0.25950846 -0.12556376 -0.12204838 -0.07877547 -0.181569
 -0.04144086  0.07513987  0.02862035 -0.19424981  0.05662106 -0.16720426
  0.00911274 -0.25743002  0.12560393  0.00727799  0.10375846 -0.02296139
 -0.1473076   0.02431521 -0.06731863 -0.03891238 -0.14956291  0.03659498
  0.10817225  0.02003925  0.0503824  -0.04036291 -0.10304324  0.18932426
  0.05482943 -0.09025428 -0.03135174 -0.17667867  0.0914945  -0.11581706
 -0.01447068  0.05015148  0.08557668 -0.05160709 -0.04617869  0.00958086
  0.04601027  0.09251373  0.03226227 -0.09263323  0.02180664 -0.11239918
 -0.11834051  0.16201371  0.06074248  0.03502878 -0.13382986  0.08883511
  0.03095963  0.04076466  0.0569186   0.09473632 -0.08309249  0.16031575
 -0.02258376  0.13112049 -0.1545721   0.11365636  0.00133875  0.0502634
  0.21939053 -0.02252873  0.19234072  0.10084786  0.12398187 -0.04543443
 -0.12012256  0.00660647 -0.08411212  0.06030925 -0.198

### Step 3 - Creating a document matrix with BERT

The third step is to use BERT to get the document vector on the uncleaned data.

In [23]:
!pip install transformers==4.31.0
!pip install -U sentence-transformers

Collecting transformers==4.31.0
  Downloading transformers-4.31.0-py3-none-any.whl.metadata (116 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m112.6/116.9 kB[0m [31m4.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.9/116.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.31.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m59.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

In [24]:
from transformers import BertTokenizer, BertModel
import torch

In [32]:
import functools


@functools.lru_cache(maxsize=None)
def _load_bert(model_name):
  """Load the tokenizer and model for ``model_name`` once and cache them."""
  tokenizer = BertTokenizer.from_pretrained(model_name)
  bert = BertModel.from_pretrained(model_name)
  bert.eval()  # inference only: make sure dropout is disabled
  return tokenizer, bert


def get_bert_embedding(text):
  """Return a BERT embedding for ``text`` as a 1-D NumPy array.

  The embedding is the last-hidden-state vector at position 0 (the first
  token of the encoded input). Input beyond what the tokenizer allows is
  cut off (``truncation=True``).

  Args:
    text: the document string to embed.

  Returns:
    A 1-D ``numpy.ndarray`` holding the embedding of ``text``.
  """
  model_name = 'bert-base-uncased'  # Choose your desired BERT model
  # The tokenizer and model are cached: reloading them for every document
  # (as before) repeats the expensive load on every row of the DataFrame.
  tokenizer, bert = _load_bert(model_name)
  inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
  # No gradients are needed for inference; skipping them saves memory and time.
  with torch.no_grad():
    outputs = bert(**inputs)
  # Take the vector at position 0 of the last hidden state for the single input.
  embeddings = outputs.last_hidden_state[:, 0, :].numpy()
  return embeddings[0]

df_aj['bert_embeddings_uncleaned'] = df_aj['document'].apply(get_bert_embedding)

In [33]:
# # print an example vector with the original data

df_aj[['id', 'document','bert_embeddings_uncleaned']].head()

Unnamed: 0,id,document,bert_embeddings_uncleaned
0,aj_1,"pope renews call for gaza ceasefire, release o...","[-1.1045685, -0.09817083, 0.12469145, -0.28839..."
1,aj_2,biden is still the best us president israel co...,"[-0.5347208, 0.0031360134, -0.760571, -0.10111..."
2,aj_3,israeli air strikes continue across gaza as tr...,"[-0.405472, 0.05718441, -0.28402874, -0.544374..."
3,aj_4,police remove pro-palestinian students from pa...,"[-0.52913475, -0.26546007, -0.528176, 0.095606..."
4,aj_5,mass graves found at southern gaza hospital ra...,"[-0.37001178, -0.03767378, -0.24396741, -0.144..."


### Step 4 - Creating a Matrix from BERT-Sentence for each document
(not each word and then the document; the document vector is computed directly)

In [28]:
from sentence_transformers import SentenceTransformer

In [29]:
# Load the 'all-MiniLM-L6-v2' sentence-transformer.
# NOTE(review): this rebinds the global name `model`, which an earlier cell
# bound to the Word2Vec model; re-running earlier cells after this one may
# pick up the wrong object.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Convert documents to a list
documents = df_aj['document'].tolist()

# Generate embeddings (one per document, encoded in batches with a progress bar)
embeddings = model.encode(documents, show_progress_bar=True)


# Add embeddings as a new column (list() splits the result into one entry per document)
df_aj['embeddings'] = list(embeddings)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/19 [00:00<?, ?it/s]

In [30]:
# print the first few number of vector

df_aj[['id', 'document','embeddings']].head()

Unnamed: 0,id,document,embeddings
0,aj_1,"pope renews call for gaza ceasefire, release o...","[0.015109662, 0.09104693, 0.056800608, 0.00413..."
1,aj_2,biden is still the best us president israel co...,"[0.0175719, 0.09915805, 0.038959358, -0.085466..."
2,aj_3,israeli air strikes continue across gaza as tr...,"[0.07414572, 0.025160894, 0.06709119, -0.01467..."
3,aj_4,police remove pro-palestinian students from pa...,"[0.046261847, 0.12598646, 0.023747936, -0.0532..."
4,aj_5,mass graves found at southern gaza hospital ra...,"[0.002999045, 0.16610354, -0.020498874, -0.056..."
