# IR Assignment

## Part 3 - Vectorization



### Step 1 - Word vectorization for each word in each document in each group of documents

In [None]:
# Raw-download URLs of the cleaned, word-level data (one CSV per source).

aj_word_link = "https://github.com/dattali18/IR_Assignments/blob/main/Assignment.01/data/word/A_J_word.csv?raw=true"
bbc_word_link = "https://github.com/dattali18/IR_Assignments/blob/main/Assignment.01/data/word/BBC_word.csv?raw=true"
jp_word_link = "https://github.com/dattali18/IR_Assignments/blob/main/Assignment.01/data/word/J_P_word.csv?raw=true"
nyt_word_link = "https://github.com/dattali18/IR_Assignments/blob/main/Assignment.01/data/word/NYT_word.csv?raw=true"

# Backward-compatible alias: the name was originally misspelled ("wrod").
bbc_wrod_link = bbc_word_link

In [None]:
# Raw-download URLs of the cleaned, lemma-level data (one CSV per source).

aj_lemma_link, bbc_lemma_link, jp_lemma_link, nyt_lemma_link = (
    f"https://github.com/dattali18/IR_Assignments/blob/main/Assignment.01/data/lemma/{source}_lemma.csv?raw=true"
    for source in ("A_J", "BBC", "J_P", "NYT")
)

In [None]:
# Local file names the word-level CSVs are saved under (same order as the links).
aj_word_file, bbc_word_file, jp_word_file, nyt_word_file = (
    f"{source}_word.csv" for source in ("A_J", "BBC", "J_P", "NYT")
)

In [None]:
# Local file names the lemma-level CSVs are saved under (same order as the links).
aj_lemma_file, bbc_lemma_file, jp_lemma_file, nyt_lemma_file = (
    f"{source}_lemma.csv" for source in ("A_J", "BBC", "J_P", "NYT")
)

In [None]:
# Download a single file to demonstrate the flow; a loop can repeat this for
# the remaining files later.

import requests

# Without a timeout, requests.get can block indefinitely on a stalled connection.
response = requests.get(aj_word_link, timeout=30)
# Fail loudly on an HTTP error instead of silently saving an error page as the CSV.
response.raise_for_status()

with open(aj_word_file, 'wb') as f:
    f.write(response.content)

print(f"Downloaded the file into {aj_word_file}")

Downloaded the file into A_J_word.csv


In [None]:
# Load the downloaded CSV into a DataFrame with pandas.
import pandas as pd

# Columns used by the later cells: `id` and `document`.
df = pd.read_csv(aj_word_file)

# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,id,document
0,aj_1,"pope renews call for gaza ceasefire , release ..."
1,aj_2,biden is still the best us president israel co...
2,aj_3,israeli air strikes continue across gaza as tr...
3,aj_4,police remove pro - palestinian students from ...
4,aj_5,mass graves found at southern gaza hospital ra...


In [None]:
# Plan for this step:
# 1. tokenize each document into words
# 2.1 remove all stop words
# 2.2 remove all punctuation marks
# 3. use Word2Vec to create a vector for each word in the document
#    (duplicate words are kept; a set is only used to count the vocabulary)


import nltk
# Resources needed below: the stop-word corpus and the 'punkt_tab' tokenizer data.
nltk.download('stopwords')
nltk.download('punkt_tab')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def clean_text(text, stop_words=None):
  """Tokenize `text` and return its cleaned, lowercase tokens.

  Steps: tokenize with NLTK's `word_tokenize`, lowercase every token, drop
  stop words, then drop tokens that do not start with an ASCII letter
  (punctuation marks, numbers). Token order and duplicates are preserved.

  Args:
    text: the document as a single string.
    stop_words: optional iterable of lowercase words to drop. Defaults to
      NLTK's English stop-word list.

  Returns:
    A list of cleaned token strings.
  """
  if stop_words is None:
    stop_words = stopwords.words('english')
  # A set gives O(1) membership tests; a list would be scanned once per token.
  stop_words = set(stop_words)

  # Lowercase BEFORE filtering stop words: the stop-word list is lowercase, so
  # filtering first would let capitalized stop words ("The", "Is") through.
  tokens = [word.lower() for word in word_tokenize(text)]

  # re.match only anchors at the start of the token, so this keeps every token
  # that begins with a letter (e.g. "covid-19") and drops pure punctuation.
  return [
      word for word in tokens
      if word not in stop_words and re.match(r'[a-zA-Z]+', word)
  ]



In [None]:
# Cleaned token list per document, plus the number of distinct tokens in it.
df['tokens'] = df['document'].map(clean_text)
df['vocab'] = df['tokens'].map(lambda doc_tokens: len(set(doc_tokens)))

In [None]:
# Preview the per-document vocabulary size next to the cleaned tokens.
df[['vocab','tokens']].head()

Unnamed: 0,vocab,tokens
0,31,"[pope, renews, call, gaza, ceasefire, release,..."
1,39,"[biden, still, best, us, president, israel, co..."
2,32,"[israeli, air, strikes, continue, across, gaza..."
3,32,"[police, remove, pro, palestinian, students, p..."
4,26,"[mass, graves, found, southern, gaza, hospital..."


In [None]:
from gensim.models import Word2Vec

# Train a Word2Vec model on the tokenized documents, then build the list of
# word vectors for every document.
#
# min_count=1 keeps every word, including those that occur only once.
# seed=1 is gensim's default, spelled out here; workers=1 removes the thread
# scheduling jitter that makes multi-worker training non-reproducible
# (full determinism may also need PYTHONHASHSEED to be fixed - see gensim docs).
w2v_model = Word2Vec(df['tokens'], min_count=1, seed=1, workers=1)

# Backward-compatible alias. A later cell rebinds the global `model` to the
# BERT model, so the code below uses the unambiguous `w2v_model` name.
model = w2v_model

def vectorize_words(tokens, keyed_vectors=None):
  """Return the Word2Vec vector of every token that the model knows.

  Args:
    tokens: iterable of token strings for one document.
    keyed_vectors: optional mapping supporting `word in kv` and `kv[word]`.
      Defaults to the vectors of the Word2Vec model trained above.

  Returns:
    A list with one vector per known token, in token order (duplicates kept).
    Tokens missing from the mapping are skipped.
  """
  if keyed_vectors is None:
    keyed_vectors = w2v_model.wv
  return [keyed_vectors[word] for word in tokens if word in keyed_vectors]

df['vectors'] = df['tokens'].apply(vectorize_words)

In [None]:
# Preview the list of word vectors built for the first documents.
df['vectors'].head()

Unnamed: 0,vectors
0,"[[-0.0019281333, 0.008156626, -0.007068119, 0...."
1,"[[-0.14649631, 0.18167569, 0.056561675, 0.0363..."
2,"[[-0.2832598, 0.33291975, 0.08413278, 0.040921..."
3,"[[-0.13531245, 0.14811765, 0.03161882, 0.01202..."
4,"[[-0.06782641, 0.08897954, 0.018120956, 0.0039..."


In [None]:
# create a matrix per document with one word vector per row

import numpy as np

def create_matrix(vectors):
  """Stack a document's list of word vectors into a single NumPy array.

  Each vector in `vectors` becomes one row of the returned array.
  """
  return np.array(vectors)

# Build one matrix per document and store the results in a new 'matrix' column.
df['matrix'] = df['vectors'].apply(create_matrix).tolist()

#### Saving the data for the assignment

In [None]:
# Save the matrices into CSV files: each document's matrix from the 'matrix'
# column is written to a file called "{id}_word.csv".

# Demo limit: only the first 2 documents are saved. Set to None to save all.
MAX_DOCS_TO_SAVE = 2

docs_to_save = df.iloc[:MAX_DOCS_TO_SAVE]  # a slice end of None selects every row

# `doc_id` rather than `id`, so the builtin id() is not shadowed for later cells.
for doc_id, matrix in zip(docs_to_save['id'], docs_to_save['matrix']):
  np.savetxt(f"{doc_id}_word.csv", matrix, delimiter=",")

### Step 2 - document matrix

Create a document vector by averaging the word vectors in the matrix above

In [None]:
import numpy as np

def create_document_vector(matrix):
  """Collapse a document's word-vector matrix into a single vector.

  `matrix` is a sequence of equal-length numeric rows (one per word); the
  result is the element-wise mean taken over those rows.
  """
  return np.array(matrix).mean(axis=0)


# One averaged vector per document, stored in a new 'document_vector' column.
df['document_vector'] = df['matrix'].apply(create_document_vector)

# Preview the resulting document vectors.
df['document_vector'].head()

Unnamed: 0,document_vector
0,"[-0.059707195, 0.07089727, 0.01880854, 0.01240..."
1,"[-0.0860565, 0.10473232, 0.02652022, 0.0155127..."
2,"[-0.09904994, 0.1139271, 0.030448232, 0.016716..."
3,"[-0.07787551, 0.0911199, 0.02394719, 0.0110787..."
4,"[-0.09571863, 0.10965918, 0.029226812, 0.01476..."


### Step 3 - Creating a document matrix with BERT

### Step 4 - Creating a matrix of BERT sentence embeddings, one per document
(the document is embedded directly, rather than embedding each word and then combining the word vectors)

In [None]:
!pip install transformers==4.28.1

Collecting transformers==4.28.1
  Downloading transformers-4.28.1-py3-none-any.whl.metadata (109 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/110.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.0/110.0 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.1)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m73.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall

In [None]:
import torch
from transformers import BertTokenizer, BertModel

model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)
bert_model.eval()  # inference only: make sure dropout is disabled

# Backward-compatible alias. NOTE: this rebinds the global `model`, which the
# Word2Vec cell above also assigns; the function below therefore uses the
# unambiguous `bert_model` name.
model = bert_model

def get_bert_embedding(text):
  """Return one BERT embedding vector for a whole document.

  The text is tokenized (truncation=True, so input beyond the model's maximum
  length is cut off) and passed through BERT; the hidden state of the first
  token ([CLS]) is used as the document embedding.

  Args:
    text: the document as a single string.

  Returns:
    A 1-D NumPy array taken from the last hidden layer at token position 0.
  """
  inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
  # No gradients are needed for inference; skipping autograd saves memory and
  # lets the output be converted to NumPy without a detach().
  with torch.no_grad():
    outputs = bert_model(**inputs)
  # Index [0, 0]: first (only) sequence in the batch, first token ([CLS]).
  return outputs.last_hidden_state[0, 0, :].numpy()

df['bert_embedding'] = df['document'].apply(get_bert_embedding)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


KeyError: 'bert_document_martix'

In [None]:
# Preview the BERT embedding computed for the first documents.
df['bert_embedding'].head()

Unnamed: 0,bert_embedding
0,"[-1.098856, -0.15124485, 0.10436112, -0.410788..."
1,"[-0.6285494, -0.13604051, -0.7933989, -0.10326..."
2,"[-0.40547132, 0.05718469, -0.28402787, -0.5443..."
3,"[-0.50750023, -0.28645, -0.55936044, 0.1201859..."
4,"[-0.41018546, -0.04626697, -0.25421903, -0.107..."
