# Tokenisation and basic feature vectors

1. Install libraries

In [None]:
# Shell escape: clone the surrey-nlp/NLP-2023 GitHub repository into the
# current working directory.
! git clone https://github.com/surrey-nlp/NLP-2023.git

In [None]:
%pip install numpy
%pip install pandas
%pip install sklearn
%pip install nltk

In [2]:
import numpy as np
import pandas as pd
import sklearn
import nltk

In [3]:
# Sample text used throughout the notebook. It is a single triple-quoted
# string spanning three lines, so it contains two newline characters plus the
# leading indentation of the continuation lines.
sentence = """ Welcome to the Natural Language Processing lab!
               We'll learn many things in this no 1 lab, so we will take it easy.
               Natural Language Processing is fun."""

In [4]:
# Split the text on whitespace to obtain the list of tokens. Punctuation is
# not separated, so tokens such as "lab!" and "easy." keep their trailing mark.
word_tokens = sentence.split()

# Vocabulary: the unique tokens, sorted with Python's default string ordering.
vocabs = sorted(set(word_tokens))

# Number of tokens (repeats included) and number of distinct tokens; together
# they give the shape of the one-hot matrix built below.
len_word_tokens = len(word_tokens)
len_vocab = len(vocabs)

# Look up each token's column through a dict built once, instead of calling
# vocabs.index(token) (a linear scan of the vocabulary) for every token.
vocab_index = {token: column for column, token in enumerate(vocabs)}

# One-hot encoding: row i has a single 1 in the column of the i-th token.
matrix = np.zeros((len_word_tokens, len_vocab), int)
for i, token in enumerate(word_tokens):
    matrix[i, vocab_index[token]] = 1

print(word_tokens)

['Welcome', 'to', 'the', 'Natural', 'Language', 'Processing', 'lab!', "We'll", 'learn', 'many', 'things', 'in', 'this', 'no', '1', 'lab,', 'so', 'we', 'will', 'take', 'it', 'easy.', 'Natural', 'Language', 'Processing', 'is', 'fun.']


In [None]:
# Instead of building the one-hot matrix by hand, pandas can produce it in a
# single call. dtype=int keeps the entries as 0/1 integers like the NumPy
# matrix; without it, pandas 2.0 and later return boolean columns.
# Note: this rebinds `matrix` to a pandas DataFrame.
matrix = pd.get_dummies(word_tokens, dtype=int)

In [None]:
# Binary bag-of-words for the whole text: every distinct token becomes a
# column holding 1 (presence only; repeated tokens are not counted).
token_presence = dict.fromkeys(word_tokens, 1)

# A Series of the presence flags becomes a one-column frame, which is then
# transposed into a single row with one column per token.
presence_column = pd.DataFrame(pd.Series(token_presence))
df_bow = presence_column.T
df_bow


Unnamed: 0,Welcome,to,the,Natural,Language,Processing,lab!,We'll,learn,many,...,1,"lab,",so,we,will,take,it,easy.,is,fun.
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [None]:
# Treat every line of the text as a separate document and tokenise each one
# on whitespace.
sentence_row_list = sentence.split('\n')
sentence_row_list_token = [line.split() for line in sentence_row_list]

# Map each document's position (as a string key) to a {token: 1} dict that
# records which tokens occur in that document.
corpus_dict = {
    f"{position}": {token: 1 for token in line_tokens}
    for position, line_tokens in enumerate(sentence_row_list_token)
}

# After the transpose, rows are documents and columns are tokens; a token that
# does not occur in a document is left as NaN.
df = pd.DataFrame.from_records(corpus_dict).T

# Display with tokens as rows and documents as columns.
df.T

Unnamed: 0,0,1,2
Welcome,1.0,,
to,1.0,,
the,1.0,,
Natural,1.0,,1.0
Language,1.0,,1.0
Processing,1.0,,1.0
lab!,1.0,,
We'll,,1.0,
learn,,1.0,
many,,1.0,


In [None]:
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer

# Bigrams: every pair of adjacent tokens, materialised as a list of tuples.
ngrams_2 = list(ngrams(word_tokens, 2))

# Print the second element of the second bigram.
second_bigram = ngrams_2[1]
print(second_bigram[1])

# Download the NLTK resources requested by the original cell before using the
# lemmatizer.
for resource in ("wordnet", "omw-1.4"):
    nltk.download(resource)

lemmatizer = WordNetLemmatizer()

# First call uses the lemmatizer's default part of speech; the second passes
# "a" as the part-of-speech argument.
print(lemmatizer.lemmatize("better"))
print(lemmatizer.lemmatize("better", "a"))

the
better
good


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
from scipy.spatial.distance import euclidean

# Replace missing entries with 0 and convert the frame to an integer NumPy
# array.
data_matrix = df.fillna(0).astype(int).to_numpy()

# Euclidean distance between the first two rows of the matrix.
first_row, second_row = data_matrix[0], data_matrix[1]
dist = euclidean(first_row, second_row)
print(dist)

4.69041575982343
