<a href="https://colab.research.google.com/github/d-tomas/text-mining/blob/main/notebooks/lecture_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Day 2**: Document Representation

## Initial setup

In [None]:
# Import the required libraries

import nltk  # NLP library
from nltk.stem.porter import *  # Stemmer tool
from sklearn.feature_extraction.text import CountVectorizer  # Term by document matrix with TF
from sklearn.feature_extraction.text import TfidfVectorizer  # Term by document matrix with TF-IDF
import spacy  # NLP library

# Install the SpaCy model for English texts
spacy.cli.download('en_core_web_sm')

# Load the model
nlp = spacy.load('en_core_web_sm')

# Download example text files ('news.txt' and 'paper.txt')
!wget https://raw.githubusercontent.com/d-tomas/text-mining/main/datasets/news.txt
!wget https://raw.githubusercontent.com/d-tomas/text-mining/main/datasets/paper.txt

## Example 1: n-gram extraction


In [None]:
# Extract bigrams and trigrams from text

# Read the whole file as a single string; the encoding is stated explicitly
# so that decoding does not depend on the platform default
with open('news.txt', encoding='utf-8') as file:
    content = file.read()

tokens = content.split()  # split() the text into a list of words (done once, shared by both n-gram sizes)

list_bigrams = nltk.ngrams(tokens, 2)
list_trigrams = nltk.ngrams(tokens, 3)

# Print each n-gram collection under its own banner
for rule, title, ngrams in (
    ('---------', 'Bigrams:', list_bigrams),
    ('----------', 'Trigrams:', list_trigrams),
):
    print(rule)
    print(title)
    print(rule)
    for ngram in ngrams:
        print(ngram)

In [None]:
# The previous approach does not consider sentence boundaries
# We can read the file line by line and extract n-grams for each line separately

# Iterate over the file directly (one line at a time), stripping surrounding
# blanks and new line characters and discarding lines that end up empty.
# The encoding is explicit so that decoding does not depend on the platform default.
with open('news.txt', encoding='utf-8') as file:
    content = [stripped for stripped in (line.strip() for line in file) if stripped]

for line in content:
    # Extract 3-grams for each line, so no trigram spans two lines
    for trigram in nltk.ngrams(line.split(), 3):
        print(trigram)

### Exercise

In [None]:
# Repeat the analysis on 'paper.txt', extracting 4-grams and 5-grams in addition to bigrams and trigrams
# Use the first procedure (no need to consider sentence boundaries)


## Example 2: normalisation / pre-processing

In [None]:
# Pre-processing pipeline: remove punctuation, remove stopwords, lowercase and stem the words

text = 'The Netherlands earned sweet revenge on Spain on Friday at the Fonte Nova in Salvador, hammering Spain 5-1 to put an emphatic coda on their loss in the 2010 World Cup finals.'

document = nlp(text)  # Run the SpaCy pipeline over the raw sentence

# Step 1: discard the tokens flagged as punctuation
tokens_no_punct = list(filter(lambda token: not token.is_punct, document))
print(f'No punctuation: {tokens_no_punct}')

# Step 2: discard the tokens flagged as stopwords
tokens_no_stop = list(filter(lambda token: not token.is_stop, tokens_no_punct))
print(f'No stopwords: {tokens_no_stop}')

# Step 3: switch from SpaCy tokens to their lowercased text
words_lower = [token.lower_ for token in tokens_no_stop]
print(f'Lowercased: {words_lower}')

# Step 4: reduce every word to its Porter stem
stemmer = PorterStemmer()
stems = list(map(stemmer.stem, words_lower))
print(f'Stems: {stems}')

### Exercise

In [None]:
# Repeat the previous analysis on the content of 'paper.txt'


## Example 3: weighting scheme

In [None]:
# Build the term by document matrix using the TF weighting scheme

corpus = ['I do not like this restaurant', 'I like this restaurant very much', 'I think it is a very very bad place', 'I love this place']

# Unigram counts
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is its replacement.
# It returns a NumPy array, so tolist() keeps the vocabulary printed as a plain Python list.
print(vectorizer.get_feature_names_out().tolist())
print(X.toarray())
print(X.shape)

# Same matrix, but with word bigrams as the features
vectorizer2 = CountVectorizer(analyzer = 'word', ngram_range = (2, 2))  # Extract bigrams
X2 = vectorizer2.fit_transform(corpus)
print(vectorizer2.get_feature_names_out().tolist())
print(X2.toarray())
print(X2.shape)

In [None]:
# Build the term by document matrix using the TF-IDF weighting scheme
# (uses the 'corpus' list defined in the previous cell)

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is its replacement.
# It returns a NumPy array, so tolist() keeps the vocabulary printed as a plain Python list.
print(vectorizer.get_feature_names_out().tolist())
print(X.toarray())

### Exercise

In [None]:
# Get the term by document matrix for 'news.txt', using the TF weighting scheme and trigrams