<a href="https://colab.research.google.com/github/d-tomas/text-mining/blob/main/notebooks/lecture_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Day 1**: Text Mining Basics

## Initial setup

In [None]:
# Import the required libraries

import spacy  # NLP library
import pandas as pd  # Table manipulation
import matplotlib.pyplot as plt  # Visualisation
import seaborn as sns  # Visualisation
import nltk  # NLP library
from nltk.corpus import wordnet  # WordNet

# Install the SpaCy model for English texts
spacy.cli.download('en_core_web_sm')

# Download WordNet
nltk.download('wordnet')

# Load the model
nlp = spacy.load('en_core_web_sm')

# Download example text file ('news.txt')
!wget https://raw.githubusercontent.com/d-tomas/text-mining/main/datasets/news.txt

## Example 1: part-of-speech tagging

In [None]:
# Annotate a short sample text with the spaCy model

# Adjacent string literals are concatenated into a single string
text = (
    'Today is Thursday, February 4, 2021. '
    'It is 3:00 p.m. '
    'I am attending a Text Mining seminar at the University of Alicante, in Spain. '
    'The teacher is David. '
    'He tries to make it interesting but sometimes fails.'
)
document = nlp(text)

In [None]:
# Collect the sentences of the annotated document in a list

[sentence for sentence in document.sents]

In [None]:
# Show the morphological information (POS-tagging) of every word in the text

for token in document:  # One iteration per token (word) of the document
    # Label / value pairs printed for the current token, in display order
    fields = (
        ('Word', token.text),
        ('Lemma', token.lemma_),
        ('POS', token.pos_),
        ('POS fine', token.tag_),
    )
    for label, value in fields:
        print(label + ': ' + value)
    print('---')

In [None]:
# Use 'explain' to look up the meaning of a POS tag you do not understand

tag = 'VBZ'
spacy.explain(tag)

In [None]:
# Build a DataFrame with one row per token for further analysis

columns = ['Word', 'Lemma', 'POS', 'POS fine']
rows = [(token.text, token.lemma_, token.pos_, token.tag_) for token in document]
df = pd.DataFrame(data=rows, columns=columns)
df

In [None]:
# Summary statistics of the DataFrame columns

summary = df.describe()
summary

In [None]:
# How many verbs does the text contain?

pos_tag = 'VERB'  # Substitute 'VERB' with any other POS tag (e.g. 'PUNCT')
df['POS'].eq(pos_tag).sum()

In [None]:
# The DataFrame also allows some interesting visualisations
# Bar plot with the count of each POS tag

fig, ax = plt.subplots(figsize=(14, 7))
sns.countplot(x='POS', data=df, ax=ax)
plt.xticks(rotation=-45)  # Rotate the labels so that they do not overlap
plt.show()

### Exercise

* Perform POS tagging on the content of the file 'news.txt'

In [None]:
# Store all the content of the file 'news.txt' in the variable 'content'
# The encoding is given explicitly so that the text is decoded the same way on
# every platform instead of depending on the locale default
# NOTE(review): assumes 'news.txt' is UTF-8 encoded - confirm against the dataset
with open('news.txt', encoding='utf-8') as file:
    content = file.read()

# Insert your code below


* How many adjectives are there in the text?

In [None]:
# Insert your code below


## Example 2: shallow parsing

In [None]:
# Print every noun phrase (noun chunk) found in the text

for noun_phrase in document.noun_chunks:
    print(f'Noun phrase: {noun_phrase.text}')

In [None]:
# Draw the parse tree of the document with 'displacy'

render_options = {'compact': True}
spacy.displacy.render(document, style='dep', options=render_options, jupyter=True)

In [None]:
# Walk through the dependency tree, one token at a time
# - 'head' and 'child' are the words connected in the dependency tree
# - 'dep' is the type of syntactic relation that links 'child' to 'head'

for token in document:
    children = list(token.children)  # Tokens that depend directly on this one
    print(token.text, token.dep_, token.head.text, token.head.pos_, children)

### Exercise

Complete the code in the following cell to (shallow) parse the content in the file 'news.txt'.

In [None]:
# Insert your code below

## Example 3: semantic analysis with WordNet

In [None]:
# List every synset of a word together with its main details

word = 'dog'

for sense in wordnet.synsets(word):
    first_lemma = sense.lemmas()[0]  # Only the first lemma of the synset is shown
    print(f'Synset: {sense.name()}')
    print(f'Lemma: {first_lemma.name()}')
    print(f'Meaning: {sense.definition()}')
    print(f'Examples: {sense.examples()}')
    print('---')

In [None]:
# Get synonyms and antonyms

word = 'tall'

list_synsets = wordnet.synsets(word)
list_sinonyms = set()  # Use 'set' instead of 'list' to avoid duplicates
list_antonyms = set()
for synset in list_synsets:
    for lemma in synset.lemmas():
        list_sinonyms.add(lemma.name())
        # Keep every antonym of the lemma, not only the first one
        # (an empty antonym list simply adds nothing)
        list_antonyms.update(antonym.name() for antonym in lemma.antonyms())

# Sort before printing so that the output is identical on every run
# (the iteration order of a set of strings is not stable between sessions)
print('Synonyms: ' + str(sorted(list_sinonyms)))
print('Antonyms: ' + str(sorted(list_antonyms)))

In [None]:
# Print the full chain of hypernyms of a word

word = 'terrier'


def hypernyms(s):
    """Return the hypernyms of the synset 's'."""
    return s.hypernyms()


synset = wordnet.synsets(word)[0]  # Only the first synset of the word is used

# 'closure' is given 'hypernyms' as the relation to follow from 'synset'
print(list(synset.closure(hypernyms)))