<a href="https://colab.research.google.com/github/d-tomas/text-mining/blob/main/notebooks/lecture_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Day 3**: Supervised Learning

## Initial setup

In [None]:
# Import the required libraries

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer  # Extract TF-IDF weighting schema
from sklearn.feature_extraction.text import CountVectorizer  # Extract TF weighting schema
from sklearn.metrics import accuracy_score  # Calculate the accuracy of the classifier
from sklearn.metrics import confusion_matrix  # Get the confusion matrix
from sklearn.model_selection import cross_val_score  # Cross-validation evaluation
from sklearn.model_selection import train_test_split  # Split the dataset into train and test
from sklearn.naive_bayes import MultinomialNB  # Naïve Bayes algorithm
from sklearn.neighbors import KNeighborsClassifier  # k-NN algorithm
from sklearn.neural_network import MLPClassifier  # Neural Networks algorithm
from sklearn.svm import SVC  # Support Vector Machines algorithm
from sklearn.tree import DecisionTreeClassifier  # Decision tree algorithm
import spacy

# Install the SpaCy model for English texts
spacy.cli.download('en_core_web_sm')

# Load the model (disable some functionalities not used in these exercises to save processing time)
# Only tokenisation and lexical attributes (punctuation, stopwords, lowercase) are needed below
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner', 'entity_linker', 'entity_ruler'])

# Download RePEC corpus
!wget https://raw.githubusercontent.com/d-tomas/text-mining/main/datasets/repec_s.csv
# Download cell phone opinions corpus
!wget https://raw.githubusercontent.com/d-tomas/text-mining/main/datasets/cell_phones.csv

## Example 1: text classification

In [None]:
# Show the first lines of the 'repec_s.csv' file

!head repec_s.csv

In [None]:
# Loading data from file into a Pandas DataFrame

data = pd.read_csv('repec_s.csv')
data

In [None]:
# Keep the contents of the abstract for classification

corpus = data['abstract']  # Store the abstracts
y = data['jel']  # Store the JEL category

In [None]:
# Plot the number of instances in each class

plt.figure(figsize=(10,8))
sns.countplot(x=y)
plt.show()

# F:	International Economics
# I:	Health, Education, and Welfare
# R:	Urban, Rural, Regional, Real Estate, and Transportation Economics
# M:	Business Administration and Business Economics | Marketing | Accounting | Personnel Economics

In [None]:
# Preprocessing
# For each abstract: remove punctuation, remove stopwords, and lowercase

def normalise(text):
  """Return 'text' lowercased, without punctuation marks or stopwords.

  The text is tokenised with the SpaCy pipeline 'nlp' loaded during the
  initial setup; the remaining tokens are joined with single spaces.
  """
  kept = []
  for token in nlp(text):  # Process the text with SpaCy
    # Discard punctuation marks and stopwords in a single pass
    if token.is_punct or token.is_stop:
      continue
    kept.append(token.lower_)  # Keep the lowercased form of the token
  return ' '.join(kept)

corpus_normalised = corpus.map(normalise)
corpus_normalised

In [None]:
# Create a 'classify' function that performs the training and testing
# Input parameters:
# - corpus: the dataset containing the text for train and test
# - model_name: the name of the algorithm that we want to use ('DT', 'KNN', 'MLP', 'NB' or 'SVM')
# - evaluation_type: the type of evaluation, train/test split ('split') or cross-validation ('cv')
# The function returns the trained model and the vectorizer
# Both are required if we want to perform predictions in the future based on this model

def classify(corpus, model_name, evaluation_type, labels=None):
  """Vectorise a corpus with TF-IDF, then train and evaluate a classifier.

  Parameters:
  - corpus: the texts used for training and testing
  - model_name: algorithm to use ('DT', 'KNN', 'MLP', 'NB' or 'SVM');
    any other value falls back to the linear SVM
  - evaluation_type: train/test split ('split') or 5-fold cross-validation ('cv')
  - labels: the class of each text in 'corpus'; when omitted, the
    notebook-level variable 'y' is used

  Returns the tuple (model, vectorizer); both are required to predict new
  samples. With 'split' the model is trained on the 80% training partition.
  With 'cv' it is trained on the whole corpus once the folds have been
  scored. With an unknown evaluation type a message is printed and the model
  is returned untrained.
  """
  if labels is None:
    labels = y  # Fall back to the labels defined in a previous notebook cell

  vectorizer = TfidfVectorizer()
  X = vectorizer.fit_transform(corpus)

  # Map each supported name to its class, so only the requested model is built
  factories = {
    'DT': DecisionTreeClassifier,  # Decision tree
    'KNN': KNeighborsClassifier,  # k-NN
    'MLP': MLPClassifier,  # Neural network
    'NB': MultinomialNB,  # Naïve Bayes
  }
  if model_name in factories:
    model = factories[model_name]()
  else:
    model = SVC(kernel='linear')  # SVM (also the fallback for unknown names)

  # The user chooses to evaluate with train/test split
  if evaluation_type == 'split':
    # Split into training (80%) and test (20%)
    X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=1)

    # Train the model
    model.fit(X_train, y_train)

    # Prediction on the test set
    predictions = model.predict(X_test)

    # Calculate the accuracy of the algorithm
    print('Accuracy: {:.2%}\n'.format(accuracy_score(y_test, predictions)))
    print('Confusion matrix:')

    # Fix the class order so rows and columns can be labelled with the class names
    classes = model.classes_
    matrix = confusion_matrix(y_test, predictions, labels=classes)

    plt.figure(figsize=(8, 6))
    # fmt='d' prints the counts as integers instead of scientific notation
    sns.heatmap(matrix, annot=True, fmt='d', linewidths=3, xticklabels=classes, yticklabels=classes)
    plt.xlabel('Predicted class')
    plt.ylabel('True class')
    plt.yticks(rotation=0)
    plt.show()
  # The user chooses to evaluate with k-fold cross validation
  elif evaluation_type == 'cv':
    scores = cross_val_score(model, X, labels, cv=5)  # 5-fold evaluation
    print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
    # cross_val_score fits clones of the model, so train the returned one explicitly
    model.fit(X, labels)
  else:
    print('Unknown evaluation type')

  return model, vectorizer

In [None]:
# Possible values for model name: DT, KNN, MLP, NB, SVM
# Possible values for evaluation type: split, cv
# Returns the trained model and the vectorizer for further predictions

# Train on the preprocessed abstracts produced by 'normalise'
model_repec, vectorizer_repec = classify(corpus_normalised, 'SVM', 'split')

In [None]:
# Prediction for a new (never seen before) sample

new_input = ['The prize of bananas and its correlation with global warming']

# Transform the new instance following the same procedure used when the model was created
new_input = vectorizer_repec.transform(new_input)
# 'predict' returns an array with one label per instance: keep the only one (F, I, R or M)
label = model_repec.predict(new_input)[0]

# Human-readable description of each JEL category
jel_descriptions = {
  'F': 'International Economics',
  'I': 'Health, Education, and Welfare',
  'R': 'Urban, Rural, Regional, Real Estate, and Transportation Economics',
  'M': 'Business Administration and Business Economics | Marketing | Accounting | Personnel Economics',
}
print(jel_descriptions.get(label, 'Unknown class'))

### Exercise

In [None]:
# Test with titles instead of abstracts
# Tip: corpus = data['title']

# Try using different n-gram sizes
# Tip: vectorizer = TfidfVectorizer(ngram_range=(1,2))  # Uses 1-grams and 2-grams

# Try using TF weighting schema
# Tip: vectorizer = CountVectorizer()

## Example 2: sentiment analysis

In [None]:
# Check the first lines of the file containing cell phones opinions

!head cell_phones.csv

In [None]:
# Loading data from file

data = pd.read_csv('cell_phones.csv')
data

In [None]:
# Extract the comments and labels

corpus = data['content']  # Store the comments
y = data['opinion']  # Store positive or negative labels

In [None]:
# Plot the classes

plt.figure(figsize=(5,4))
sns.countplot(x=y)
plt.show()

# POS: positive opinion
# NEG: negative opinion

In [None]:
# Preprocessing
# Re-use the 'normalise' function

corpus_normalised = corpus.map(normalise)
corpus_normalised

In [None]:
# Use the 'classify' function as before
# Possible values for model name: DT, KNN, MLP, NB, SVM
# Possible values for evaluation type: split, cv

# Train on the preprocessed comments produced by 'normalise'
model_phones, vectorizer_phones = classify(corpus_normalised, 'SVM', 'split')

In [None]:
# Prediction

new_input = ['I love this phone!!']
new_input = vectorizer_phones.transform(new_input)
# 'predict' returns an array with one label per instance: keep the only one (POS or NEG)
label = model_phones.predict(new_input)[0]

# Human-readable description of each opinion label
opinion_descriptions = {
  'POS': 'Positive opinion',
  'NEG': 'Negative opinion',
}
print(opinion_descriptions.get(label, 'Unknown class'))

### Exercise

In [None]:
# Create a wordcloud of positive opinions
# Tip: data = data[data['opinion'] == 'POS']
# Tip: from wordcloud import WordCloud  # Required library

In [None]:
# Create a wordcloud of negative opinions

## Transformers

🤗 [Transformers](https://huggingface.co/transformers/) library provides general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural Language Generation (NLG) with deep interoperability between Jax, PyTorch and TensorFlow.

There are more than 30,000 pre-trained [models](https://huggingface.co/models) and 2,000 [datasets](https://huggingface.co/datasets) available on their website, covering dozens of different tasks in more than 100 languages.

This demo exemplifies the use of [pipelines](https://huggingface.co/transformers/main_classes/pipelines.html). These pipelines are objects that abstract most of the complex code from the library, offering a simple API dedicated to several tasks, including Named Entity Recognition, Masked Language Modeling, Sentiment Analysis, and Question Answering.

The following examples are inspired by the 🤗 Transformers library [course](https://huggingface.co/course/chapter1/3?fw=pt).

In [None]:
# Install the Transformers library

!pip install transformers[sentencepiece]

In [None]:
from transformers import pipeline  # Import Transformer models

### Sentiment analysis
Classify a sentence according to positive or negative sentiments.

In [None]:
# Load the sentiment analysis model ('distilbert-base-uncased-finetuned-sst-2-english' by default)

model = pipeline('sentiment-analysis')

In [None]:
# Try it!

model('This is the best course I have ever attended in my life. Praise to David!')

### Zero-shot classification
Classify text according to a set of given labels.

In [None]:
# Load the zero-shot classification model ('facebook/bart-large-mnli' by default)

model = pipeline('zero-shot-classification')

In [None]:
# Try it!

model('This lecture is about Natural Language Processing', candidate_labels=['education', 'politics', 'business', 'sports'])

### Text generation
Predict the words that will follow a specified text prompt, creating a coherent portion of text that is a continuation from the given context.

In [None]:
# Load the text generation model ('gpt2' by default)

model = pipeline('text-generation')

In [None]:
# Try it! (you will get a different output each time)

model('I opened the door and found')

In [None]:
# Try it tuning some parameters (maximum length generated and number of returned sentences)!

model('The book was amazing', max_length=40, num_return_sequences=3)

### Masked language modelling
Mask a token in a sequence with a masking token, and prompt the model to fill that mask with an appropriate token.

In [None]:
# Load the masked language modelling model ('distilroberta-base' by default)

model = pipeline('fill-mask')

In [None]:
# Try it (returning the 'top_k' words)!

model('I <mask> this lecture.', top_k=5)

### Named entity recognition
Classify tokens according to a class (e.g. person, organisation or location).

In [None]:
# Load the named entity recognition model ('dbmdz/bert-large-cased-finetuned-conll03-english' by default)
# 'aggregation_strategy' merges the word pieces of each entity (it replaces the deprecated 'grouped_entities=True')

model = pipeline('ner', aggregation_strategy='simple')

In [None]:
# Try it!

model('My name is David and I live in Spain.')

### Question answering
Extract an answer from a text given a question.

In [None]:
# Load the question answering model ('distilbert-base-cased-distilled-squad' by default)

model = pipeline('question-answering')

In [None]:
# Try it!

model(question='Where do I work?', context='My name is David and I work really hard at the University of Alicante')

### Machine translation
Translate from one language to another.

In [None]:
# Load the machine translation model from ES to EN ('Helsinki-NLP/opus-mt-es-en')
# Try different models changing 'Helsinki-NLP/opus-mt-{src}-{tgt}' (src = source language, tgt = target)

model = pipeline('translation', model='Helsinki-NLP/opus-mt-es-en')

In [None]:
# Try it!

model('Ojalá el próximo año pueda ir a Alicante')

# References

* [RePEC](http://www.repec.org/)
* [JEL Classification System](https://www.aeaweb.org/econlit/jelCodes.php?view=jel)
* [Hugging Face](https://huggingface.co/)
