<a href="https://colab.research.google.com/github/d-tomas/text-mining/blob/main/notebooks/lecture_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Day 3**: Supervised Learning

## Initial setup

In [None]:
# Import the required libraries

import matplotlib.pyplot as plt  # Para hacer gráficas
import numpy as np  # Obtener valores únicos en un vector
import pandas as pd
import seaborn as sns  # Visualización del mapa de calor
from sklearn.metrics import accuracy_score  # Calcular la precisión del clasificador
from sklearn.model_selection import train_test_split  # Separar el dataset en entrenamiento y test
from sklearn.metrics import confusion_matrix  # Sacar la matriz de confusión
from sklearn.metrics import mean_absolute_error  # Mean Absolut Error (MAE) para regresión
from sklearn.svm import SVC  # Algoritmo Support Vector Machines
from sklearn.tree import DecisionTreeClassifier  # Decission tree algorithm
from sklearn.naive_bayes import MultinomialNB  # Naïve Bayes
from sklearn.neural_network import MLPClassifier  # Neural Networks
from sklearn.neighbors import KNeighborsClassifier  # k-NN
from sklearn.feature_extraction.text import TfidfVectorizer  # Matriz de términos por documento con TF-IDF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score  # Cross-validation evaluation
from sklearn.pipeline import make_pipeline  # Chain vectorizer and classifier for cross-validation
import spacy  # NLP library

# Install the SpaCy model for English texts
spacy.cli.download('en_core_web_sm')

# Load the model
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner', 'entity_likner', 'entity_ruler'])

# Descargamos el corpus para entrenar y evaluar el sistema de regresión
!wget https://raw.githubusercontent.com/d-tomas/text-mining/main/datasets/repec_s.csv
# Descargamos el corpus para entrenar y evaluar el sistema de clasificación
!wget https://raw.githubusercontent.com/d-tomas/text-mining/main/datasets/cell_phones.csv

## Example 1: text classification

In [None]:
# Take a look at the first lines of the training corpus

!head repec_s.csv

In [None]:
# Loading data from file

data = pd.read_csv('repec_s.csv')  # Load the data from the file into a DataFrame
data  # Last expression of the cell: display the loaded DataFrame

In [None]:
# Classify based on the abstracts

corpus = data['abstract']  # Store the abstracts (the input texts)
# NOTE: `classify` reads this module-level `y`, so this cell must run before calling it
y = data['jel']  # Store the JEL category (the target labels)

In [None]:
# Plot how many documents there are per class

fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(x=y, ax=ax)
plt.show()

# Meaning of the JEL codes shown on the x axis:
# F: International Economics
# I: Health, Education, and Welfare
# R: Urban, Rural, Regional, Real Estate, and Transportation Economics
# M: Business Administration and Business Economics | Marketing | Accounting | Personnel Economics

In [None]:
# Preprocessing
# For each abstract: remove punctuation, remove stopwords, and lowercase
#corpus_normalised = list(nlp.pipe(corpus.values, disable=['parser', 'ner', 'entity_linker', 'entity_ruler']))

def normalise(text):
  """Return `text` lowercased, without punctuation and without stopwords.

  The text is processed with the module-level SpaCy pipeline `nlp`; the
  surviving tokens are joined back together with single spaces.
  """
  kept = []
  for token in nlp(text):
    # Skip punctuation marks and stopwords
    if token.is_punct or token.is_stop:
      continue
    kept.append(token.lower_)  # Lowercased form of the token
  return ' '.join(kept)

# Apply the normalisation to every document in the corpus
corpus_normalised = corpus.map(normalise)
corpus_normalised  # Display the normalised texts

In [None]:
# We need to turn the words into numbers
# Each word of a document is represented by its TF-IDF weight

def classify(corpus, model_name, evaluation_type, labels=None):
  """Train and evaluate a TF-IDF based text classifier.

  Args:
    corpus: Documents to learn from, one string per document
      (the notebook passes a pandas Series).
    model_name: 'DT' (decision tree), 'KNN' (k-NN), 'MLP' (neural network)
      or 'NB' (Naïve Bayes). Any other value (e.g. 'SVM') selects a
      linear-kernel SVM.
    evaluation_type: 'split' evaluates on a held-out 20% of the corpus;
      'cv' runs 5-fold cross-validation. Any other value only prints a
      warning.
    labels: Class label of each document. When omitted, the module-level
      variable `y` is used, which keeps existing three-argument calls working.

  Returns:
    A `(model, vectorizer)` tuple. With 'split' the model is trained on the
    80% training portion; with 'cv' it is trained on the whole corpus after
    the evaluation. With an unknown evaluation type the model is returned
    untrained.
  """
  if labels is None:
    labels = y  # Backward compatibility: fall back to the notebook-level labels

  vectorizer = TfidfVectorizer()

  # Seed the stochastic models so that results are reproducible across runs
  if model_name == 'DT':
    model = DecisionTreeClassifier(random_state=1)  # Decision tree
  elif model_name == 'KNN':
    model = KNeighborsClassifier()  # k-NN
  elif model_name == 'MLP':
    model = MLPClassifier(random_state=1)  # Neural network
  elif model_name == 'NB':
    model = MultinomialNB()  # Naïve Bayes
  else:
    model = SVC(kernel = 'linear')  # SVM

  if evaluation_type == 'split':
    # Split the raw texts into training (80%) and test (20%) *before* vectorizing,
    # so that the TF-IDF statistics are learnt from the training documents only
    corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, labels, test_size=0.2, random_state=1)
    X_train = vectorizer.fit_transform(corpus_train)
    X_test = vectorizer.transform(corpus_test)

    # Train the model
    model.fit(X_train, y_train)

    # Prediction over the test set
    predictions = model.predict(X_test)

    # Calculate the accuracy of the algorithm
    print('Accuracy: {:.2%}\n'.format(accuracy_score(y_test, predictions)))
    print('Confusion matrix:')

    # Same class ordering that confusion_matrix uses by default, made explicit
    # so that it can also be used to label the axes of the heatmap
    class_names = sorted(set(y_test) | set(predictions))
    plt.figure(figsize=(8, 6))
    sns.heatmap(confusion_matrix(y_test, predictions, labels=class_names),
                annot=True, fmt='d',  # Integer counts instead of scientific notation
                linewidth=3, xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted class')
    plt.ylabel('True class')
    plt.yticks(rotation=0)
    plt.show()
  elif evaluation_type == 'cv':
    # cross_val_score fits clones of the pipeline, so the vectorizer is re-fit
    # on the training folds only and `vectorizer`/`model` stay untouched here
    scores = cross_val_score(make_pipeline(vectorizer, model), corpus, labels, cv=5)
    print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

    # Train on the whole corpus so that the returned model can be used to predict
    model.fit(vectorizer.fit_transform(corpus), labels)
  else:
    print('Unknown evaluation type')
    vectorizer.fit(corpus)  # Keep returning a fitted vectorizer, as before

  return model, vectorizer

In [None]:
# Possible values for model name: DT, KNN, MLP, NB, SVM
# Possible values for evaluation type: split, cv
# Returns the trained model, for further predictions

# Train on the normalised abstracts, so that the preprocessing step is actually used
model_repec, vectorizer_repec = classify(corpus_normalised, 'SVM', 'split')

In [None]:
# Prediction

# Human-readable description of each JEL category
jel_descriptions = {
  'F': 'International Economics',
  'I': 'Health, Education, and Welfare',
  'R': 'Urban, Rural, Regional, Real Estate, and Transportation Economics',
  'M': 'Business Administration and Business Economics | Marketing | Accounting | Personnel Economics',
}

new_input = ['My car is a Ferrari']
# The text has to be turned into numbers with the same vectorizer used for training
new_features = vectorizer_repec.transform(new_input)
# predict() returns one label per input document: take the only one (F, I, R or M)
label = model_repec.predict(new_features)[0]

print(jel_descriptions.get(label, 'Unknown class'))

### Exercise

In [None]:
# Test with titles instead of abstracts
# Tip: corpus = data['title']

# Try using different n-gram sizes
# Tip: vectorizer = TfidfVectorizer(ngram_range=(1,2))  # Uses 1-grams and 2-grams

# Try using TF weighting schema
# Tip: vectorizer = CountVectorizer()

## Example 2: sentiment analysis

In [None]:
# Take a look at the first lines of the training corpus

!head cell_phones.csv

In [None]:
# Loading data from file
# NOTE: this replaces the `data` DataFrame loaded in Example 1

data = pd.read_csv('cell_phones.csv')  # Load the data from the file into a DataFrame
data  # Last expression of the cell: display the loaded DataFrame

In [None]:
# Extract the comments and labels
# NOTE: `corpus` and `y` from Example 1 are overwritten here

corpus = data['content']  # Store the comments (the input texts)
# NOTE: `classify` reads this module-level `y`, so this cell must run before calling it
y = data['opinion']  # Store positive or negative labels

In [None]:
# Plot how many opinions there are per class

fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(x=y, ax=ax)
plt.show()

# Meaning of the labels shown on the x axis:
# POS: positive opinion
# NEG: negative opinion

In [None]:
# Preprocessing
# Re-use the 'normalise' function (punctuation and stopword removal, lowercasing)
# NOTE: this overwrites the `corpus_normalised` computed in Example 1

corpus_normalised = corpus.map(normalise)
corpus_normalised  # Display the normalised texts

In [None]:
# Use the 'classify' function as before
# Possible values for model name: DT, KNN, MLP, NB, SVM
# Possible values for evaluation type: split, cv

# Train on the normalised comments, so that the preprocessing step is actually used
model_phones, vectorizer_phones = classify(corpus_normalised, 'SVM', 'split')

In [None]:
# Prediction

# Human-readable description of each opinion label
opinion_descriptions = {
  'POS': 'Positive opinion',
  'NEG': 'Negative opinion',
}

new_input = ['I love this phone!!']
# The text has to be turned into numbers with the same vectorizer used for training
new_features = vectorizer_phones.transform(new_input)
# predict() returns one label per input document: take the only one (POS or NEG)
label = model_phones.predict(new_features)[0]

print(opinion_descriptions.get(label, 'Unknown class'))

### Exercise

In [None]:
# Create a wordcloud of positive opinions
# Tip: data = data[data['opinion'] == 'POS']
# Tip: from wordcloud import WordCloud  # Required library

In [None]:
# Create a wordcloud of negative opinions

# References

* [RePEC](http://www.repec.org/)
* [JEL Classification System](https://www.aeaweb.org/econlit/jelCodes.php?view=jel)
