In [1]:
# In this tutorial, we will see how it is possible to extract meaningful information from text data
# To this end, we are going to use the Python package "spaCy": https://spacy.io/

# We are first going to provide a brief introduction to natural language processing (NLP) with spaCy
# This includes some basic operations for cleaning and analyzing text data 
# Finally, we will practically deal with text classification, using some real-world data 

# Tutorial adapted from: https://www.dataquest.io/blog/tutorial-text-classification-in-python-using-spacy/

In [2]:
# spaCy is an open-source natural language processing library for Python
# It is designed particularly for production use, and it can help us to build applications that process massive volumes of text efficiently
# Thus, it is an "industry-level" framework for NLP

# First of all, let us install such a useful library

In [None]:
# Remove any preinstalled spaCy and install the pinned 2.1.0 release
# The rest of the notebook relies on this version's API (e.g., the "en" shortcut model name)
# NOTE(review): the pin is presumably also needed for compatibility with neuralcoref, installed in the next cell -- confirm
!pip uninstall spacy --yes
!pip install spacy==2.1.0

In [None]:
# neuralcoref is the package used later in this notebook to add coreference resolution to the spaCy pipeline
# It is removed and installed again, after the spaCy installation performed in the previous cell
# NOTE(review): no version is pinned here -- consider pinning it as done for spaCy
!pip uninstall neuralcoref --yes
!pip install neuralcoref

In [None]:
# Let us start our text analytics task with tokenization, which involves breaking the text into pieces, called tokens. For now, let us just ignore n-grams.


# Define an example text that we are going to process
# NOTE(review): the double space before the final full stop is presumably intentional, so that the whitespace filtering shown in the next cells has something to remove -- confirm
text = "One morning I shot an elephant in my pajamas. How he got into my pajamas I'll never know  ."

import spacy

# Load the English text processing pipeline
# In spaCy, all tasks are handled by means of pipelines (models)
# A pipeline takes an input text and applies several functions to it, in sequence; these are typically designed (and trained, according to different methodologies) for a specific language
# A user can define a custom pipeline, or rely on a predefined one, as we do here with "en"
# In any case, default pipelines can also be customized

# The following instruction downloads one such default model

!python -m spacy download en


In [None]:
# Instantiate the pipeline from the model downloaded above
nlp = spacy.load('en')

print("This are the components of the nlp model:", nlp.pipe_names)

# Calling "nlp" on a text yields a sequence of token objects, each with a set of properties
my_doc = nlp(text)

# Collect the textual content of every token
# (a token is not a string, but a specific object used within spaCy, hence the ".text")
token_list = [token.text for token in my_doc]
print(token_list)

# The tokens, however, still include whitespace and punctuation characters...

In [None]:
# Build the token list from my_doc once more, this time skipping punctuation and whitespace tokens
token_list = [
  tok.text
  for tok in my_doc
  if not tok.is_punct and not tok.is_space
]
print(token_list)

In [None]:
# Now, we want to remove stopwords, i.e., those very frequent words that are not very useful for analysis purposes
# spaCy already includes a list of English stopwords that we can inspect

# Import the stop words for the English language
from spacy.lang.en.stop_words import STOP_WORDS

# Print the total number of stop words
print('Number of stop words: %d' % len(STOP_WORDS))

# Print ten stop words, as the label states
# The collection is sorted before slicing, so the same ten words are shown on every run,
# regardless of the iteration order of STOP_WORDS
print('First ten stop words: %s' % sorted(STOP_WORDS)[:10])

In [None]:
# Extend the previous filter so that stopwords are dropped from our example text as well
# Note that here the token objects themselves are kept, not their text
token_list = [
  tok
  for tok in my_doc
  if not any((tok.is_punct, tok.is_space, tok.is_stop))
]
print(token_list)

In [None]:
# Then we come to the text normalization part
# Here, we want to transform each word into its root form (lemma)
# This process is called lemmatization; the lemmas are already available on the tokens produced by the "nlp" function

# Keep the lemma of every token that is neither punctuation, whitespace, nor a stopword
token_list = [
  tok.lemma_
  for tok in my_doc
  if not tok.is_punct and not tok.is_space and not tok.is_stop
]
print(token_list)

In [None]:
# POS tagging
# Part-Of-Speech (POS) tagging associates each token with its grammatical role within the sentence
# This may be useful for some kinds of text analytics tasks
# Again, the POS tags are already available on the tokens produced by our very useful "nlp" function


# Pair each retained token with its lemma and its POS tag
token_list = [
  (tok, tok.lemma_, tok.pos_)
  for tok in my_doc
  if not (tok.is_punct or tok.is_space or tok.is_stop)
]
print(token_list)

In [None]:
# Another cool thing regarding spaCy is that it also performs NER
# NER stands for Named Entity Recognition
# NER is a more advanced form of language processing that identifies important elements in the text,
# like places, people, organizations, and so on
# This is really helpful for extracting information from text, since you can quickly pick out
# important topics or identify key sections of the text

# We will change our example text now

nytimes = nlp("New York City on Tuesday declared a public health emergency and ordered mandatory measles vaccinations amid an outbreak, becoming the latest national flash point over refusals to inoculate against dangerous diseases. At least 285 people have contracted measles in the city since September, mostly in Brooklyn’s Williamsburg neighborhood. The order covers four Zip codes there, Mayor Bill de Blasio (D) said Tuesday. The mandate orders all unvaccinated people in the area, including a concentration of Orthodox Jews, to receive inoculations, including for children as young as 6 months old. Anyone who resists could be fined up to $1,000.")

# For each recognized entity: the span itself, its label as a string, and its label as an integer ID
entities = [(ent, ent.label_, ent.label) for ent in nytimes.ents]
display(entities)

In [None]:
# For visualization purposes, we can also import displaCy, which is part of the spaCy package
from spacy import displacy

# Render the entities recognized in the text directly inside the notebook
displacy.render(nytimes, style="ent", jupyter=True)

In [None]:
# Dependency Parsing
# Dependency parsing is a language processing technique that allows us to better determine
# the meaning of a sentence by analyzing how it is constructed
# Specifically, we want to determine how the individual words relate to each other

# Let us change the example text again
analyzed_text = nlp("I must confess, I was born at a very early age.")

# Draw the dependency structure of the sentence inside the notebook
displacy.render(analyzed_text, style="dep", jupyter=True)


In [None]:
# Sometimes the same subject or object is referred to multiple times in the text, in different ways
# This can be the case, for instance, with pronouns
# The task of recognizing all the occurrences of the same entity in the text is called Coreference Resolution

# Such a task is not carried out by the "nlp" function by default, but we can add it to the pipeline
# Here, a freshly loaded pipeline is extended with neuralcoref
import spacy
import neuralcoref

nlp = spacy.load('en')
neuralcoref.add_to_pipe(nlp)

# Another text...
doc = nlp('Angela lives in Boston. She is quite happy in that city.')

# For every named entity, print the coreference cluster attached to it
for entity in doc.ents:
    print(entity._.coref_cluster)

In [None]:
# spaCy has also the support for word vectors, or word embeddings
# The idea is that each word gets mapped into a latent, multidimensional space
# The mapping from words to n-dimensional vectors somehow preserves semantic content of words
# For instance, synonims should end up close in the latent space
# The latent representation also allows us to capture many semantic connections within words

# We need to download and employ another model for the English language to to that
!python -m spacy download en_core_web_md

nlp = spacy.load('en')

In [None]:
# Let us get the embedding of the word "dog"

dog = nlp('dog')

# Show the dimensionality of the vector first, then the vector itself
print(dog.vector.shape, dog.vector, sep="\n")

In [None]:
# We can now define a similarity function between arrays
def cosine_similarity(vec1, vec2):
  """Return the cosine similarity of vec1 and vec2, i.e., one minus their cosine distance."""
  from scipy.spatial.distance import cosine
  distance = cosine(vec1, vec2)
  return 1 - distance

# Compare the dog vector with those of a few other animals
# We observe that the wolf vector is the closest one to the dog vector, followed by those of bear and elephant
for animal in ('wolf', 'bear', 'elephant'):
  print(cosine_similarity(dog.vector, nlp(animal).vector))


In [20]:
# Now that we have seen some stuff regarding spaCy, we are going to consider a text classification task

# Let us now load our reference dataset, which contains reviews of Amazon's Alexa smart home speaker
# The data is available at: https://github.com/dslab-uniud/teaching/blob/main/courses/Data%20Management%20for%20Big%20Data/2021-2022/amazon_alexa.tsv 

# Download the TSV file, then, upload it into the Colab
# To do that, you can just drag the file onto the "file" area of the Colab

In [None]:
# Now, let us read the content of the file
# To this end, we rely on Python's pandas library
import pandas as pd

# Also, for our analysis tasks we are going to need some sklearn libraries
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


# Read the tab-separated file and convert the "date" column to proper datetime values
dataset = (
    pd.read_csv("./amazon_alexa.tsv", sep="\t")
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
display(dataset)

# Ok, our dataset is composed of 3150 reviews, each described by 5 columns:
#  - rating denotes the rating each user gave the Alexa (out of 5)
#  - date indicates the date of the review
#  - variation describes which model the user reviewed
#  - verified_reviews contains the text of each review
#  - feedback contains a sentiment label, with 1 denoting positive sentiment (the user liked it) and 0 denoting negative sentiment (the user didn’t)


In [22]:
# Predictor: the text of each review; label: the sentiment feedback
X, y = dataset['verified_reviews'].values, dataset['feedback'].values

# Hold out 30% of the data for testing; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [23]:
# Now, for text pre-processing purposes, we load our spaCy model again, this time with the "ner" component disabled
nlp = spacy.load('en', disable=['ner'])

# Custom tokenizer function, inspired by what we have seen above
def spacy_tokenizer(sentence):
  """Return the lower-cased lemmas of the tokens of `sentence`.

  Only alphabetic tokens that are neither punctuation, whitespace,
  nor stopwords are kept. The module-level `nlp` pipeline is used.
  """
  return [
    str(tok.lemma_).lower()
    for tok in nlp(sentence)
    if not tok.is_punct and not tok.is_space and not tok.is_stop and tok.is_alpha
  ]


In [None]:
# CountVectorizer converts a given text into a matrix of word occurrences, giving it a structured representation
# It is also capable of handling n-grams instead of single words, by means of the parameter ngram_range=(lower_bound, upper_bound)
# We also ignore words that appear in less than 1% of the reviews (min_df)
# A float min_df is interpreted as a proportion of the documents the vectorizer is fitted on,
# so the threshold is 1% of the training reviews and does not depend on the size of the whole dataset

vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1, 1), min_df=0.01)

# Fit the vectorizer on training set data only, and vectorize the training data in the same pass
# (this avoids running the spaCy tokenizer twice over the training reviews)
X_train_vect = vectorizer.fit_transform(X_train)

# Vectorize the test data with the vocabulary learned from the training data
X_test_vect = vectorizer.transform(X_test)

In [None]:
# The vectorizer builds a list of attributes (the retained words)
# Each sentence is going to be represented by the number of times these words occur in it

feature_names = vectorizer.get_feature_names_out()
print(feature_names)

In [None]:
# We can also visualize the transformed dataset

X_array = X_train_vect.toarray()

# get_feature_names_out() is used here, as in the previous cell, to label the columns
# (the older get_feature_names() is no longer available in recent scikit-learn releases)
display(pd.DataFrame(data=X_array, columns=vectorizer.get_feature_names_out()).iloc[30:35])

print(X_train[31]) # notice how both added and adding have been counted as occurrences of add

In [None]:
# Build the logistic regression model to predict the sentiment of a review
from sklearn.metrics import f1_score

classifier = LogisticRegression()
classifier.fit(X_train_vect, y_train)

# Predict the sentiment of the held-out reviews
preds = classifier.predict(X_test_vect)

# f1_score expects the ground truth labels first and the predictions second
print(f1_score(y_test, preds))

In [28]:
# Now you can try for example the following:
#   - what happens if you consider n-grams instead of single words?
#   - and if you replace CountVectorizer with TfidfVectorizer? https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
#   - can we try to predict the rating instead of the feedback? Maybe through a regression task?