<a href="https://colab.research.google.com/github/d-tomas/text-mining/blob/main/notebooks/lecture_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Day 2**: Document Representation

## Initial setup

In [None]:
# Import the required libraries

import gensim  # Word embedding models
from gensim.models import KeyedVectors  # Load pre-trained word embedding models
import matplotlib.pyplot as plt  # Display word clouds
import nltk  # NLP library
from nltk.stem.porter import *  # Stemmer tool
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer  # Term by document matrix with TF
from sklearn.feature_extraction.text import TfidfVectorizer  # Term by document matrix with TF-IDF
import spacy  # NLP library
from wordcloud import WordCloud  # Create word clouds

# Install the SpaCy model for English texts
spacy.cli.download('en_core_web_sm')

# Load the model
nlp = spacy.load('en_core_web_sm')

# Download example text files ('news.txt', 'paper.txt' and 'repec_s.csv')
!wget https://raw.githubusercontent.com/d-tomas/text-mining/main/datasets/news.txt
!wget https://raw.githubusercontent.com/d-tomas/text-mining/main/datasets/paper.txt
!wget https://raw.githubusercontent.com/d-tomas/text-mining/main/datasets/repec_s.csv

# Download a pre-trained word embedding model with 100 billion words from Google News
!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
!gunzip GoogleNews-vectors-negative300.bin.gz  # Unzip the model

## Example 1: n-gram extraction


In [None]:
# Bigrams and trigrams computed over the whole text of 'news.txt'

with open('news.txt') as news_file:
    content = news_file.read()

words = content.split()  # Whitespace tokenisation: the text becomes a list of words
list_bigrams = nltk.ngrams(words, 2)
list_trigrams = nltk.ngrams(words, 3)

# Each section prints a ruled heading followed by one n-gram per line
sections = (
    ('---------', 'Bigrams:', list_bigrams),
    ('----------', 'Trigrams:', list_trigrams),
)
for rule, heading, ngrams in sections:
    print(rule)
    print(heading)
    print(rule)
    for ngram in ngrams:
        print(ngram)

In [None]:
# Working on the whole text ignores sentence boundaries
# Here the file is handled line by line, so no n-gram spans two different lines

with open('news.txt') as news_file:
    raw_lines = news_file.readlines()  # Get a list of lines

# Strip blanks and new line characters, keeping only the lines that still have content
content = [stripped for stripped in (raw.strip() for raw in raw_lines) if stripped]

for line in content:
    for trigram in nltk.ngrams(line.split(), 3):  # 3-grams of the current line only
        print(trigram)

### Exercise

In [None]:
# Repeat the analysis on 'paper.txt', obtaining also 4-grams and 5-grams in addition to bigrams and trigrams
# Use the first procedure (no need to consider sentence boundaries)


## Example 2: normalisation / pre-processing

In [None]:
# Pre-processing pipeline: drop punctuation, drop stopwords, lowercase and stem
# Every stage is printed so that its effect can be compared with the previous one

text = 'The Netherlands earned sweet revenge on Spain on Friday at the Fonte Nova in Salvador, hammering Spain 5-1 to put an emphatic coda on their loss in the 2010 World Cup finals.'

# Stage 1: process the text with spaCy and discard punctuation tokens
without_punct = [tok for tok in nlp(text) if not tok.is_punct]
print('No punctuation: ' + str(without_punct))

# Stage 2: discard stopwords
without_stopwords = [tok for tok in without_punct if not tok.is_stop]
print('No stopwords: ' + str(without_stopwords))

# Stage 3: keep the lowercased text of each remaining token
lowercased = [tok.lower_ for tok in without_stopwords]
print('Lowercased: ' + str(lowercased))

# Stage 4: reduce every word to its stem with the Porter stemmer
stemmer = PorterStemmer()
document = [stemmer.stem(word) for word in lowercased]
print('Stems: ' + str(document))

### Exercise

In [None]:
# Repeat the previous analysis on the content of 'paper.txt'


## Example 3: weighting schemes

In [None]:
# Build the term by document matrix using the TF (term frequency) weighting scheme

corpus = ['I do not like this restaurant', 'I like this restaurant very much', 'I think it is a very very bad place', 'I love this place']

# Unigrams: one column per distinct word, one row per document
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() (available
# since scikit-learn 1.0) is its replacement and returns the vocabulary as an array
print(vectorizer.get_feature_names_out())
print(X.toarray())
print(X.shape)

# Bigrams: ngram_range=(2, 2) makes every column a pair of consecutive words
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))  # Extract bigrams
X2 = vectorizer2.fit_transform(corpus)
print(vectorizer2.get_feature_names_out())
print(X2.toarray())
print(X2.shape)

In [None]:
# Build the term by document matrix using the TF-IDF weighting scheme
# Uses the 'corpus' list of example sentences defined in the TF cell

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() (available
# since scikit-learn 1.0) is its replacement and returns the vocabulary as an array
print(vectorizer.get_feature_names_out())
print(X.toarray())

### Exercise

In [None]:
# Get the term by document matrix, using the TF weighting scheme and trigrams on 'news.txt'

## Example 4: word embeddings

In [None]:
# Read the pre-trained vectors (word2vec binary format) from disk into memory

model = KeyedVectors.load_word2vec_format(
    './GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [None]:
# Show the vector representing a word
# Indexing the loaded model with a word gives the vector stored for that word

model['dog']

In [None]:
# Check the size of the returned vector
# len() gives the number of components of the vector obtained for the word
# NOTE(review): presumably 300, going by 'negative300' in the model file name - confirm

len(model['dog'])

In [None]:
# Retrieve the five words whose vectors are closest to the one of a given word

model.most_similar(positive=['monkey'], topn=5)

In [None]:
# Analogy: 'Paris' is to 'France' as 'Madrid' is to... (France - Paris + Madrid)
# Words in 'positive' contribute positively and words in 'negative' negatively to the query
# topn=1 keeps only the best candidate

model.most_similar(positive=['Madrid', 'France'], negative=['Paris'], topn=1)

In [None]:
# Ditch unrelated terms: find the word of the list that does not go with the others
# 'Whisky' was previously misspelled as 'Whysky', so the intended word was not the one compared

model.doesnt_match(['Wine', 'Beer', 'Coke', 'Whisky'])

In [None]:
# Similarity score between two words according to the embedding model
# Beware of algorithmic bias!! Pre-trained embeddings can reproduce stereotypes
# present in the texts they were trained on

model.similarity('woman', 'housework')

## Example 5: word cloud

In [None]:
# The file 'repec_s.csv' contains 5,000 references to scientific papers in the field of Economics
# The following cells read its 'title', 'author', 'abstract' and 'jel' columns

data = pd.read_csv('repec_s.csv')  # Store the data in a DataFrame
data  # Last expression of the cell, so the notebook displays the DataFrame

In [None]:
# Create a word cloud with the titles

# Keep only the titles that are present: a missing value (NaN) would make join() fail
list_lines = data['title'].dropna().astype(str)  # Store all the lines of the titles

# Dedicated names are used instead of 'corpus', which holds the example sentences of
# Example 3; overwriting it would break those cells if they are run again
titles_doc = nlp(' '.join(list_lines))  # Concatenate all the titles in one string and process it with spaCy
tokens = [w.lower_ for w in titles_doc if not (w.is_space or w.is_punct or w.is_stop)]  # Lowercase removing blanks, punctuation and stopwords
titles_text = ' '.join(tokens)  # Join again all the words in one string

wordcloud = WordCloud(width=800, height=800, background_color='white', min_font_size=10).generate(titles_text)  # Create word cloud

# Configuration of the word cloud display
plt.figure(figsize=(10, 10), facecolor=None)
plt.imshow(wordcloud)
plt.axis('off')  # A word cloud has no meaningful axes
plt.tight_layout(pad=0)
plt.show()

## Example 6: authors and categories analysis

In [None]:
# List of most prolific authors

# Multiple authors are separated by '|': split every entry and flatten the result so that
# each author occurrence becomes one row
# dropna() skips papers without author (iterating over a missing value would raise a TypeError)
series_authors = data['author'].dropna().str.split('|').explode().reset_index(drop=True)

list_authors = series_authors.tolist()  # List with all the occurrences of authors

series_authors.value_counts().head(20)  # Top 20 authors

In [None]:
# Create a word cloud with the 'abstracts' of a specific author
# Play around! Try a different author

author = 'Henri Sterdyniak'

# regex=False matches the name literally (characters such as '.' are not treated as a pattern)
# na=False treats papers without author as "no match" instead of producing an invalid mask
is_by_author = data['author'].str.contains(author, regex=False, na=False)

# Keep only the abstracts that are present: a missing value (NaN) would make join() fail
list_lines = data[is_by_author]['abstract'].dropna().astype(str)  # Store all the abstracts of the author

# Dedicated names are used instead of 'corpus', which holds the example sentences of
# Example 3; overwriting it would break those cells if they are run again
abstracts_doc = nlp(' '.join(list_lines))  # Concatenate all the abstracts in one string and process it with spaCy
tokens = [w.lower_ for w in abstracts_doc if not (w.is_space or w.is_punct or w.is_stop)]  # Lowercase removing blanks, punctuation and stopwords
abstracts_text = ' '.join(tokens)  # Join again all the words in one string

wordcloud = WordCloud(width=800, height=800, background_color='white', min_font_size=10).generate(abstracts_text)  # Create word cloud

plt.figure(figsize=(8, 8), facecolor=None)  # Display the word cloud in an image
plt.imshow(wordcloud)
plt.axis('off')  # A word cloud has no meaningful axes
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Create a word cloud for specific JEL categories
# The JEL classification system was developed for use in the Journal of Economic Literature (JEL)
# It is a standard method of classifying scholarly literature in the field of economics
# F: International Economics
# I: Health, Education, and Welfare
# M: Business Administration and Business Economics | Marketing | Accounting | Personnel Economics
# R: Urban, Rural, Regional, Real Estate, and Transportation Economics
# Play around! Try a different JEL code

jel_code = 'F'

# Select the papers whose 'jel' value is exactly the chosen code
# Keep only the titles that are present: a missing value (NaN) would make join() fail
list_lines = data[data['jel'] == jel_code]['title'].dropna().astype(str)  # Store all the lines of the titles

# Dedicated names are used instead of 'corpus', which holds the example sentences of
# Example 3; overwriting it would break those cells if they are run again
titles_doc = nlp(' '.join(list_lines))  # Concatenate all the titles in one string and process it with spaCy
tokens = [w.lower_ for w in titles_doc if not (w.is_space or w.is_punct or w.is_stop)]  # Lowercase removing blanks, punctuation and stopwords
titles_text = ' '.join(tokens)  # Join again all the words in one string

wordcloud = WordCloud(width=800, height=800, background_color='white', min_font_size=10).generate(titles_text)  # Create word cloud

plt.figure(figsize=(8, 8), facecolor=None)  # Display the word cloud in an image
plt.imshow(wordcloud)
plt.axis('off')  # A word cloud has no meaningful axes
plt.tight_layout(pad=0)
plt.show()

# References

* [RePEC](http://www.repec.org/)
* [JEL Classification System](https://www.aeaweb.org/econlit/jelCodes.php?view=jel)
