# Overview
**Perform text mining**
- Construct a climate change regulatory news index. 
  - Use a `search engine` to compute numerical statistic known as "term frequency - inverse document frequency) or `TF-IDF`
  - Reflect how `important a word is` to a document in a collection of `corpus` from Wall Street Journal (WSJ).

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
import re
from matplotlib import pyplot as plt
import seaborn as sns
sns.set(context='notebook', style='white')

# Part 1

## Create a term-document matrix

In [None]:
# Create a corpus of documents, each document a paragraph or a sentence
corpus = ['The dog ate a sandwich, the wizard transfigured a sandwich, and I ate a sandwich']

# Stop words, stemming, and tokenizing
vectorizer = TfidfVectorizer(stop_words='english', use_idf=True, ngram_range=(1, 3))

# Compute the term-document matrix with n-grams
tdm = vectorizer.fit_transform(corpus)

# The term-document matrix has a word and a document
print(f"Term-document matrix: \n{tdm}")

In [7]:
# The row index is the word
print(f"Row index: \n{vectorizer.get_feature_names()}")

Row index: 
['ate', 'ate sandwich', 'ate sandwich wizard', 'dog', 'dog ate', 'dog ate sandwich', 'sandwich', 'sandwich ate', 'sandwich ate sandwich', 'sandwich wizard', 'sandwich wizard transfigured', 'transfigured', 'transfigured sandwich', 'transfigured sandwich ate', 'wizard', 'wizard transfigured', 'wizard transfigured sandwich']


In [8]:
# The column index is the document
print(f"Column index: \n{vectorizer.get_feature_names()}")

Column index: 
['ate', 'ate sandwich', 'ate sandwich wizard', 'dog', 'dog ate', 'dog ate sandwich', 'sandwich', 'sandwich ate', 'sandwich ate sandwich', 'sandwich wizard', 'sandwich wizard transfigured', 'transfigured', 'transfigured sandwich', 'transfigured sandwich ate', 'wizard', 'wizard transfigured', 'wizard transfigured sandwich']


In [34]:
# Create a dataframe with the TDM
tdm_df = pd.DataFrame(tdm.toarray(), columns=vectorizer.get_feature_names(), index=['doc_1'])
tdm_df.to_csv('tdm.csv')

In [24]:
tdm_df['wizard']

doc_1    0.179605
Name: wizard, dtype: float64

## Extract the TF-IDF for a word in a document

In [13]:
# The term frequency is the number of times a word appears in a document
print(f"The term frequency for 'sandwich' is {tdm_df.loc['doc_1']['sandwich']}")

The term frequency for 'sandwich' is 0.5388159060803247


In [16]:
# The inverse document frequency is the number of documents divided by the number of documents that contain the word
print(f"The inverse document frequency for 'sandwich' is {vectorizer.idf_[vectorizer.vocabulary_['sandwich']]}")

The inverse document frequency for 'sandwich' is 1.0


In [18]:
# The TF-IDF is the product of the TF and IDF
print(f"The TF-IDF for 'sandwich' is {tdm_df.loc['doc_1']['sandwich'] * vectorizer.idf_[vectorizer.vocabulary_['sandwich']]}")

The TF-IDF for 'sandwich' is 0.5388159060803247
