# Text and Tabular Data

In [13]:
import numpy as np
import pandas as pd
import glob
import os
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ethan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


## Term Frequency-Inverse Document Frequency (TF-IDF) Model

* It selects documents that have the query words appearing many times, where rarely occurring words are given greater importance
* <code>idf<sub>i</sub> = log(m / df<sub>i</sub>)</code>, where `m` is the number of documents and <code>df<sub>i</sub></code> is the document frequency (number of documents containing <code>w<sub>i</sub></code>)
* A document that contains the words <code><w<sub>i</sub>, ..., w<sub>k</sub>></code> is represented as vector. Note that <code>w<sub>i</sub> = tf<sub>i</sub> * idf<sub>i</sub></code>
* A word is important if it appears in the document multiple times, but doesn't appear in many documents
* <code>weight<sub>i</sub> = f<sub>i</sub> / max(f<sub>1</sub>,...,f<sub>k</sub>) * log<sub>2</sub>(m / df<sub>i</sub>)</code>
* Compute the vector for each document and each query, then apply TF-IDF to normalize these vectors (the normalization formula is different for documents and queries), then compute the distance between each query vector and each document vector and use the result for ranking

### Modification for Queries

* <code>weight<sub>i</sub> =(0.5 + 0.5 * f<sub>i</sub> / max(f<sub>1</sub>,...,f<sub>k</sub>)) * log<sub>2</sub>(m / df<sub>i</sub>)</code> for each word in the query
* `weight = 0` if word doesn't appear in the query
* `m` is the number of documents
* <code>df<sub>i</sub></code> is the number of documents that contain the word <code>w<sub>i</sub></code>
* `df = 0` if the term does not appear in any document
* The formula helps make the difference between a term that appears once in the query versus a word that appears 10 times less extreme

### Example

![title](./pic/tfidfexample.png)

`weight("example", d1) = 0`

<code>weight("another", d2) = 2/3 * log<sub>2</sub>(2/1) = 2/3</code>

`q1 = "example of a simple example"`

<code>weight("simple", q1) = (0.5 + 0.5 * (1/2)) * log<sub>2</sub>(2/1) = 0.75</code>


## Getting Documents

In [7]:
document_files = glob.glob('./text/trump/*.txt')
docs = []
for file in document_files:
    with open(file,'r') as f:
        name = os.path.basename(file.split('.')[1])
        docs.append({'document':name,'lines':f.readlines()})
df = pd.DataFrame(docs)
df['text'] = df.apply(lambda row: ' '.join(row['lines']), axis=1)
display(df)

Unnamed: 0,document,lines,text
0,speech_0,[Remarks Announcing Candidacy for President in...,Remarks Announcing Candidacy for President in ...
1,speech_1,[Remarks at the AIPAC Policy Conference in Was...,Remarks at the AIPAC Policy Conference in Wash...
2,speech_10,[Remarks at the Washington County Fair Park in...,Remarks at the Washington County Fair Park in ...
3,speech_11,[Remarks at the Charlotte Convention Center in...,Remarks at the Charlotte Convention Center in ...
4,speech_12,"[Remarks at Luedecke Arena in Austin, Texas\n,...","Remarks at Luedecke Arena in Austin, Texas\n O..."
5,speech_13,"[Remarks to the American Legion in Cincinnati,...","Remarks to the American Legion in Cincinnati, ..."
6,speech_14,"[Remarks at the Roberts Centre in Wilmington, ...","Remarks at the Roberts Centre in Wilmington, O..."
7,speech_15,[Remarks at Great Faith International Ministri...,Remarks at Great Faith International Ministrie...
8,speech_16,[Remarks at a Rally at the Greenville Conventi...,Remarks at a Rally at the Greenville Conventio...
9,speech_17,[Remarks at the Union League of Philadelphia i...,Remarks at the Union League of Philadelphia in...


## Parsing the Documents

In [15]:
df['words'] = df.text.str.strip().str.split('[\W]+') # removes leading/trailing space, split returns an array of words
stop_words = list(stopwords.words('english'))
result = []
for i in range(0,len(df)):
    for word in df.iloc[i]['words']:
        if (word.lower() not in stop_words and word != ''):
            result.append((df.iloc[i]['document'], word.lower()))
words = pd.DataFrame(result, columns=['document','word'])
display(words)

Unnamed: 0,document,word
0,speech_0,remarks
1,speech_0,announcing
2,speech_0,candidacy
3,speech_0,president
4,speech_0,new
...,...,...
75517,speech_9,bless
75518,speech_9,thank
75519,speech_9,thank
75520,speech_9,thank


## Computing the Frequencies

* `groupby` creates a group for each document
* `value_counts` counts the number of occurrences of each word in a group
* `to_frame` converts Series to a DataFrame
* `rename` sets the attributes
* MultiIndex includes document and word

In [24]:
counts = words.groupby('document')['word'].value_counts().to_frame().rename(columns={'word':'frequency'})
display(count)

Unnamed: 0_level_0,Unnamed: 1_level_0,word
document,word,Unnamed: 2_level_1
speech_0,â,63
speech_0,trump,57
speech_0,people,55
speech_0,know,46
speech_0,going,44
...,...,...
speech_9,wonderful,1
speech_9,workers,1
speech_9,wounding,1
speech_9,wrong,1


## Computing TF

In [25]:
max_frequency = counts.groupby('document').max().rename(columns={'frequency':'maxFreq'})
tf = counts.join(max_frequency)
tf['tf'] = tf['frequency'] / tf['maxFreq']
display(tf)

Unnamed: 0_level_0,Unnamed: 1_level_0,frequency,maxFreq,tf
document,word,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
speech_0,â,63,63,1.000000
speech_0,trump,57,63,0.904762
speech_0,people,55,63,0.873016
speech_0,know,46,63,0.730159
speech_0,going,44,63,0.698413
...,...,...,...,...
speech_9,wonderful,1,46,0.021739
speech_9,workers,1,46,0.021739
speech_9,wounding,1,46,0.021739
speech_9,wrong,1,46,0.021739


## Computing IDF

In [27]:
doc_count = df['document'].nunique()
doc_freq = words.groupby('word')['document'].nunique().to_frame().rename(columns={'document':'df'})
print(doc_count)
display(doc_freq)

56


Unnamed: 0_level_0,df
word,Unnamed: 1_level_1
0,3
00,1
000,50
05,1
055,1
...,...
yuma,1
zero,5
zone,2
zones,3


## Computing TF-IDF

In [28]:
doc_freq['idf'] = np.log2(doc_count / doc_freq['df'].values)
result = tf.join(doc_freq)
result['tfidf'] = result['tf'] * result['idf']
display(result)

Unnamed: 0_level_0,Unnamed: 1_level_0,frequency,maxFreq,tf,df,idf,tfidf
document,word,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
speech_0,â,63,63,1.000000,53,0.079434,0.079434
speech_0,trump,57,63,0.904762,46,0.283793,0.256765
speech_0,people,55,63,0.873016,56,0.000000,0.000000
speech_0,know,46,63,0.730159,40,0.485427,0.354439
speech_0,going,44,63,0.698413,53,0.079434,0.055478
...,...,...,...,...,...,...,...
speech_9,wonderful,1,46,0.021739,18,1.637430,0.035596
speech_9,workers,1,46,0.021739,38,0.559427,0.012161
speech_9,wounding,1,46,0.021739,2,4.807355,0.104508
speech_9,wrong,1,46,0.021739,16,1.807355,0.039290
