# Opinion Spam - Text Representation

In [1]:
import os 

import numpy as np
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
# Toy corpus used throughout the notebook: five short sentences that differ
# only in their verb and their object.
sentences = [
    'I ate an apple', 'I ate an orange', 'I ate a banana',  # "ate" documents
    'I had an apple', 'I had a pear',                       # "had" documents
]

In [27]:
def text2dataframe(sentences, vectorizer):
    """Vectorize ``sentences`` and return the document-term matrix as a DataFrame.

    Parameters
    ----------
    sentences : iterable of str
        The documents to encode, one string per document.
    vectorizer : scikit-learn style text vectorizer
        Any object exposing ``fit_transform`` and ``get_feature_names_out``
        (e.g. ``CountVectorizer``, ``TfidfVectorizer``). It is fitted on
        ``sentences`` as a side effect.

    Returns
    -------
    pandas.DataFrame
        One row per document (index named ``document_id``) and one column
        per feature learned by the vectorizer.
    """
    # fit_transform learns the vocabulary and encodes the corpus in a single
    # pass; toarray() yields a plain ndarray instead of the numpy.matrix that
    # todense() returns.
    matrix = vectorizer.fit_transform(sentences).toarray()

    df = pd.DataFrame(
        matrix,
        columns=vectorizer.get_feature_names_out().tolist(),
    )
    df.index.name = 'document_id'

    return df

In [28]:
# Show the corpus, one sentence per line.
print(*sentences, sep='\n')

I ate an apple
I ate an orange
I ate a banana
I had an apple
I had a pear


In [29]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts using the vectorizer's default settings.
vectorizer = CountVectorizer()
df = text2dataframe(sentences, vectorizer)

# Grey-scale heat map computed over the whole table (axis=None).
df.style.background_gradient(cmap='Greys', axis=None, low=0, high=1)

Unnamed: 0_level_0,an,apple,ate,banana,had,orange,pear
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1,1,1,0,0,0,0
1,1,0,1,0,0,1,0
2,0,0,1,1,0,0,0
3,1,1,0,0,1,0,0
4,0,0,0,0,1,0,1


**[Question]** 

Why were "I" and "a" ignored?

In [39]:
# Try to answer here ... 

## Cosine Similarity 

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows of `df`, labelled with the
# original sentences on both axes.
pd.DataFrame(
    cosine_similarity(df, df), columns=sentences, index=sentences
).style.background_gradient(cmap='YlGn', axis=None, low=0, high=1)

Unnamed: 0,I ate an apple,I ate an orange,I ate a banana,I had an apple,I had a pear
I ate an apple,1.0,0.666667,0.408248,0.666667,0.0
I ate an orange,0.666667,1.0,0.408248,0.333333,0.0
I ate a banana,0.408248,0.408248,1.0,0.0,0.0
I had an apple,0.666667,0.333333,0.0,1.0,0.408248
I had a pear,0.0,0.0,0.0,0.408248,1.0


## n-grams 

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# ngram_range=(2, 2): count word bigrams only (no single words).
vectorizer = CountVectorizer(ngram_range=(2, 2))
df = text2dataframe(sentences, vectorizer)

# Grey-scale heat map computed over the whole table (axis=None).
df.style.background_gradient(cmap='Greys', axis=None, low=0, high=1)

Unnamed: 0_level_0,an apple,an orange,ate an,ate banana,had an,had pear
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,0,1,0,0,0
1,0,1,1,0,0,0
2,0,0,0,1,0,0
3,1,0,0,0,1,0
4,0,0,0,0,0,1


## Characters vs Words

In [45]:
from sklearn.feature_extraction.text import CountVectorizer

# analyzer='char' with ngram_range=(1, 1): count individual characters
# instead of words.
vectorizer = CountVectorizer(ngram_range=(1, 1), analyzer='char')
df = text2dataframe(sentences, vectorizer)

# Grey-scale heat map computed over the whole table (axis=None).
df.style.background_gradient(cmap='Greys', axis=None, low=0, high=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,d,e,g,h,i,l,n,o,p,r,t
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,3,3,0,0,2,0,0,1,1,1,0,2,0,1
1,3,3,0,0,2,1,0,1,0,2,1,0,1,1
2,3,5,1,0,1,0,0,1,0,2,0,0,0,1
3,3,3,0,1,1,0,1,1,1,1,0,2,0,0
4,3,3,0,1,1,0,1,1,0,0,0,1,1,0


## TF-IDF

**TF** means `term-frequency` while **IDF** means `inverse document-frequency`

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same corpus, but weighted with TF-IDF instead of raw counts.
vectorizer = TfidfVectorizer()
df = text2dataframe(sentences, vectorizer)

# Grey-scale heat map computed over the whole table (axis=None).
df.style.background_gradient(cmap='Greys', axis=None, low=0, high=1)

Unnamed: 0_level_0,an,apple,ate,banana,had,orange,pear
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.538283,0.648463,0.538283,0.0,0.0,0.0,0.0
1,0.48624,0.0,0.48624,0.0,0.0,0.726044,0.0
2,0.0,0.0,0.556451,0.830881,0.0,0.0,0.0
3,0.506204,0.609818,0.0,0.0,0.609818,0.0,0.0
4,0.0,0.0,0.0,0.0,0.627914,0.0,0.778283


In [47]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the TF-IDF rows currently held in `df`,
# labelled with the original sentences on both axes.
pd.DataFrame(
    cosine_similarity(df, df), columns=sentences, index=sentences
).style.background_gradient(cmap='YlGn', axis=None, low=0, high=1)

Unnamed: 0,I ate an apple,I ate an orange,I ate a banana,I had an apple,I had a pear
I ate an apple,1.0,0.523469,0.299528,0.667925,0.0
I ate an orange,0.523469,1.0,0.270569,0.246137,0.0
I ate a banana,0.299528,0.270569,1.0,0.0,0.0
I had an apple,0.667925,0.246137,0.0,1.0,0.382913
I had a pear,0.0,0.0,0.0,0.382913,1.0


In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF again, this time dropping scikit-learn's built-in English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
df = text2dataframe(sentences, vectorizer)

# Grey-scale heat map computed over the whole table (axis=None).
df.style.background_gradient(cmap='Greys', axis=None, low=0, high=1)

Unnamed: 0_level_0,apple,ate,banana,orange,pear
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.769447,0.638711,0.0,0.0,0.0
1,0.0,0.556451,0.0,0.830881,0.0
2,0.0,0.556451,0.830881,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0


In [49]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the stop-word-filtered TF-IDF rows in
# `df`, labelled with the original sentences on both axes.
pd.DataFrame(
    cosine_similarity(df, df), columns=sentences, index=sentences
).style.background_gradient(cmap='YlGn', axis=None, low=0, high=1)

Unnamed: 0,I ate an apple,I ate an orange,I ate a banana,I had an apple,I had a pear
I ate an apple,1.0,0.355411,0.355411,0.769447,0.0
I ate an orange,0.355411,1.0,0.309637,0.0,0.0
I ate a banana,0.355411,0.309637,1.0,0.0,0.0
I had an apple,0.769447,0.0,0.0,1.0,0.0
I had a pear,0.0,0.0,0.0,0.0,1.0


# Comparing Apples and Oranges

## Word Embedding

In [130]:
# Hand-written toy "embeddings": each word maps to a list of three floats.
vector_data = dict(
    dog=[0.9, 0.0, 0.0],
    cat=[0.8, 0.0, 0.0],
    orange=[0.0, 0.5, 0.5],
    apple=[0.0, 0.8, 0.0],
    blue=[0.0, 0.0, 0.8],
    red=[0.0, 0.0, 0.9],
)

In [137]:
# Wrap each toy vector in a list so it becomes a single-row 2-D array, which
# is the input shape cosine_similarity works on.
cosine_similarity(
    np.array([vector_data["dog"]]),
    np.array([vector_data["cat"]]),
).round(4)[0, 0]

1.0

In [139]:
# Wrap each toy vector in a list so it becomes a single-row 2-D array, which
# is the input shape cosine_similarity works on.
cosine_similarity(
    np.array([vector_data["dog"]]),
    np.array([vector_data["orange"]]),
).round(4)[0, 0]

0.0

In [140]:
# Wrap each toy vector in a list so it becomes a single-row 2-D array, which
# is the input shape cosine_similarity works on.
cosine_similarity(
    np.array([vector_data["red"]]),
    np.array([vector_data["orange"]]),
).round(4)[0, 0]

0.7071

In [141]:
# Wrap each toy vector in a list so it becomes a single-row 2-D array, which
# is the input shape cosine_similarity works on.
cosine_similarity(
    np.array([vector_data["blue"]]),
    np.array([vector_data["orange"]]),
).round(4)[0, 0]

0.7071

## SpaCy Word Vectors

In [142]:
import spacy

In [143]:
# Load the large English pipeline; the cells below read its word vectors.
nlp = spacy.load('en_core_web_lg')

# Words to compare (note that "blue" is included alongside the fruit words).
fruits = ["apple", "apples", "orange", "pear", "fruit", "blue"]

# Map each word to the `.vector` of the object returned by running the
# pipeline on that single word.
tokens = {word: nlp(word).vector for word in fruits}

In [144]:
# reshape(1, -1) turns each vector into a single-row matrix and infers the
# width, so the cell does not depend on the model using 300-dimensional vectors.
cosine_similarity(
    tokens['apple'].reshape(1, -1),
    tokens['apple'].reshape(1, -1),
).round(2)[0, 0]

1.0

In [145]:
# reshape(1, -1) turns each vector into a single-row matrix and infers the
# width, so the cell does not depend on the model using 300-dimensional vectors.
cosine_similarity(
    tokens['apple'].reshape(1, -1),
    tokens['apples'].reshape(1, -1),
).round(2)[0, 0]

0.75

In [146]:
# reshape(1, -1) turns each vector into a single-row matrix and infers the
# width, so the cell does not depend on the model using 300-dimensional vectors.
cosine_similarity(
    tokens['apple'].reshape(1, -1),
    tokens['orange'].reshape(1, -1),
).round(2)[0, 0]

0.56

In [147]:
# reshape(1, -1) turns each vector into a single-row matrix and infers the
# width, so the cell does not depend on the model using 300-dimensional vectors.
cosine_similarity(
    tokens['apple'].reshape(1, -1),
    tokens['pear'].reshape(1, -1),
).round(2)[0, 0]

0.61

In [148]:
# reshape(1, -1) turns each vector into a single-row matrix and infers the
# width, so the cell does not depend on the model using 300-dimensional vectors.
cosine_similarity(
    tokens['blue'].reshape(1, -1),
    tokens['apple'].reshape(1, -1),
).round(2)[0, 0]

0.35

In [149]:
# reshape(1, -1) turns each vector into a single-row matrix and infers the
# width, so the cell does not depend on the model using 300-dimensional vectors.
cosine_similarity(
    tokens['blue'].reshape(1, -1),
    tokens['orange'].reshape(1, -1),
).round(2)[0, 0]

0.77

# Exercise 

Time to iterate on the exercise at the end of ["2- Text Classification"](https://github.com/gr33ndata/jonthebeach/blob/main/notebooks/2-%20Text%20Classification.ipynb).

- It's up to you to classify the documents based on their `polarity` or the `deceptive` label.
- Try TF-IDF vectorizer instead of the Count Vectorizer this time. And compare the classifier's accuracy.
- Try different values for `ngram_range`: for example, 1-grams only, a combination of 1- and 2-grams, and 3-grams.
- Which words correlate with positive polarity (or with deceptiveness)?