# Information Value

En esta notebook vamos a probar la medida de *Information Value* definida por Zanette & Montemurro.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
from __future__ import division, print_function
import pandas as pd
import numpy as np
import nltk
import math
from nltk.corpus import stopwords, gutenberg 


def is_punctuation(c):
    """Return True when token ``c`` should be discarded as punctuation.

    A token is rejected when it is empty, when it is exactly one character
    long (whatever that character is), or when it is one of a handful of
    multi-character punctuation clusters.
    """
    # NOTE(review): every one-character token is rejected, so single-letter
    # words such as "a" are dropped together with real punctuation marks --
    # confirm that this is intended.
    multi_char_marks = ('""', '--', ').', '.""')
    return len(c) <= 1 or c in multi_char_marks
    
def tokenize(text, only_alpha = False, only_alphanum = True,  clean_stop_words = False, clean_punctuation = True):
    """
    Split ``text`` into tokens with ``nltk.wordpunct_tokenize`` and filter them.

    Parameters:
        text: the raw string to tokenize.
        only_alpha: keep only tokens for which ``isalpha()`` is true.
        only_alphanum: keep only tokens for which ``isalnum()`` is true.
        clean_stop_words: drop tokens found in NLTK's English stop-word list
            (the comparison is case-sensitive, tokens are not lower-cased here).
        clean_punctuation: drop tokens for which ``is_punctuation`` is true.

    Returns:
        The list of tokens that pass every enabled filter, in text order.
    """
    # Read the stop-word list once, and only when it is needed; looking it up
    # for every token re-reads the corpus and scans a list each time.
    if clean_stop_words:
        stop_words = frozenset(stopwords.words('english'))
    else:
        stop_words = frozenset()

    def keep(token):
        if clean_punctuation and is_punctuation(token):
            return False
        if only_alpha and not token.isalpha():
            return False
        if only_alphanum and not token.isalnum():
            return False
        if clean_stop_words and token in stop_words:
            return False
        return True

    return [token for token in nltk.wordpunct_tokenize(text) if keep(token)]



def get_moby_dick_tokens():
    """Return the lower-cased tokens of Moby Dick from NLTK's Gutenberg corpus."""
    raw_text = nltk.corpus.gutenberg.raw('melville-moby_dick.txt')

    lowered = []
    for word in tokenize(raw_text, only_alphanum=True, clean_punctuation=True):
        lowered.append(word.lower())
    return lowered


In [2]:
# Tokenize Moby Dick once; later cells reuse this list.
md_tokens = get_moby_dick_tokens()

# Frequency distribution of the tokens over the whole text.
fd = nltk.FreqDist(md_tokens)

In [4]:
from scipy.stats import entropy

# Number of tokens per window when the text is split into consecutive parts.
window_size = 5000

def ocurrence_dataframe(tokenized_text, window_size):
    """
    Build a word-by-window occurrence matrix for a tokenized text.

    The text is cut into consecutive windows of ``window_size`` tokens (the
    last window may be shorter). The result has one row per distinct word and
    one integer column per window (0 .. P-1) holding the number of times the
    word occurs in that window, plus three derived columns:

        entropy: ``scipy.stats.entropy`` of the word's per-window counts
                 (natural logarithm, counts normalised to probabilities).
        total:   number of occurrences of the word in the whole text.
        freq:    ``total`` divided by the length of the text.

    Parameters:
        tokenized_text: sequence of tokens.
        window_size: positive number of tokens per window.

    Returns:
        A pandas DataFrame indexed by word. For an empty text, an empty frame
        with only the three derived columns is returned.

    Raises:
        ValueError: if ``window_size`` is not positive.
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive integer")

    # N = length of the text, P = number of windows.
    # float() keeps the ceiling correct even without true division in effect.
    N = len(tokenized_text)
    P = int(math.ceil(N / float(window_size)))

    if N == 0:
        return pd.DataFrame(columns=["entropy", "total", "freq"])

    freq = {}
    for i in range(P):
        # Count the words of this window and store the counts in column i.
        # Slicing past the end of the text is safe, so no clamping is needed.
        window = tokenized_text[window_size * i:window_size * (i + 1)]
        window_fdist = nltk.FreqDist(window)

        for word in window_fdist:
            if word not in freq:
                freq[word] = [0] * P

            freq[word][i] = window_fdist[word]

    df = pd.DataFrame.from_dict(freq, orient="index")

    # Snapshot the pure count columns before any derived column is added, so
    # the entropy and the totals are computed from the counts only.
    counts = df[list(df.columns)]

    df["entropy"] = counts.apply(entropy, axis=1, raw=True)
    df["total"] = counts.sum(axis=1)
    df["freq"] = df["total"] / float(N)

    return df

# Occurrence matrix of the original text, ordered from lowest to highest entropy.
occurrence_df = ocurrence_dataframe(md_tokens, window_size).sort_values(by="entropy")

In [5]:
occurrence_df.head(100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,41,entropy,total,freq
funereal,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,1,0.000005
dreamt,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,1,0.000005
thickens,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,1,0.000005
ramparts,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,1,0.000005
spiralizations,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,1,0.000005
conflagration,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,1,0.000005
prating,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,1,0.000005
feegee,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,1,0.000005
aboriginalness,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,1,0.000005
overhung,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,1,0.000005


# Problema de la entropía a secas

Observemos que el problema acá es que quedan ordenadas primero las palabras de baja frecuencia; más aún, las que aparecen una sola vez (entropía 0) quedan antes que todas las demás.

In [6]:
import random

# random.shuffle() works in place and returns None, so shuffle a copy: this
# keeps md_tokens in its original order (the cell can be re-run safely) and
# makes shuffled_text the actual shuffled token list.
# The fixed seed makes the shuffle reproducible across kernel restarts.
random.seed(0)
shuffled_text = list(md_tokens)
random.shuffle(shuffled_text)

shuffled_df = ocurrence_dataframe(shuffled_text, window_size)

shuffled_df[shuffled_df.total > 10]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,41,entropy,total,freq
yellow,1,0,0,1,1,1,1,1,0,0,...,0,0,0,1,0,0,0,2.894400,23,0.000110
four,0,1,2,1,0,3,1,4,3,3,...,3,3,3,3,2,1,0,3.432398,74,0.000353
hanging,0,0,0,1,0,0,1,0,0,0,...,1,0,0,1,1,0,2,2.718473,20,0.000095
lord,0,3,0,2,4,1,2,1,0,1,...,3,1,1,1,3,2,1,3.448940,66,0.000315
sinking,0,0,0,0,1,1,1,0,0,0,...,0,0,1,0,1,1,1,2.564949,13,0.000062
oceans,1,0,1,0,0,0,0,2,0,1,...,0,1,0,1,0,0,1,2.512659,16,0.000076
foul,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,2.271869,11,0.000053
receiving,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,2.204785,13,0.000062
wooden,0,1,0,0,1,3,1,1,0,1,...,0,0,0,1,1,1,1,2.949012,27,0.000129
nigh,1,1,1,3,2,1,2,2,0,0,...,1,1,2,1,0,0,0,3.287805,44,0.000210


In [7]:

# Attach each word's entropy in the shuffled text (aligned on the word index).
occurrence_df["shuffled_entropy"] = shuffled_df["entropy"]

# Information value: word frequency times the absolute entropy difference
# between the shuffled and the original text.
occurrence_df["information_value"] = (
    (occurrence_df["shuffled_entropy"] - occurrence_df["entropy"]).abs()
    * occurrence_df["freq"]
)

In [8]:
# Rank the words from highest to lowest information value and show the top 40.
occurrence_df = occurrence_df.sort_values("information_value", ascending=False)

occurrence_df.head(40)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37,38,39,40,41,entropy,total,freq,shuffled_entropy,information_value
whale,76,9,1,9,5,9,2,7,26,8,...,3,10,25,30,32,3.437954,1226,0.005854,3.722656,0.001667
ahab,0,0,0,0,0,4,15,14,1,31,...,33,32,41,40,34,3.129364,511,0.00244,3.688817,0.001365
queequeg,0,0,22,1,33,32,28,38,3,2,...,0,3,1,0,2,2.680595,252,0.001203,3.647151,0.001163
is,63,34,20,66,36,18,14,19,40,23,...,32,13,23,30,16,3.592272,1725,0.008237,3.725707,0.001099
ye,5,4,6,6,3,21,17,40,25,11,...,20,16,11,21,19,3.230246,472,0.002254,3.689643,0.001035
you,32,46,22,6,19,23,18,35,10,17,...,19,13,3,0,2,3.48081,894,0.004269,3.707978,0.00097
thou,2,3,0,6,0,18,13,12,9,2,...,41,19,3,7,16,2.934329,271,0.001294,3.661295,0.000941
me,16,35,49,12,40,20,39,21,9,30,...,21,29,13,15,28,3.403535,633,0.003022,3.703379,0.000906
stubb,0,0,0,0,0,0,0,1,9,23,...,6,8,2,17,10,2.95969,257,0.001227,3.651349,0.000849
bildad,0,0,0,0,0,3,38,25,10,0,...,0,0,0,0,0,1.106762,76,0.000363,3.421901,0.00084


# Cálculo de Information Value sin usar un shuffle de texto

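El cálculo hecho hasta acá depende de un shuffle aleatorio del texto. El apéndice del paper da una expresión analítica para la entropía esperada del texto mezclado, lo que permite calcular el Information Value sin hacer el shuffle.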

# En nuestro caso...


In [9]:
import re
from scipy.stats import entropy

# Load the per-province word table; decimal="," parses numbers that are
# written with a decimal comma.
df = pd.read_csv("../contrastes/provincias.csv", quotechar="\"", decimal=",")

# Index the rows by word so a word can be looked up with df.loc[word].
df = df.set_index("palabra")


# Columns whose name starts with "fnorm_".
fnorm_vars = [c for c in df.columns if re.match(r'fnorm_.*', c)]
# Columns whose name contains "Palabras"
# (presumably the per-province word counts -- TODO confirm against the CSV).
cant_palabras = [c for c in df.columns if re.match(r'.*Palabras', c)]


In [None]:
"""
Método que calcula la entropía de una palabra (usando df)
"""

import math


def w_entropy(df, word):
    """
    Entropy of ``word`` over the ``cant_palabras`` columns of ``df``.

    The values of those columns in the word's row are handed to
    ``scipy.stats.entropy``, which normalises them and uses the natural
    logarithm by default.
    """
    row = df.loc[word]
    # Convert to a plain list: per the original author, this works around
    # dtype confusion in the row (unverified).
    counts = row[cant_palabras].tolist()
    return entropy(counts)

# Sanity check: entropy of a uniform distribution over 10000 outcomes.
# scipy normalises the input and uses the natural log by default, so the
# displayed value should be ln(10000).
entropy([1] * 10000)

In [None]:
from scipy.stats import hypergeom

word = "anga"

# n = value of cantPalabra for the word, N = sum of cantPalabra over all words.
# Cast to int: values read from a mixed-dtype row may come back as floats.
n = int(df.loc[word].cantPalabra)
N = int(df.cantPalabra.sum())
P = 23 # number of provinces

# The hypergeometric distribution needs an integer number of draws, so the
# size of one part is the integer quotient N // P.
part_size = N // P
hv = hypergeom(M=N, N=part_size, n=n)

# Expected entropy (in bits) of the word in a randomly shuffled text; the sum
# runs over m = 1 .. min(n, N/P) with the upper bound included.
shuffled_entropy = -P * sum(
    hv.pmf(m) * (float(m) / n) * np.log2(float(m) / n)
    for m in range(1, min(n, part_size) + 1)
)

print(shuffled_entropy)

In [None]:
import numpy as np

def expected_random_shuffle(df, word, P=23):
    """
    Expected entropy (in bits) of ``word`` in a randomly shuffled text.

    Analytic expression from the last appendix of Zanette's paper:

        <H> = -P * sum_{m=1}^{min(n, N/P)} p(m) * (m/n) * log2(m/n)

    where p(m) is the hypergeometric probability of finding m of the word's
    n occurrences in one part of N/P tokens drawn from a text of N tokens.

    Parameters:
        df: DataFrame indexed by word with a ``cantPalabra`` column.
        word: index label of the word to evaluate.
        P: number of parts the text is split into (defaults to 23, the
            number of provinces).

    Returns:
        The expected entropy as a float; 0.0 when the word has no
        occurrences or a part would be empty.
    """
    # Cast to int: values read from a mixed-dtype row may come back as floats.
    n = int(df.loc[word].cantPalabra)
    N = int(df.cantPalabra.sum())

    # The hypergeometric distribution needs an integer number of draws.
    part_size = N // P
    if n <= 0 or part_size <= 0:
        return 0.0

    hv = hypergeom(M=N, N=part_size, n=n)

    # m = 1 .. min(n, N/P), upper bound included (m = 0 contributes nothing).
    m = np.arange(1, min(n, part_size) + 1)
    ratio = m / float(n)

    return -P * np.sum(hv.pmf(m) * ratio * np.log2(ratio))

    

def information_value(df, word):
    """
    Information value of ``word``.

    Computed as ``log2(1 + freq) * (shuffled_entropy - word_entropy)`` where
    ``freq`` is the word's share of the ``cantPalabra`` total,
    ``word_entropy`` is the entropy of its values over the ``cant_palabras``
    columns and ``shuffled_entropy`` is the value returned by
    ``expected_random_shuffle``.

    Parameters:
        df: DataFrame indexed by word with a ``cantPalabra`` column and the
            ``cant_palabras`` columns.
        word: index label of the word to evaluate.

    Returns:
        The information value as a float.
    """
    vec = df.loc[word][cant_palabras].tolist()

    freq = float(df.loc[word].cantPalabra) / (df.cantPalabra.sum())

    # expected_random_shuffle works in bits (np.log2) while scipy's entropy
    # defaults to the natural log; use base=2 so both entropies share a unit
    # before they are subtracted.
    word_entropy = entropy(vec, base=2)

    shuffled_entropy = expected_random_shuffle(df, word)

    return np.log2(1 + freq) * (shuffled_entropy - word_entropy)
    

In [None]:
# Sample of words used to eyeball the information value.
test_words = [
    "anga",
    "culiaw",
    "despues",
    "mitai",
    "artante",
    "q",
    "como",
    "ver",
    "de",
    "nah"]

# print() as a function: the notebook imports print_function from __future__,
# and the statement form is a syntax error on Python 3.
for word in test_words:
    print(word, information_value(df, word))

In [None]:
# Build real lists: on Python 3 map() returns a lazy iterator, which is not a
# valid column value.
df["iv"] = [information_value(df, word) for word in df.index]
df["entropy"] = [w_entropy(df, word) for word in df.index]

In [None]:
# DataFrame.sort() was removed from pandas; sort_values() is the replacement
# and is what the earlier cells of this notebook already use.
df = df.sort_values(by="entropy")

In [None]:
# Export the words whose cantPalabra exceeds 150, keeping only the summary columns.
frequent_words = df.cantPalabra > 150
export_columns = ["entropy", "iv", "cantPalabra", "provinciaFnormMax"]
df.loc[frequent_words, export_columns].to_csv("prueba.csv")