# Information Value

En esta notebook vamos a probar la medida de Information Value definida por Zanette & Montemurro.

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
from __future__ import division, print_function
import pandas as pd
import numpy as np
import nltk
import math
from nltk.corpus import stopwords, gutenberg 


def is_punctuation(c):
    """
    Tell whether token ``c`` should be discarded as punctuation.

    Returns True for the empty string, for ANY one-character token (so
    single letters and digits are discarded as well, not only punctuation
    marks), and for the multi-character sequences '""', '--', ').' and '.""'.
    Every other token yields False.
    """
    # Any single character is rejected outright, whatever it is.
    if len(c) == 1:
        return True
    # Remaining rejects: the empty token and a few punctuation clusters.
    return c in ('', '""', '--', ').', '.""')
    
def tokenize(text, only_alpha=False, only_alphanum=True, clean_stop_words=False, clean_punctuation=True):
    """
    Split ``text`` with NLTK's ``wordpunct_tokenize`` and filter the tokens.

    Parameters
    ----------
    text : str
        Raw text to tokenize.
    only_alpha : bool
        Keep only tokens for which ``str.isalpha()`` is true.
    only_alphanum : bool
        Keep only tokens for which ``str.isalnum()`` is true.
    clean_stop_words : bool
        Drop tokens found in NLTK's English stop-word list. The comparison
        is done on the token as-is (no case folding happens here).
    clean_punctuation : bool
        Drop tokens for which ``is_punctuation`` returns True.

    Returns
    -------
    list
        The surviving tokens, in their original order.
    """
    # Read the stop-word corpus once (and only when requested) instead of
    # once per token; a set also makes each membership test O(1).
    stop_words = frozenset(stopwords.words('english')) if clean_stop_words else frozenset()

    return [
        t for t in nltk.wordpunct_tokenize(text)
        if (not clean_punctuation or not is_punctuation(t))
        and (not only_alpha or t.isalpha())
        and (not only_alphanum or t.isalnum())
        and t not in stop_words  # empty set when clean_stop_words is False
    ]



def get_moby_dick_tokens():
    """
    Return the lower-cased tokens of Moby Dick.

    The raw text is read from NLTK's Gutenberg corpus
    ('melville-moby_dick.txt'), tokenized with ``tokenize`` keeping only
    alphanumeric, non-punctuation tokens, and each token is lower-cased
    after filtering.
    """
    raw_text = nltk.corpus.gutenberg.raw('melville-moby_dick.txt')
    lowered = []
    for tok in tokenize(raw_text, only_alphanum=True, clean_punctuation=True):
        lowered.append(tok.lower())
    return lowered


In [3]:
# Lower-cased, filtered tokens of Moby Dick (see get_moby_dick_tokens).
md = get_moby_dick_tokens()

# Frequency distribution of those tokens over the whole text.
fd = nltk.FreqDist(md)

In [4]:

def ocurrence_dataframe(tokenized_text, window_size):
    """
    Build a word-by-window occurrence matrix for a given window size.

    The text is cut into consecutive, non-overlapping windows of
    ``window_size`` tokens (the last window may be shorter), and the
    occurrences of every token are counted inside each window.

    Parameters
    ----------
    tokenized_text : sequence
        Hashable tokens; must support ``len()`` and slicing.
    window_size : int
        Number of tokens per window; must be positive.

    Returns
    -------
    pandas.DataFrame
        One row per distinct token and one column per window (0 .. P-1);
        each cell holds how many times the token occurs in that window.

    Raises
    ------
    ValueError
        If ``window_size`` is not positive.
    """
    # Local import keeps this helper self-contained; a Counter is all that is
    # needed to count tokens inside a window.
    from collections import Counter

    if window_size <= 0:
        raise ValueError("window_size must be a positive integer")

    # N = length of the text, P = number of windows.
    N = len(tokenized_text)
    # Integer ceiling division: exact for any N and independent of whether
    # true division (`from __future__ import division`) is in effect.
    P = (N + window_size - 1) // window_size

    freq = {}
    for i in range(P):
        # Slicing past the end is clipped, so the last window is simply shorter.
        window = tokenized_text[window_size * i:window_size * (i + 1)]
        for word, count in Counter(window).items():
            if word not in freq:
                # First sighting of the word: zero count in every window.
                freq[word] = [0] * P
            freq[word][i] = count
    return pd.DataFrame.from_dict(freq, orient="index")

# Occurrence matrix of the Moby Dick tokens using windows of 5000 tokens.
occurrence_df = ocurrence_dataframe(md, 5000)

In [11]:
from scipy.stats import entropy

# Entropy (natural log, scipy's default) of each word's distribution over the
# windows. Only the window-count columns are used, so re-running this cell
# does not feed a previously computed "entropy" column back into the result.
window_columns = [col for col in occurrence_df.columns if col != "entropy"]
occurrence_df["entropy"] = occurrence_df[window_columns].apply(entropy, axis=1, raw=True)

In [12]:
# Sort the words in place by ascending entropy, so the words concentrated in
# the fewest windows come first.
occurrence_df.sort_values(by="entropy", inplace=True)

In [14]:
# Show the first 100 words of the frame (lowest entropy once it has been sorted).
occurrence_df.index[:100]

Index([u'funereal', u'dreamt', u'thickens', u'ramparts', u'spiralizations',
       u'conflagration', u'prating', u'feegee', u'aboriginalness', u'overhung',
       u'platters', u'repelling', u'preparative', u'fulller', u'pail',
       u'oxygenated', u'tasks', u'normal', u'discreet', u'bestreaked',
       u'thews', u'dugongs', u'scrupulously', u'sentence', u'overmanned',
       u'poniards', u'identify', u'gauntleted', u'1775', u'1776',
       u'shakespeare', u'belfast', u'1772', u'agile', u'defray', u'bladder',
       u'fumes', u'trimming', u'beholds', u'enjoyments', u'pupella', u'claims',
       u'conquering', u'bamboozle', u'salisbury', u'doctrine', u'armada',
       u'stall', u'cones', u'snorts', u'crushing', u'jealousy', u'approximate',
       u'brawniness', u'vacated', u'starve', u'marten', u'evinces',
       u'entombment', u'placeless', u'analyse', u'scimetars', u'amounted',
       u'formerly', u'subs', u'thinkest', u'upheaving', u'translated', u'cone',
       u'mistifying', u'peri

# En nuestro caso...


In [None]:
import re
from scipy.stats import entropy

# Load the word table; numbers in the file use "," as the decimal separator.
# NOTE(review): presumably one row per word with per-province statistics -- confirm.
df = pd.read_csv("../contrastes/provincias.csv", quotechar="\"", decimal=",")

# Index the rows by word so they can be looked up with df.loc[word].
df = df.set_index("palabra")


# Column groups picked by name: those starting with "fnorm_" and those
# containing "Palabras".
# NOTE(review): cant_palabras presumably holds the per-province counts -- confirm.
fnorm_vars = [c for c in df.columns if re.match(r'fnorm_.*', c)]
cant_palabras = [c for c in df.columns if re.match(r'.*Palabras', c)]


In [None]:
"""
Método que calcula la entropía de una palabra (usando df)
"""

import math


def w_entropy(df, word):
    """
    Entropy of ``word``'s row of ``df`` restricted to the ``cant_palabras`` columns.

    The values are handed to ``scipy.stats.entropy`` with its defaults, which
    normalises them to a probability distribution and uses the natural
    logarithm, so the result is expressed in nats.
    """
    # A plain Python list is passed on purpose: the original author suspected
    # dtype trouble when giving scipy the row Series directly.
    return entropy(df.loc[word][cant_palabras].tolist())

# Sanity check: a uniform distribution over 10000 outcomes. scipy normalises
# the counts and uses the natural log by default, so this evaluates to
# ln(10000), about 9.21.
entropy([1] * 10000)

In [None]:
from scipy.stats import hypergeom

# Worked example for a single word: expected entropy of the word after the
# text is randomly shuffled across P equal-sized parts.
word = "anga"

n = df.loc[word].cantPalabra  # cantPalabra of the chosen word
N = df.cantPalabra.sum()      # cantPalabra summed over every word
P = 23  # number of provinces

# Size of each part. Floor division keeps it integral even though true
# division is enabled by `from __future__ import division`.
part_size = N // P

# scipy parametrisation: M = population size, n = marked items, N = draws.
hv = hypergeom(M=N, N=part_size, n=n)

# The sum runs over m = 1 .. min(n, N/P), upper bound included.
m_values = np.arange(1, min(n, part_size) + 1)
m_ratio = m_values / float(n)
shuffled_entropy = -P * np.sum(hv.pmf(m_values) * m_ratio * np.log2(m_ratio))

# print() as a function: the notebook enables print_function in its first cell.
print(shuffled_entropy)

In [None]:
import numpy as np

def expected_random_shuffle(df, word, n_parts=23):
    """
    Expected entropy (in bits) of ``word`` over the parts of a randomly shuffled text.

    Implements the closed form the original author attributes to the last
    appendix of Zanette's paper:

        -P * sum_{m=1}^{min(n, N/P)} p(m) * (m/n) * log2(m/n)

    where ``p(m)`` is the hypergeometric probability of finding ``m`` of the
    word's ``n`` occurrences in a part of ``N/P`` tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by word, with a ``cantPalabra`` column.
    word : hashable
        Index label of the word to evaluate.
    n_parts : int, optional
        Number of equal-sized parts P the text is divided into. Defaults to
        23, the number of provinces.

    Returns
    -------
    float
        The expected entropy, in bits.
    """
    n = df.loc[word].cantPalabra
    N = df.cantPalabra.sum()

    # Floor division keeps the part size integral even when true division
    # (`from __future__ import division`) is in effect.
    part_size = N // n_parts

    # scipy parametrisation: M = population size, n = marked items, N = draws.
    hv = hypergeom(M=N, N=part_size, n=n)

    # Upper bound is inclusive; the terms are evaluated in one vectorised
    # call instead of one pmf() call per m.
    m_values = np.arange(1, min(n, part_size) + 1)
    m_ratio = m_values / float(n)
    return -n_parts * np.sum(hv.pmf(m_values) * m_ratio * np.log2(m_ratio))

    

def information_value(df, word):
    """
    Information value of ``word``.

    Computed as ``log2(1 + freq) * (shuffled_entropy - word_entropy)`` where

    * ``freq`` is the word's ``cantPalabra`` divided by the sum of
      ``cantPalabra`` over all words,
    * ``word_entropy`` is the entropy, in bits, of the word's values in the
      ``cant_palabras`` columns,
    * ``shuffled_entropy`` is the value returned by ``expected_random_shuffle``.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by word, with a ``cantPalabra`` column and the columns
        listed in ``cant_palabras``.
    word : hashable
        Index label of the word to evaluate.

    Returns
    -------
    float
    """
    # Plain list of the word's values in the cant_palabras columns.
    vec = df.loc[word][cant_palabras].tolist()

    freq = float(df.loc[word].cantPalabra) / (df.cantPalabra.sum())

    # Base 2 on purpose: the shuffled entropy is built with np.log2, so both
    # terms of the difference must be in bits (scipy defaults to nats).
    word_entropy = entropy(vec, base=2)

    shuffled_entropy = expected_random_shuffle(df, word)

    return np.log2(1 + freq) * (shuffled_entropy - word_entropy)
    

In [None]:
# Spot-check the information value on a handful of words.
test_words = [
    "anga",
    "culiaw",
    "despues",
    "mitai",
    "artante",
    "q",
    "como",
    "ver",
    "de",
    "nah"]

for word in test_words:
    # print() as a function: the notebook enables print_function in its first
    # cell, which makes the old print statement a syntax error.
    print(word, information_value(df, word))

In [None]:
# Information value and entropy of every word in the table. List
# comprehensions are used instead of map(): on Python 3 map() returns a lazy
# iterator, which cannot be assigned as a column.
df["iv"] = [information_value(df, w) for w in df.index]
df["entropy"] = [w_entropy(df, w) for w in df.index]

In [None]:
# Order the words by ascending entropy. DataFrame.sort(columns=...) was
# removed from pandas; sort_values is its replacement and is what the rest of
# this notebook already uses.
df = df.sort_values(by="entropy")

In [None]:
# Export entropy, information value, cantPalabra and provinciaFnormMax for the
# rows whose cantPalabra exceeds 150, to "prueba.csv".
# NOTE(review): 150 is presumably a minimum-frequency cut-off -- confirm.
df[df.cantPalabra > 150][["entropy", "iv", "cantPalabra", "provinciaFnormMax"]].to_csv("prueba.csv")