In [1]:
import numpy as np
import re
import sys
from scipy.stats import chi2_contingency as chi
from nltk.corpus import stopwords

In [20]:
reader = lambda i : list(filter(bool, open(i).read().split('\n')))
file_in = reader('in')
words_list = reader('mark1/in1')

stop = set(stopwords.words('english'))
words_stopless = np.asarray(list(filter(lambda i: i not in stop, words_list)))

In [3]:
matrix = np.zeros((len(file_in), len(words_list)))

In [4]:
def filter_file(file_content):
    regex = re.compile('[^a-zA-Z]')
    file_content = file_content.replace('\r', ' ').lower()
    return regex.sub(' ', file_content)

In [5]:
def count(file_content, word):
    file_content = filter_file(file_content)
    return sum(1 for _ in re.finditer(r'\b%s\b' % re.escape(' '+word+' '), file_content))

In [6]:
for i, file_name in enumerate(file_in):
    sys.stdout.flush()
    print (100*float(i)/len(file_in), '%', '\r', end='')
    
    for j, word in enumerate(words_list):
        file_content = open(str(file_name)).read()
        matrix[i][j] = count(file_content, word)

99.95316159250585 %   

In [7]:
matrix = np.asarray(matrix, dtype=np.float32)
norm_matrix = matrix / (matrix.sum(axis=0)+0.0001)

In [8]:
def return_similarity(ind1, ind2):
    vect1 = norm_matrix[:,ind1]
    vect2 = norm_matrix[:,ind2]
    inds = np.where(np.logical_or(vect1>0, vect2>0))
    
    return np.linalg.norm(vect1[inds]-vect2[inds])

In [21]:
def get_similar_to(search_word):
    vals = []
    search_ind = words_list.index(search_word)
    
    for word in words_stopless:
        value = return_similarity(search_ind, words_list.index(word))
        vals.append(value)
    
    vals = np.asarray(vals)
    inds = vals.argsort()

    return np.asarray(words_stopless)[inds[1:20]]

In [10]:
def get_implication(search_word, word):
    search_ind = words_list.index(search_word)
    search_priors = norm_matrix[:,search_ind] > 0
    p_b = np.sum(search_priors)
    
    value_ind = words_list.index(word)
    value_priors = norm_matrix[:,value_ind] > 0
        
    intersect = search_priors * value_priors
    p_ab = np.sum(intersect)  
    return float(p_ab)/p_b
        

In [22]:
def get_implications(search_word):
    search_ind = words_list.index(search_word)
    search_priors = norm_matrix[:,search_ind] > 0
    p_b = np.sum(search_priors)
    
    res = []
    
    for word in words_stopless:
        value_ind = words_list.index(word)
        value_priors = norm_matrix[:,value_ind] > 0
        
        #nk = np.sum(value_priors)
        #if (nk == 0):
        #    res.append(0)
        #    continue
        #value_priors = norm_matrix[:,value_ind] * np.log(len(file_in)/float(nk))

        intersect = search_priors * value_priors
        p_ab = np.sum(intersect)
        res.append(float(p_ab)/p_b)
        
    res = np.asarray(res)
    inds = res.argsort()[::-1]
    return np.asarray(words_stopless)[inds[1:120]]

In [16]:
return_similarity(words_list.index("alcohol"), words_list.index("dowry"))

0.29386249

In [23]:
get_similar_to('settled')

array(['parties', 'criminal', 'settlement', 'matter', 'dispute',
       'offences', 'abuse', 'proceedings', 'quashing', 'emerges',
       'arrived', 'honourable', 'process', 'justice', 'continuation',
       'quash', 'resolved', 'ends', 'prosecutor'], 
      dtype='<U17')

In [24]:
get_implication('gold', 'dowry')

0.73814898419864561

In [25]:
get_implications('dowry')

array(['court', 'case', 'counsel', 'high', 'learned', 'also', 'day',
       'filed', 'public', 'accused', 'police', 'justice', 'criminal',
       'made', 'passed', 'honourable', 'time', 'prosecutor', 'facts',
       'come', 'first', 'given', 'offences', 'crime', 'circumstances',
       'respondent', 'present', 'matter', 'held', 'petitioner',
       'complaint', 'judicial', 'order', 'view', 'complainant', 'petition',
       'law', 'parties', 'seen', 'magistrate', 'punishable', 'proceedings',
       'considered', 'basis', 'settled', 'light', 'nature', 'report',
       'note', 'husband', 'dispute', 'may', 'wife', 'submitted',
       'allegation', 'heard', 'state', 'station', 'abuse', 'well',
       'conviction', 'said', 'process', 'power', 'though', 'regard', 'fir',
       'quash', 'family', 'arrived', 'marriage', 'event', 'trial',
       'settlement', 'quashing', 'relevant', 'sought', 'duty', 'apex',
       'proceeding', 'interest', 'different', 'decision', 'prosecution',
       'reporte

In [None]:
def get_implications_temp(index_words = [], neg_words = [], search_words = words_stopless):
    search_priors = np.ones(len(file_in), dtype=np.bool)
    
    for index_word in index_words:
        ind = words_list.index(index_word)
        prior = norm_matrix[:, ind] > 0
        search_priors = search_priors * prior
    
    for neg_word in neg_words:
        ind = words_list.index(neg_word)
        prior = norm_matrix[:, ind] > 0
        search_priors = search_priors * (1 - prior)
    
    p_b = np.sum(search_priors)
    
    res = []
    final = search_priors[:]
    
    for word in search_words:
        value_ind = words_list.index(word)
        value_priors = norm_matrix[:,value_ind] > 0

        intersect = search_priors * value_priors
        final = final * value_priors
        p_ab = np.sum(intersect)
        res.append(float(p_ab)/p_b)
        
    res = np.asarray(res)
    inds = res.argsort()[::-1]
    
    if len(search_words) > 10: 
        return np.asarray(search_words)[inds[1:100]]
    else:
        return np.asarray(search_words)[inds], res[inds], float(np.sum(final))/p_b

In [None]:
get_implications_temp(['car'], ['dowry'])