In [None]:
from porter import stem
from collections import Counter
import numpy as np
import re
import heapq

# Toy corpus: one string per document. A document's position in this list is
# the id used by the index-building functions below.
doc = [
    "the new home has been saled on top forecasts",
    "the home sales rise in july",
    "there is an increase in home sales in july",
    "july encounter new a new home sales rise",
]

# Words that are dropped before indexing (each entry listed once).
stopwords = ["the", "a", "an", "on", "behind", "under", "there", "in"]

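# Alternative: use NLTK's English stop-word list instead of the hand-written one
# (note that this import would shadow the `stopwords` list defined above):
# from nltk.corpus import stopwords
# stopwords.words("english")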



def traitement(document, stop_words=stopwords):
    """Tokenise one document and count its stemmed terms.

    The text is split on whitespace, each word is lowercased, stop words are
    removed, and the remaining words are stemmed with ``stem``.

    Args:
        document: The raw text of a single document.
        stop_words: Collection of lowercase words to discard.

    Returns:
        A ``Counter`` mapping each stemmed term to its number of occurrences.
    """
    # Lowercase *before* the stop-word test so that "The" is filtered like
    # "the"; split() (no argument) also avoids empty tokens on repeated spaces.
    tokens = (word.lower() for word in document.split())
    return Counter(stem(token) for token in tokens if token not in stop_words)

def preprocess(t):
    """Yield the stemmed, stop-word-free terms of a text.

    The text is lowercased and split on runs of non-word characters.

    Args:
        t: The raw text to tokenise.

    Yields:
        One stemmed term per remaining word, in order of appearance.
    """
    for token in re.split(r"\W+", t.lower()):
        # re.split produces "" when the text starts or ends with a separator;
        # skip those, and test the word itself (not its stem) against the
        # stop-word list, as `traitement` does.
        if token and token not in stopwords:
            yield stem(token)

# Term counts of the first document as produced by the regex-based `preprocess`.
print(Counter(preprocess(doc[0])))

def tf(document):
    """Count how many times each element of ``document`` occurs.

    Args:
        document: An iterable of hashable items (for example a list of terms).

    Returns:
        A ``Counter`` mapping each item to its number of occurrences.
    """
    counts = Counter()
    counts.update(document)
    return counts

# Term counts of the first document as produced by `traitement`.
traitement(doc[0])


Counter({'new': 1,
         'home': 1,
         'ha': 1,
         'been': 1,
         'sale': 1,
         'top': 1,
         'forecast': 1})

In [None]:
def create_index(document, tfidf=False):
    """Build the forward index of a corpus.

    Args:
        document: Sequence of raw document texts.
        tfidf: Accepted but currently not used by this function.

    Returns:
        A dict mapping each document's position in ``document`` to the
        term counts returned by ``traitement`` for that document.
    """
    return {doc_id: traitement(text) for doc_id, text in enumerate(document)}

# Build the forward index of the toy corpus and display it.
index = create_index(doc)
index



{0: Counter({'new': 1,
          'home': 1,
          'ha': 1,
          'been': 1,
          'sale': 1,
          'top': 1,
          'forecast': 1}),
 1: Counter({'home': 1, 'sale': 1, 'rise': 1, 'juli': 1}),
 2: Counter({'is': 1, 'increas': 1, 'home': 1, 'sale': 1, 'juli': 1}),
 3: Counter({'juli': 1,
          'encount': 1,
          'new': 2,
          'home': 1,
          'sale': 1,
          'rise': 1})}

In [None]:

def df(index):
    """Compute the document frequency of every term in a forward index.

    Args:
        index: Mapping ``doc_id -> {term: count}``, as built by
            ``create_index``.

    Returns:
        A ``Counter`` mapping each term to the number of documents in which
        it occurs at least once.
    """
    doc_freq = Counter()
    for term_counts in index.values():
        # Each document contributes at most 1 per term, whatever its count.
        doc_freq.update(term for term, count in term_counts.items() if count > 0)
    return doc_freq
    

In [None]:
# Document frequency: number of documents of `index` containing each term.
# The counter is created once, outside the loops, so that it accumulates, and
# we iterate over the terms (the keys) rather than the (term, count) pairs.
c = Counter()
for k, v in index.items():
    for w in v:
        c[w] += 1



In [None]:
def vocabulaire(document, stop_words=stopwords):
    """Return the sorted vocabulary (distinct stemmed terms) of a corpus.

    Tokenisation is delegated to ``traitement`` so that the vocabulary always
    contains exactly the terms that appear in the per-document term counts.

    Args:
        document: Sequence of raw document texts.
        stop_words: Collection of words to discard, passed to ``traitement``.

    Returns:
        A NumPy array of the distinct terms, as returned by ``np.unique``.
    """
    terms = set()
    for text in document:
        terms.update(traitement(text, stop_words))
    return np.unique(list(terms))


def create_index_inverse(document, tfidf=False):
    """Build the inverted index of a corpus.

    Args:
        document: Sequence of raw document texts; a document's position in
            the sequence is its id.
        tfidf: When true, each term count is multiplied by the smoothed
            inverse document frequency ``log((1 + N) / (1 + df))``, where
            ``N`` is the number of documents and ``df`` the number of
            documents containing the term.

    Returns:
        A dict ``term -> {doc_id: weight}`` whose terms are in sorted order.
        Only documents that contain the term appear in its posting dict.
    """
    n_docs = len(document)

    # Tokenise every document exactly once instead of once per vocabulary term.
    postings_by_term = {}
    for doc_id, text in enumerate(document):
        for term, count in traitement(text).items():
            postings_by_term.setdefault(term, {})[doc_id] = count

    # Keep the terms in sorted order.
    index = {term: postings_by_term[term] for term in sorted(postings_by_term)}

    if tfidf:
        for term, postings in index.items():
            idf = np.log((1 + n_docs) / (1 + len(postings)))
            # Build a new posting dict rather than updating the one being
            # iterated over.
            index[term] = {doc_id: count * idf for doc_id, count in postings.items()}

    return index

# Inverted index of the toy corpus with TF-IDF weights.
create_index_inverse(doc, True)

# TODO: look into a version that does not go through the vocabulary?

{'been': {0: 0.9162907318741551},
 'encount': {3: 0.9162907318741551},
 'forecast': {0: 0.9162907318741551},
 'ha': {0: 0.9162907318741551},
 'home': {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0},
 'increas': {2: 0.9162907318741551},
 'is': {2: 0.9162907318741551},
 'juli': {1: 0.22314355131420976,
  2: 0.22314355131420976,
  3: 0.22314355131420976},
 'new': {0: 0.5108256237659907, 3: 1.0216512475319814},
 'rise': {1: 0.5108256237659907, 3: 0.5108256237659907},
 'sale': {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0},
 'top': {0: 0.9162907318741551}}

In [None]:
def df(document, stop_words=stopwords):
    """Compute the document frequency of every term of a corpus.

    Args:
        document: Sequence of raw document texts.
        stop_words: Collection of words to discard, passed to ``traitement``.

    Returns:
        A dict, with terms in sorted order, mapping each term to the number
        of documents that contain it.
    """
    doc_freq = {}
    # Tokenise each document once; iterating the term counts yields every
    # distinct term of the document exactly once.
    for text in document:
        for term in traitement(text, stop_words):
            doc_freq[term] = doc_freq.get(term, 0) + 1
    return {term: doc_freq[term] for term in sorted(doc_freq)}

# Document frequency of every term in the toy corpus.
df(doc)

{'been': 1,
 'encount': 1,
 'forecast': 1,
 'ha': 1,
 'home': 4,
 'increas': 1,
 'is': 1,
 'juli': 3,
 'new': 2,
 'rise': 2,
 'sale': 4,
 'top': 1}

In [None]:
def tfidf(document, word):
    """Compute the TF-IDF weight of one word in every document containing it.

    The word is lowercased and stemmed before being looked up in the term
    counts of each document. The weight is
    ``tf * log((1 + N) / (1 + df))``, where ``tf`` is the term count in the
    document, ``N`` the number of documents and ``df`` the number of
    documents containing the term.

    Args:
        document: Sequence of raw document texts; a document's position in
            the sequence is its id.
        word: The word to score.

    Returns:
        A dict ``doc_id -> weight`` restricted to the documents that contain
        the term (empty if no document does).
    """
    term = stem(word.lower())
    n_docs = len(document)

    # Term frequency of `term` in each document where it occurs.
    tf_by_doc = {}
    for doc_id, text in enumerate(document):
        count = traitement(text)[term]
        if count:
            tf_by_doc[doc_id] = count

    idf = np.log((1 + n_docs) / (1 + len(tf_by_doc)))
    return {doc_id: count * idf for doc_id, count in tf_by_doc.items()}

In [None]:
def taat(document, question, tfidf=False):
    """Rank the documents of a corpus against a query, term-at-a-time.

    The posting dict of each query term is processed in turn and its weights
    are added to a per-document accumulator.

    Args:
        document: Sequence of raw document texts.
        question: The raw query text.
        tfidf: Passed to ``create_index_inverse``; when true the postings
            carry TF-IDF weights instead of raw term counts.

    Returns:
        A list of ``(doc_id, score)`` pairs for every document matching at
        least one query term, sorted by decreasing score; ties are broken by
        increasing ``doc_id``.
    """
    # Normalise the query with the same pipeline as the documents; otherwise
    # an inflected word such as "sales" never matches the indexed stem "sale".
    query_terms = set(traitement(question))
    index = create_index_inverse(document, tfidf)

    scores = {}
    # Walk the index in its own order so the summation order is deterministic.
    for term, postings in index.items():
        if term not in query_terms:
            continue
        for doc_id, weight in postings.items():
            scores[doc_id] = scores.get(doc_id, 0) + weight

    return sorted(scores.items(), key=lambda item: (-item[1], item[0]))

# Rank the toy corpus for a sample query, with TF-IDF weighting enabled.
taat(doc, "new home sales", True)

[(3, 1.0216512475319814), (0, 0.5108256237659907), (1, 0.0), (2, 0.0)]

In [None]:
# Scratch check of heapq: turn a small list into a heap in place and show it.
t = list(range(1, 5))
heapq.heapify(t)
t

[1, 2, 3, 4]

In [None]:
# Print the posting list of every term of the TF-IDF inverted index,
# highest weight first (one line per term).
d = create_index_inverse(doc, True)
for k, v in d.items():
    print(sorted(v.items(), key=lambda x:x[1], reverse=True))
    

[(0, 0.9162907318741551)]
[(3, 0.9162907318741551)]
[(0, 0.9162907318741551)]
[(0, 0.9162907318741551)]
[(0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0)]
[(2, 0.9162907318741551)]
[(2, 0.9162907318741551)]
[(1, 0.22314355131420976), (2, 0.22314355131420976), (3, 0.22314355131420976)]
[(3, 1.0216512475319814), (0, 0.5108256237659907)]
[(1, 0.5108256237659907), (3, 0.5108256237659907)]
[(0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0)]
[(0, 0.9162907318741551)]


In [None]:
# Display the forward index of the toy corpus.
create_index(doc)

{0: Counter({'new': 1,
          'home': 1,
          'ha': 1,
          'been': 1,
          'sale': 1,
          'top': 1,
          'forecast': 1}),
 1: Counter({'home': 1, 'sale': 1, 'rise': 1, 'juli': 1}),
 2: Counter({'is': 1, 'increas': 1, 'home': 1, 'sale': 1, 'juli': 1}),
 3: Counter({'juli': 1,
          'encount': 1,
          'new': 2,
          'home': 1,
          'sale': 1,
          'rise': 1})}

In [None]:
def daat(document, question, tfidf=False):
    """Rank the documents of a corpus against a query, document-at-a-time.

    Documents are visited in id order; for each one, the weights found in the
    posting dicts of all query terms are summed into its final score before
    moving on to the next document.

    Args:
        document: Sequence of raw document texts.
        question: The raw query text.
        tfidf: Passed to ``create_index_inverse``; when true the postings
            carry TF-IDF weights instead of raw term counts.

    Returns:
        A list of ``(doc_id, score)`` pairs for every document matching at
        least one query term, sorted by decreasing score; ties are broken by
        increasing ``doc_id``.
    """
    # Normalise the query like the documents, and ignore query terms that are
    # absent from the index instead of failing with a KeyError.
    query_terms = sorted(set(traitement(question)))
    index = create_index_inverse(document, tfidf)
    postings = [index[term] for term in query_terms if term in index]

    scores = {}
    # Document ids are the positions in `document` (see create_index_inverse).
    for doc_id in range(len(document)):
        weights = [posting[doc_id] for posting in postings if doc_id in posting]
        if weights:
            scores[doc_id] = sum(weights)

    return sorted(scores.items(), key=lambda item: (-item[1], item[0]))

In [None]:
def add_if(ds, heap, k):
    """Offer a scored document to a bounded min-heap holding the k best.

    The heap needs comparable entries, so each ``(doc, score)`` pair is
    stored as ``(score, doc)``: the heap is then ordered by score and
    ``heap[0]`` is always the lowest-scored entry kept so far.

    Args:
        ds: A ``(doc, score)`` pair.
        heap: A list managed with ``heapq``; modified in place.
        k: Maximum number of entries to keep. Nothing is stored if ``k <= 0``.

    Returns:
        None.
    """
    if k <= 0:
        return
    doc_id, score = ds
    entry = (score, doc_id)
    if len(heap) < k:
        heapq.heappush(heap, entry)
    elif heap[0][0] < score:
        # The newcomer beats the current minimum: swap it in.
        heapq.heapreplace(heap, entry)

Note: keep a mapping between the internal document ID and the external ID (internal ID <-> external ID).