# RIW Project by Cécile Gontier and Delphine Shi
This notebook demonstrates the main results of the project.

## Part 1 : Pre-processing the corpus

### CACM
Compute the number of tokens and the vocabulary size of the CACM collection, fit the Heaps' law regression, and draw the collection's frequency-rank graph.

In [None]:
from CACMIndex import *
from heapRegression import *

# Build the CACM collection index. get_size() is called for the size figures;
# the numbers hard-coded below were copied from its results.
cacmindex = CACMIndex()
cacmindex.build()
cacmindex.get_size()

print()
# Token counts and vocabulary sizes recorded for the full text and for half
# of the text, in that order.
CACM_tokens = np.array([188887, 85151])
CACM_vocab = np.array([9238, 6334])

# Fit the regression on the two (tokens, vocabulary) measurements.
heap = HeapRegression(CACM_tokens, CACM_vocab)
parameters = heap.calculate_regression()
print("The Heap law parameters are:")
print("(b, k) = {}".format(parameters))

# Extrapolate the vocabulary size to a collection of one million tokens.
million_tokens = 10 ** 6
vocab_message = "For 1 million tokens there would be (by Heap law) {} vocabulary"
print(vocab_message.format(heap.calculate_vocab(million_tokens, parameters)))

# Frequency-rank graph of the collection, followed by its log variant.
cacm_frequencies = cacmindex.get_freq()
graph = FrequencyRankGraph(cacm_frequencies)
graph.draw_graph()
graph.draw_log_graph()

### CS276
Compute the number of tokens and the vocabulary size of the CS276 collection, and draw its frequency-rank graph.
Running this cell takes around 10 minutes.

In [None]:
from CS276Index import *
from heapRegression import *

# Build the CS276 collection index. The size report is left disabled here;
# the numbers hard-coded below were recorded from it beforehand.
cs276index = CS276Index()
cs276index.build()
# cs276index.get_size()

print()
# Token counts and vocabulary sizes recorded for the full text and for half
# of the text, in that order.
CS276_tokens = np.array([25527977, 12796571])
CS276_vocab = np.array([284418, 140665])

# Fit the regression on the two (tokens, vocabulary) measurements.
heap = HeapRegression(CS276_tokens, CS276_vocab)
parameters = heap.calculate_regression()
print("The Heap law parameters are:")
print("(b, k) = {}".format(parameters))

# Extrapolate the vocabulary size to a collection of one million tokens.
million_tokens = 10 ** 6
vocab_message = "For 1 million tokens there would be (by Heap law) {} vocabulary"
print(vocab_message.format(heap.calculate_vocab(million_tokens, parameters)))

# Frequency-rank graph of the collection, followed by its log variant.
cs276_frequencies = cs276index.get_freq()
graph = FrequencyRankGraph(cs276_frequencies)
graph.draw_graph()
graph.draw_log_graph()

## Part 2 : Inverted index
### CACM
Let's build the inverted index for the CACM collection.

In [None]:
from BSBIndex import *

# Build the CACM collection, then build the inverted index from its term and
# document dictionaries.
#
# The collection instance gets its own name on purpose: binding it to
# `CACMIndex` would replace the class with an instance in the notebook
# namespace, so re-running this cell (or any cell that calls `CACMIndex()`)
# would fail with "TypeError: object is not callable".
cacm_collection = CACMIndex()
cacm_collection.build()
index = BSBIndex(
    'CACM',
    cacm_collection.get_term_dict(),
    cacm_collection.get_document_dict(),
)
index.build()
# Uncomment to inspect the resulting index:
# print(index.get_index())

## Part 3 : Search
### Boolean search

In [None]:
from boolean.booleanEvaluation import *

# Boolean AND query on the CACM collection: documents having both the words
# "arithmetic" and "hardware".
# The original note lists 1258, 1409, 2175, 3131 next to this request
# (presumably the expected matching document ids -- TODO confirm).
request_and = BooleanRequest(Operation.AND, "arithmetic", "hardware")
model = BooleanEvaluation(request_and, "CACM")
res = model.search()
print(res)
hit_count = len(res)
model.display_results(res, hit_count)


# TODO: run the same boolean search on the CS276 collection

### Vectorial search

In [None]:
from vectorial.vectorialEvaluation import *

# Vectorial search on the CACM collection for a free-text request.
request = "arithmetic hardware"
model = VectorialEvaluation(request, "CACM")

# Other weighting schemes can be tried by swapping the search call:
# results, total = model.search(NaturalWeighting())
# results, total = model.search(TfIdfWeighting())
weighting = NormalizedTfIdfWeighting()
results, total = model.search(weighting, "jaccard")
# print(results)
model.display_results(results, total)

# TODO: run the same vectorial search on the CS276 collection

## Part 4 : CACM Evaluation

In [None]:
# TODO: include the performance measurements and graphs here

## Part 5 : CS276 Variable Byte