# Data Mining Tests

In [1]:
import copy
import geopy
import geopy.distance
import pprint
import pymongo
import time
from lib.lda import LDAHelper

# Istantiate the helper object
lda_helper = LDAHelper(disableLogs=True)

# connect db
client = pymongo.MongoClient('localhost', 27017)

# get db
db = client['url-project']

# open collection
documents = db['documents']

## Function definitions

In [2]:
def move(start, direction, distance):
    """
    Returns a geopy.Point distant "distance"
    from the "start" point in the specified "direction"
    """
    orientation = 0
    start_point = copy.deepcopy(start)
    if direction == 'r':
        orientation = 90
    elif direction == 'u':
        orientation = 0
    elif direction == 'l':
        orientation = 360
    elif direction == 'b':
        orientation = 180
    return distance.destination(point=start_point, bearing=orientation)

def create_grid(bottom_left, upper_right, step):
    """
    Creates a grid of size step*step
    Each cells contains 4 geopy.Point which represent
    the 4 corners of each cells.
    """
    lat_bl, lon_bl = bottom_left
    lat_ur, lon_ur = upper_right
    bottom_left = geopy.Point(lat_bl, lon_bl)
    bottom_right = geopy.Point(lat_bl, lon_ur)
    upper_right = geopy.Point(lat_ur, lon_ur)
    upper_left = geopy.Point(lat_ur, lon_bl)

    # calculate cell size, the same if bottom_left and upper_right describe a square
    distance_horizontal = geopy.distance.vincenty(bottom_left, bottom_right).meters
    new_meters = distance_horizontal/step

    # vertical size cells
    dh = geopy.distance.VincentyDistance(meters=new_meters)

    distance_vertical = geopy.distance.vincenty(upper_left, bottom_left).meters
    new_meters = distance_vertical/step

    # horizontal size cells
    dv = geopy.distance.VincentyDistance(meters=new_meters)

    # define a matrix step*step
    # each cell has a list with 4 Points which represent the 4 corners
    matrix = [[0]*step for i in range(step)]

    tmp = move(upper_left, 'b', dv)
    matrix[0][0] = [upper_left, upper_right, tmp, move(tmp, 'r', dh)]

    # fill the first column
    for i in range(1, step):
        matrix[i][0] = [matrix[i-1][0][2], matrix[i-1][0][3],
                        move(matrix[i-1][0][2], 'b', dv), move(matrix[i-1][0][3], 'b', dv)]

    # fill all the other columns
    for i in range(0, step):
        for j in range(1, step):
            matrix[i][j] = [matrix[i][j-1][1], move(matrix[i][j-1][1], 'r', dh),
                            matrix[i][j-1][3], move(matrix[i][j-1][3], 'r', dh)]

    return matrix


def get_texts_from_db(mongo_collection, bottom_left_corner, upper_right_corner):
    """
    Return list of strings that represents text of documents into that area
    Input: mongo_collection, bottom_left_corner (lat, long), upper_right_corner (lat, long)
    """
    results = mongo_collection.find({
        'loc': {'$geoWithin': {'$box': [[bottom_left_corner[1], bottom_left_corner[0]],
                                        [upper_right_corner[1], upper_right_corner[0]]]}}
    })

    corpus = []
    for result in results:
        corpus.append(result['boilerpipe'])

    return corpus

## Creation of the minimum Grid
First of all, the basic grid it is created. We consider as the minium grid a **4x4** grid. To create the grid:
1. Get a geographic area, rect using bottom left and top right geopoints
2. Call the functions **create_grid** that create a matrix of dimension **4x4**, each cells contains the geographic coords of the identified area (rect using bottom left and top right geopoints)
3. For each cell, then retrieve from the database the documents in the corresponding geographic area.
4. For each cells, define the corpus (tokenized documents)
5. At the end, the object gensim.Dictionary is created.

We are ready to run the baseline approach and compute the lda model for each cells.

In [3]:
# Give as imput the bottom left corner and top right corner.
# Divides the geographic map in a grid 4x4
grid = create_grid((44.222765, 7.531128), (47.056277, 12.661743), 4)

all_texts = []

# iterate over each cells
for i in range(len(grid)):
    for j in range(len(grid)):
        
        # get the cell
        cell = grid[i][j]
        corners = copy.deepcopy(cell)

        # get texts in that area
        texts = get_texts_from_db(documents, corners[2], corners[1])

        # tokenized documents
        corpus = []
        for text in texts:
            corpus.append(lda_helper.clean(text))
        
        # merge tokenized documents to the set of all documents
        all_texts += corpus

        # define the object in the grid
        grid[i][j] = {'corners': corners, 'texts': corpus}
        
dictionary = lda_helper.create_dictionary(all_texts)

### Baseline
#### Compute the lda model for each cell

In [4]:
for i in range(len(grid)):
    for j in range(len(grid)):
        
        # get the cell
        cell = grid[i][j]
        
        # get tokenized documents
        texts = cell['texts']
        
        print("computing...")
        print(i, j, len(texts))
        
        # compute lda model
        lda_results = lda_helper.lda_topic(texts, dictionary=dictionary, dictionary_filters=False)
        
        # get lda results: model, topics and corpus (doc2bow)
        cell['lda_model'] = lda_results['lda_model']
        cell['topics'] = lda_results['topics']
        cell['corpus'] = lda_results['corpus']

computing...
0 0 127
computing...
0 1 136
computing...
0 2 126
computing...
0 3 36
computing...
1 0 163
computing...
1 1 602
computing...
1 2 153
computing...
1 3 354
computing...
2 0 924
computing...
2 1 1897
computing...
2 2 679
computing...
2 3 501
computing...
3 0 377
computing...
3 1 198
computing...
3 2 712
computing...
3 3 362


In [5]:
import pickle

with open('data/minimalgrid.pkl', 'wb') as f:
    pickle.dump(grid, f)

# load pre-computated minimal-grid 4x4
with open('data/minimalgrid.pkl', 'rb') as f:
    minimalgrid = pickle.load(f)

## Change Grid size

In [6]:
# minimalgrid 4x4 -> grid 2x2
preceed_grid = minimalgrid
preceed_dimension = 4
target_dimension = 2
new_cell_side = preceed_dimension / target_dimension
print(new_cell_side)

def getSquare(m, i, j, new_side):
    sq = []
    
    # upper left indexes
    i_up_index = int(i*new_side)
    j_up_index = int(j*new_side)
    new_side = int(new_side)
    
    for i in range(i_up_index, i_up_index + new_side):
        for j in range(j_up_index, j_up_index + new_side):
            print(i, j)
            sq.append(m[i][j])
        
    return sq

new_grid = [[0]*target_dimension for i in range(target_dimension)]
for i in range(target_dimension):
    for j in range(target_dimension):
        new_grid[i][j] = getSquare(preceed_grid, i, j, new_cell_side)

2.0
0 0
0 1
1 0
1 1
0 2
0 3
1 2
1 3
2 0
2 1
3 0
3 1
2 2
2 3
3 2
3 3


In [7]:
### recompute new_grid
def computeBaseline(grid):
    side = len(grid)
    
    for i in range(side):
        for j in range(side):
            parts = grid[i][j]
            texts = []
            for p in parts:
                texts += p["texts"]
            
            # compute lda model
            lda_results = lda_helper.lda_topic(texts, dictionary=dictionary, dictionary_filters=False)

            grid[i][j] = {'texts': texts}
            grid[i][j]['corpus'] = lda_results['corpus']
            grid[i][j]['lda_model'] = lda_results['lda_model']
            grid[i][j]['topics'] = lda_results['topics']
            

new_baseline_grid = copy.deepcopy(new_grid)
start_time = time.time()
computeBaseline(new_baseline_grid)
print("--- %s seconds ---" % (time.time() - start_time))

--- 222.67525029182434 seconds ---


In [8]:
### lda update new_grid
def pickTheBestOne(parts):
    max_len = 0
    best_idx = 0
    
    for i, p in enumerate(parts):
        current = len(p['texts'])
        if current > max_len:
            max_len = current
            best_idx = i
    
    best_part = parts.pop(best_idx)
    return best_part, parts

def computeLdaUpdate(grid):
    side = len(grid)
    for i in range(side):
        for j in range(side):
            parts = grid[i][j]
            best_part, rest = pickTheBestOne(parts)
            lda_model = best_part['lda_model']
            corpus = []
            texts = [best_part["texts"]]
            for p in rest:
                corpus += p["corpus"]
                texts += p["texts"]
            total_corpus = corpus + best_part['corpus']
            lda_results = lda_helper.lda_update_merge(lda_model, corpus, total_corpus)
            grid[i][j] = {'corpus': total_corpus, 'texts': texts}
            grid[i][j]['lda_model'] = lda_results['lda_model']
            grid[i][j]['topics'] = lda_results['topics']

grid_lda_update = copy.deepcopy(new_grid)
start_time = time.time()
computeLdaUpdate(grid_lda_update)
print("--- %s seconds ---" % (time.time() - start_time))

--- 80.52158284187317 seconds ---
