# Import Libraries

In [189]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3
#import mapbox_vector_tile
from time import time
import operator
from collections import Counter
import json
import os
import math
import pickle
import random
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from numpy import argmax
from tensorflow.keras.utils import to_categorical

from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam

import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
from torchvision import datasets, transforms
from torch.utils.data import Dataset

from shapely import geometry 
from PIL import Image, ImageDraw
from simplification.cutil import (
    simplify_coords,
    simplify_coords_idx,
    simplify_coords_vw,
    simplify_coords_vw_idx,
    simplify_coords_vwp,
)

# Define Functions

In [190]:
def create_connection(db_file):
    """ create a database connection to a SQLite database """
    try:
        conn = sqlite3.connect(db_file)
        print(conn)
    except Error as e:
        print(e)
    
    return conn

def PolyArea(x,y):
    return 0.5*np.abs(np.dot(x,np.roll(y,1))-np.dot(y,np.roll(x,1)))

def ScoreFormula(old_number_of_datapoints, new_number_of_datapoints, processing_time):
    return (1 - (new_number_of_datapoints / old_number_of_datapoints)) * (1 - processing_time)


def ScaleFactor(all_geometries):
    b_list = []
    
    for geometries in all_geometries:
        
        polygon = geometry.Polygon(geometries)
        centroid = np.array(polygon.centroid)
        coordinates = np.vstack(geometries)
        
        b = coordinates - centroid
        b_min = np.min(b)
        b_max = np.max(b)
        b_list.append(b_min)
        b_list.append(b_max)
        
    return np.std(b_list)
    
def Normalize_Geometry(coordinates1, scale_factor):
    polygon = geometry.Polygon(coordinates1)
    centroid = np.array(polygon.centroid)
    coordinates2 = np.vstack(coordinates1)
    
    return (coordinates2 - centroid) / scale_factor

def Add_One_Hot(normalized_geometry):
    normalized_geometry = np.insert(normalized_geometry, 2, 1, axis=1)
    normalized_geometry = np.insert(normalized_geometry, 3, 0, axis=1)
    normalized_geometry = np.insert(normalized_geometry, 4, 0, axis=1)
    normalized_geometry[len(normalized_geometry)-1,2] = 0
    normalized_geometry[len(normalized_geometry)-1,4] = 1
    
    return normalized_geometry

def Add_Zero_Padding(one_hotted_geometry, max_length):
    boundary = max_length - len(one_hotted_geometry)
    zero_matrix = np.zeros([boundary,len(one_hotted_geometry[0])])
    return np.append(one_hotted_geometry, zero_matrix, axis=0)

def moment(xy, p, q):
    xy = np.asarray(xy)
    x = xy[:, 0]
    y = xy[:, 1]
    x = (x**p) * (x != 0)
    y = (y**q) * (y != 0)
    M = (x * y).sum(-1)
    return torch.tensor(M)

def c_mass(xy):
    xy = np.asarray(xy)
    mass = moment(xy, 0, 0)
    mx = moment(xy, 1, 0) / mass
    my = moment(xy, 0, 1) / mass
    return [mx,my]

def mu(xy, p, q):
    xy = np.asarray(xy)
    m = c_mass(xy)
    x = xy[:, 0]
    y = xy[:, 1]
    x = ((x - m[0])**p) * (x != 0)
    y = ((y - m[1])**q) * (y != 0)
    M = (x * y).sum(-1)
    return M

def scale_factor_calculation(xy):
    mu_list = [mu(i,0,0) for i in xy]
    return sum(mu_list) / len(mu_list)
        

def scale_factor_apply(xy):
    xy = torch.Tensor(xy)
    return torch.sqrt((moment(xy,2,0) + moment(xy, 0, 2))/10000000000)

def canonical_transformation(xy):
#   translation
    xy = torch.Tensor(xy)
    m = torch.Tensor(c_mass(xy))
    x = xy - m.view(1, 2) * (xy[:, 0] != 0).view(-1, 1)

#   scale
    scale = scale_factor_apply(xy)
    x = x / scale
    
    
#   rotation
    m_20 = moment(x, 2, 0)
    m_02 = moment(x, 0, 2)
    m_11 = moment(x, 1, 1)
    
    angle = np.arctan2(2 * m_11, m_20 - m_02) / 2.0
#     return angle
    if angle < 0:
        angle = np.pi + angle # this is a bad solution
        # we need to analyze m_30, m_21, m_12, m_03 to check for flip symmetry
    #print(angle*180/np.pi)
    
    M = torch.Tensor([
        [np.cos(angle), np.sin(angle)],
        [np.sin(-angle), np.cos(angle)]
    ])
    x = (M @ x.T).T[None]
    
    
    return x

def getAngle(a, b, c):
    ang = math.degrees(math.atan2(c[1]-b[1], c[0]-b[0]) - math.atan2(a[1]-b[1], a[0]-b[0]))
    return ang + 360 if ang < 0 else ang

def polygon_properties(xy):
    length = geometry.Polygon(xy).length
    points = len(xy)
    
    b=1
    points_distance = []
    for coord in xy[:-2]:
        points_distance.append(geometry.LineString([coord,xy[b]]).length)
        b += 1 
    points_distance = pd.DataFrame(points_distance)
        
    b=1
    c=2
    angles = []
    for coord in xy[:-3]:
        angles.append(getAngle(coord, xy[b], xy[c]))
        b+=1
        c+=1
    angles = pd.DataFrame(angles)
    
    # [number of points, length, average PD, std PD, min PD, max PD, average angle, std angle, min angle, max angle]
    return [points, length, points_distance.describe()[0][1], points_distance.describe()[0][2],
           points_distance.describe()[0][3], points_distance.describe()[0][7], angles.describe()[0][1], 
            angles.describe()[0][2], angles.describe()[0][3], angles.describe()[0][7]]

ScoreFormula(50,25,0.2)

0.4

In [191]:
def CreateGrid(poly, dx, dy):
    
    x_ls = []
    y_ls = []

    for a in poly:
        x_ls.append(a[0])
    for a in poly:
        y_ls.append(a[1])
        
    minx = min(x_ls)
    maxx = max(x_ls)
    miny = min(y_ls)
    maxy = max(y_ls)

    nx = int(math.ceil(abs(maxx - minx)/dx))
    ny = int(math.ceil(abs(maxy - miny)/dy))

    grid = []       
    for i in range(ny):   
        grid.append(geometry.LineString([[minx,max(maxy-dy*i,miny)], [maxx, max(maxy-dy*i,miny)]]))

    for j in range(nx):
        grid.append(geometry.LineString([[min(minx+dx*j,maxx), maxy], [min(minx+dx*j,maxx), miny]]))
    
    return grid
    
def CheckSameIntersections(poly, simplified_coords, grid, ROUNDING):
    
    original = geometry.Polygon(poly)
    simplified = geometry.Polygon(simplified_coords)

    o_ls = []
    s_ls = []
    for line in grid:
        x = original.intersection(line)
        y = simplified.intersection(line)
        if x:
            if x.geom_type == 'Point':
                o_ls.append(hash(tuple([round(x.coords[0][0],ROUNDING), round(x.coords[0][1],ROUNDING)])))
            if x.geom_type == 'LineString':
                for xy in x.coords:
                    o_ls.append(hash(tuple([round(xy[0],ROUNDING), round(xy[1],ROUNDING)])))
    
        if y:
            if y.geom_type == 'Point':
                s_ls.append(hash(tuple([round(y.coords[0][0],ROUNDING), round(y.coords[0][1],ROUNDING)])))
            if y.geom_type == 'LineString':
                for xy in y.coords:
                    s_ls.append(hash(tuple([round(xy[0],ROUNDING), round(xy[1],ROUNDING)])))
        
    return len(list(set(o_ls).intersection(s_ls))) / len(set(o_ls))

    
def alter_by_zoom(poly, zoom):

    mpp = {
    '0' : 156543,
    '1' : 78271.5,
    '2' : 39135.8,
    '3' : 19567.88,
    '4' : 9783.94,
    '5' : 4891.97,
    '6' : 2445.98,
    '7' : 1222.99,
    '8' : 611.5,
    '9' : 305.75,
    '10' : 152.87,
    '11' : 76.44,
    '12' : 38.219,
    '13' : 19.109,
    '14' : 9.555,
    '15' : 4.777,
    '16' : 2.3887,
    '17' : 1.1943,
    '18' : 0.5972,
    '19' : 0.2986,
    '20' : 0.14929,
    '21' : 0.074646,
    '22' : 0.037323
    }
    return (np.array(poly) / mpp[str(zoom)]).tolist()


def check_pixel_similarity(original_coords, simplified_coords, zoom):
    
    poly1 = alter_by_zoom(original_coords, zoom)
    poly2 = alter_by_zoom(simplified_coords, zoom)

    x = []
    y = []
    for a in poly1:
        x.append(a[0])
        y.append(a[1])
    
    for a in poly1:
        a[0] = a[0] - min(x)
        a[1] = a[1] - min(y)
    
    for a in poly2:
        a[0] = a[0] - min(x)
        a[1] = a[1] - min(y)
    
    width = int(max(x) - min(x))
    height = int(max(y) - min(y))

    poly1 = [tuple(x) for x in poly1]
    poly2 = [tuple(x) for x in poly2]

    img1 = Image.new('L', (width, height), 0)
    ImageDraw.Draw(img1).polygon(poly1, outline=1, fill=0)
    mask1 = np.array(img1)
    
    img2 = Image.new('L', (width, height), 0)
    ImageDraw.Draw(img2).polygon(poly2, outline=1, fill=0)
    mask2 = np.array(img2)
    
    return np.sum(mask1 == mask2) / (width*height)
    #return mask1

# Load Data

In [192]:
path = '/Users/davemeijdam/Documents/Data Science/Master/Master Thesis/Data/Sample_data_03_05/'
Polygons = []
Types = []

for filename in os.listdir(path):
    if "geometrie." in filename:
        print(filename)
        
        f = open(str(path + filename))
        jsondata = json.load(f)
        
        

        for a in jsondata['features']:
            if len(a['geometry']['coordinates']) == 1:
                Polygons.append(a['geometry']['coordinates'][0])
                Types.append(a['geometry']['type'])
            if a['geometry']['type'] == 'LineString':
                Polygons.append(a['geometry']['coordinates'])
                Types.append(a['geometry']['type'])
            else:
                for b in a['geometry']['coordinates']:
                    Polygons.append(b)
                    Types.append(a['geometry']['type'])
            
geometry_df = pd.DataFrame({'geometry':Polygons,
                            'type':Types})
    
    
    

#f = open('/Users/davemeijdam/Documents/Data Science/Master/Master Thesis/Data/Sample_data_03_05/spoor_export_buitengebied_geometrie.json')
#wegdeeljson = json.load(f)
#wegdeeljson




waterdeel_export_stedelijk_geometrie.json
wegdeel_export_buitengebied_geometrie.json
bag_pand_buitengebeid_export_geometrie.json
wegdeel_export_stedelijk_geometrie.json
spoor_export_stedelijk_geometrie.json
waterdeel_export_buitengebied_geometrie.json
bag_pand_stedelijk_export_geometrie.json
spoor_export_buitengebied_geometrie.json


# Parameters

In [195]:
# Simplification Possibilities
simplify_possibilities = [['D-P', 0], ['D-P', 0.5], ['D-P', 0.1], ['D-P', 0.05], ['D-P', 0.01], ['D-P', 0.005], 
                          ['D-P', 0.001], ['V-W', 0.5], ['V-W', 0.1], ['V-W', 0.05], ['V-W', 0.01], 
                          ['V-W', 0.005]]

#simplify_possibilities = [['D-P', 0], ['D-P', 0.5], ['D-P', 0.1], ['D-P', 0.05], ['D-P', 0.01], ['D-P', 0.005], 
#                          ['D-P', 0.001], ['V-W', 0.5], ['V-W', 0.1], ['V-W', 0.05], ['V-W', 0.01], 
#                          ['V-W', 0.005], ['V-W', 0.001], ['V-W', 0.0005], ['V-W', 0.0001], ['V-W', 0.00005]]

# Polygon length evaluation
MAX_LENGTH_DEFICIT = -0.1

# Grid
dx = 1
dy = 1
ROUNDING = 1

MIN_INTERSECTIONS_PERC = 0.75

len(simplify_possibilities)

12

# Pre Processing

In [196]:
Polygons = list(geometry_df['geometry'][geometry_df['type'] == 'Polygon'])
Lines = list(geometry_df['geometry'][geometry_df['type'] == 'LineString'])

Polygons_list = []
for element in Polygons:
    if len(element) < 100:
        Polygons_list.append(element)
Polygons = Polygons_list
len(Polygons)
#len(Lines)

294580

In [198]:
results_list = []
length_list = []
Polygons_sample = random.sample(Polygons, 250000)
#scale_factor = scale_factor_calculation(Polygons_sample)
print("Scale Factor done")


# Decide order from longest polygon to smallest polygon
for row in Polygons_sample:

    length_list.append([row, len(row)])

length_list.sort(key=operator.itemgetter(1), reverse=True)
print("Sorted the Polygons")
a=0

y_ls = []
    
for element in length_list:
    print(str(a) + " / " + str(len(length_list)), end="\r")
    a = a + 1
    results_dict = {}
    poly1 = geometry.Polygon(element[0])
    results = []
    process_time_tensor = torch.zeros(len(simplify_possibilities)+1)
    datasize_tensor = torch.zeros(len(simplify_possibilities)+1)
    variance_penalty_tensor = torch.ones(len(simplify_possibilities)+1)
    
    i=0
    for possibility in simplify_possibilities:
        

        if possibility[0] == 'D-P':
            # Simplification function Douglas-Peucker
            time_start = time()
            simplified_coordinates = simplify_coords(element[0], possibility[1])
            time_end = time()
            process_time = time_end - time_start

        if possibility[0] == 'V-W':
            # Simplification function Visvalingam-Whyatt
            time_start = time()
            simplified_coordinates = simplify_coords_vw(element[0], possibility[1])
            time_end = time()
            process_time = time_end - time_start
            
        process_time_tensor[i] = torch.tensor(process_time * 1000)
        datasize_tensor[i] = torch.tensor(len(simplified_coordinates) / len(element[0]))
        
        
        if len(simplified_coordinates) >= 3:
            poly2 = geometry.Polygon(simplified_coordinates)
            
            if np.isnan(check_pixel_similarity(element[0], simplified_coordinates, 17)) == True:
                results.append('Remove')
                variance_penalty_tensor[len(simplify_possibilities)] = torch.tensor(0)
                
                
            if check_pixel_similarity(element[0], simplified_coordinates, 17) == 1:
                score = ScoreFormula(len(element[0]), len(simplified_coordinates), process_time)
                #results.append(score)
                dicti = {"i": i, "score": score}
                results.append(dicti)
                variance_penalty_tensor[i] = torch.tensor(0)
        
        
        i = i + 1
    y_tensor = torch.Tensor(process_time_tensor * datasize_tensor + variance_penalty_tensor)
    y_ls.append(y_tensor)
    
    #results_dict['polygon'] = Add_Zero_Padding(element[0], len(length_list[0][0]))
    results_dict['polygon'] = Add_Zero_Padding(canonical_transformation(element[0])[0], len(length_list[0][0]))
    results_dict['properties'] = polygon_properties(element[0])
    
    if results[0] == 'Remove':
        results_dict['algorithm_top1'] = len(simplify_possibilities)
        
    else:
        if len(results) >= 1:
        
            results_df = pd.DataFrame(results).sort_values('score', ascending = False)
            results_dict['algorithm_top1'] = results_df['i'].iloc[0]
        
        if len(results) >= 3:
            results_df = pd.DataFrame(results).sort_values('score', ascending = False)
            results_dict['algorithm_top3'] = list(results_df['i'][0:3])
        
        if len(results) >= 5:
            results_df = pd.DataFrame(results).sort_values('score', ascending = False)
            results_dict['algorithm_top5'] = list(results_df['i'][0:5])
    
    results_dict['algorithm_all'] = results_df['i']

    results_dict['original_geom'] = element[0]
        
    results_list.append(results_dict)

Scale Factor done
Sorted the Polygons
6189 / 250000



249999 / 250000

# Save Results

In [199]:
# Saving Labels and Normalized Data
#pickle.dump( results_list, open( "/Users/davemeijdam/Documents/Data Science/Master/Master Thesis/Scripts/data/temp/results_list_NoNorm.p", "wb" ) )

#pickle.dump( results_list, open( "/Users/davemeijdam/Documents/Data Science/Master/Master Thesis/Scripts/data/temp/results_list.p", "wb" ) )
#pickle.dump( results_list, open( "/Users/davemeijdam/Documents/Data Science/Master/Master Thesis/Scripts/data/temp/results_list2.p", "wb" ) )
pickle.dump( results_list, open( "/Users/davemeijdam/Documents/Data Science/Master/Master Thesis/Scripts/data/temp/results_list3.p", "wb" ) )

# Data Balancing

In [200]:
results_list = pickle.load( open( "/Users/davemeijdam/Documents/Data Science/Master/Master Thesis/Scripts/data/temp/results_list3.p", "rb" ) )

results_df = pd.DataFrame(results_list)

results_df['algorithm_top1'].value_counts()

0     165731
8      17352
7      16539
10     11593
9       8881
1       8774
11      7056
2       4226
3       3292
4       2352
6       1859
5       1537
12       808
Name: algorithm_top1, dtype: int64

In [201]:
for index, row in results_df.iterrows():
    if isinstance(row['algorithm_top5'], list) == False:
        if isinstance(row['algorithm_top3'], list) == False:
            results_df.at[index,'algorithm_top5'] = row['algorithm_top1']
        else:
            results_df.at[index,'algorithm_top5'] = row['algorithm_top3']
            
    if isinstance(row['algorithm_top3'], list) == False:
        results_df.at[index,'algorithm_top3'] = row['algorithm_top1']
    
    results_df.at[index, 'algorithm_all'] = list(results_df['algorithm_all'][index])
    
results_df
    

Unnamed: 0,polygon,properties,algorithm_top1,algorithm_top3,algorithm_all,original_geom,algorithm_top5
0,"[[-0.0873810425400734, 0.06564639508724213], [...","[99, 615.3124009639364, 6.2862951513134515, 11...",11,"[11, 6, 0]","[11, 6, 0]","[[569212.006392023, 6817744.41890897], [569221...","[11, 6, 0]"
1,"[[0.0726815015077591, 0.030753642320632935], [...","[99, 621.8473441812531, 6.390577741162117, 8.7...",0,0,[0],"[[567001.947009817, 6813423.48716989], [567003...",0
2,"[[-0.008565051481127739, -0.006880257744342089...","[99, 407.7399565199074, 4.178463394979179, 11....",0,0,[0],"[[571569.725229183, 6814491.43055043], [571568...",0
3,"[[-0.14224334061145782, 0.013581816107034683],...","[99, 1368.3399200611218, 13.9689776248775, 17....",0,0,[0],"[[511604.226279186, 6793164.49240385], [511606...",0
4,"[[0.0023038601502776146, 0.00545631954446435],...","[99, 587.1522959443573, 5.615763266535105, 4.7...",0,0,[0],"[[512670.53379004, 6788978.16589243], [512670....",0
...,...,...,...,...,...,...,...
249995,"[[-0.014310438185930252, 0.0030228430405259132...","[4, 11.52625317168625, 3.343070463600278, 0.09...",0,"[0, 1, 2]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]","[[567728.230127489, 6816773.19594685], [567729...","[0, 1, 2, 3, 4]"
249996,"[[0.010053538717329502, 0.004469005391001701],...","[4, 11.523363234321945, 4.08658696919464, 1.02...",0,"[0, 1, 2]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]","[[567680.278299499, 6816931.16778507], [567681...","[0, 1, 2, 3, 4]"
249997,"[[-0.010315812192857265, -0.004728611558675766...","[4, 11.721705044877638, 4.141911081705699, 1.0...",0,"[0, 1, 2]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]","[[567760.824596053, 6816701.11418839], [567759...","[0, 1, 2, 3, 4]"
249998,"[[0.0018060868605971336, -0.01866050437092781]...","[4, 26.49451881314417, 9.208097229569486, 2.39...",0,"[0, 1, 2]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]","[[592573.660072807, 6742824.04699339], [592566...","[0, 1, 2, 3, 4]"


In [202]:
df_majority = results_df[results_df.algorithm_top1 == 0]
df_minority = results_df[results_df.algorithm_top1 != 0]

df_majority_downsampled = resample(df_majority,
                                  replace=False,
                                  n_samples=5000,
                                   #n_samples=500,
                                  random_state=123)

df_downsampled = pd.concat([df_majority_downsampled, df_minority])

df_majority = df_downsampled[df_downsampled.algorithm_top1 == 7]
df_minority = df_downsampled[df_downsampled.algorithm_top1 != 7]

df_majority_downsampled = resample(df_majority,
                                  replace=False,
                                  n_samples=5000,
                                   #n_samples=500,
                                  random_state=123)

df_downsampled = pd.concat([df_majority_downsampled, df_minority])

df_majority = df_downsampled[df_downsampled.algorithm_top1 == 8]
df_minority = df_downsampled[df_downsampled.algorithm_top1 != 8]

df_majority_downsampled = resample(df_majority,
                                  replace=False,
                                  n_samples=5000,
                                   #n_samples=500,
                                  random_state=123)

df_downsampled = pd.concat([df_majority_downsampled, df_minority])

df_majority = df_downsampled[df_downsampled.algorithm_top1 == 10]
df_minority = df_downsampled[df_downsampled.algorithm_top1 != 10]

df_majority_downsampled = resample(df_majority,
                                  replace=False,
                                  n_samples=5000,
                                   #n_samples=500,
                                  random_state=123)

df_downsampled = pd.concat([df_majority_downsampled, df_minority])

df_majority = df_downsampled[df_downsampled.algorithm_top1 == 9]
df_minority = df_downsampled[df_downsampled.algorithm_top1 != 9]

df_majority_downsampled = resample(df_majority,
                                  replace=False,
                                  n_samples=5000,
                                  # n_samples=500,
                                  random_state=123)

df_downsampled = pd.concat([df_majority_downsampled, df_minority])

df_majority = df_downsampled[df_downsampled.algorithm_top1 == 1]
df_minority = df_downsampled[df_downsampled.algorithm_top1 != 1]

df_majority_downsampled = resample(df_majority,
                                  replace=False,
                                  n_samples=5000,
                                  # n_samples=500,
                                  random_state=123)

df_downsampled = pd.concat([df_majority_downsampled, df_minority])

results_list = df_downsampled.to_dict('records')

df_downsampled['algorithm_top1'].value_counts()

11    7056
10    5000
9     5000
8     5000
7     5000
1     5000
0     5000
2     4226
3     3292
4     2352
6     1859
5     1537
12     808
Name: algorithm_top1, dtype: int64

In [203]:
def make_one_hot(y,length):
    output = []
    for i in y:
        array = np.zeros(length)
        if isinstance(i, int) == False:
            for j in i:
                array[j] = 1
        else:
            array[i] = 1
                
        output.append(array)
    return np.array(output)

def make_one_one_hot(y,length):
    array = np.zeros(length)
    array[y] = 1
    return array

## Data processing top1

In [57]:
X = []
y = []
for element in results_list:
    
    X.append(element['polygon'])
    y.append(element['algorithm_top1'])
X = np.array(X)
y = np.array(y)
y = np.nan_to_num(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

y_onehottrain = to_categorical(y_train)
train_tensor = []
b=0
for a in X_train:
    d = y_onehottrain[b]
    c = y_train[b]
    train_tensor.append([a,c,d]) 
    b = b+1

y_onehottest = to_categorical(y_test)
test_tensor = []
b=0
for a in X_test:
    d = y_onehottest[b]
    c = y_test[b]
    test_tensor.append([a,c,d])
    b = b+1

## Data processing top3

In [67]:
X = []
y = []
for element in results_list:
    
    X.append(element['polygon'])
    y.append(element['algorithm_top3'])
X = np.array(X)
y = np.array(y)
y = np.nan_to_num(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

y_onehottrain = make_one_hot(y_train, 13)
train_tensor = []
b=0
for a in X_train:
    d = y_onehottrain[b]
    if isinstance(y_train[b], list) == True:
        c = y_train[b][0]
    else:
        c = y_train[b]
    train_tensor.append([a,c,d]) 
    b = b+1

y_onehottest = make_one_hot(y_test, 13)
test_tensor = []
b=0
for a in X_test:
    d = y_onehottest[b]
    if isinstance(y_test[b], list) == True:
        c = y_test[b][0]
    else:
        c = y_test[b]
    test_tensor.append([a,c,d])
    b = b+1

## Data processing top5

In [73]:
X = []
y = []
for element in results_list:
    
    X.append(element['polygon'])
    y.append(element['algorithm_top5'])
X = np.array(X)
y = np.array(y)
y = np.nan_to_num(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

y_onehottrain = make_one_hot(y_train, 13)
train_tensor = []
b=0
for a in X_train:
    d = y_onehottrain[b]
    if isinstance(y_train[b], list) == True:
        c = y_train[b][0]
    else:
        c = y_train[b]
    train_tensor.append([a,c,d]) 
    b = b+1

y_onehottest = make_one_hot(y_test, 13)
test_tensor = []
b=0
for a in X_test:
    d = y_onehottest[b]
    if isinstance(y_test[b], list) == True:
        c = y_test[b][0]
    else:
        c = y_test[b]
    test_tensor.append([a,c,d])
    b = b+1

## Data processing Approach 3

In [51]:
pre_X = []

for element in results_list:
    pre_X.append(element['properties'])
pre_X = np.array(pre_X)
pre_X = np.nan_to_num(pre_X)   

a=0
for a in range(pre_X.shape[1]):
    pre_X[:,a] = (pre_X[:,a] - pre_X[:,a].mean()) / pre_X[:,a].std()


X = []
y = []
a=0
for element in results_list:
    
    X.append([element['polygon'], pre_X[a]])
    y.append(element['algorithm_top5'])
    a+=1
X = np.array(X)
X = np.nan_to_num(X)
y = np.array(y)
y = np.nan_to_num(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

y_onehottrain = make_one_hot(y_train, 13)
train_tensor = []
b=0
for a in X_train:
    d = y_onehottrain[b]
    if isinstance(y_train[b], list) == True:
        c = y_train[b][0]
    else:
        c = y_train[b]
    train_tensor.append([a[0], a[1],c,d]) 
    b = b+1

y_onehottest = make_one_hot(y_test, 13)
test_tensor = []
b=0
for a in X_test:
    d = y_onehottest[b]
    if isinstance(y_test[b], list) == True:
        c = y_test[b][0]
    else:
        c = y_test[b]
    test_tensor.append([a[0],a[1],c,d])
    b = b+1

## Check percentage of simplifications that exceeds visual variance condition

In [204]:
pre_X = []

for element in results_list:
    pre_X.append(element['properties'])
pre_X = np.array(pre_X)
pre_X = np.nan_to_num(pre_X)   

a=0
for a in range(pre_X.shape[1]):
    pre_X[:,a] = (pre_X[:,a] - pre_X[:,a].mean()) / pre_X[:,a].std()


X = []
y = []
a=0
for element in results_list:
    
    X.append([element['polygon'], pre_X[a], element['original_geom']])
    y.append(element['algorithm_all'])
    a+=1
X = np.array(X)
X = np.nan_to_num(X)
y = np.array(y)
y = np.nan_to_num(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

y_onehottrain = make_one_hot(y_train, 13)
train_tensor = []
b=0
for a in X_train:
    d = y_onehottrain[b]
    if isinstance(y_train[b], list) == True:
        c = y_train[b][0]
    else:
        c = y_train[b]
    train_tensor.append([a[0], a[1], a[2], c,d]) 
    b = b+1

y_onehottest = make_one_hot(y_test, 13)
test_tensor = []
b=0
for a in X_test:
    d = y_onehottest[b]
    if isinstance(y_test[b], list) == True:
        c = y_test[b][0]
    else:
        c = y_test[b]
    test_tensor.append([a[0],a[1], a[2],c,d])
    b = b+1

# Data Stats

In [None]:
pd.DataFrame({'keys':list(Counter(y_test).keys()),
              'freq':list(Counter(y_test).values())})

# PyTorch

In [74]:
class MNIST_Polygon(Dataset):
    
    def __init__(self, tensor, transform=None):
        data = [x for x, y, z in tensor]
        data = np.asarray(data)
        self.data = torch.reshape(torch.from_numpy(data).float(), (data.shape[0], data.shape[1]*data.shape[2]))
        self.targets = [torch.tensor(y).long() for x, y, z in tensor]
        self.onehot = [z for x, y, z in tensor]
                
    def __getitem__(self, index):
        x = self.data[index]
        y = self.targets[index]
        return x, y
    
    def __len__(self):
        return len(self.data)
    
MNIST_Polygon(train_tensor).data.shape

torch.Size([46693, 198])

In [75]:
BATCH_SIZE = 64
TEST_BATCH_SIZE = 64
EPOCHS = 50
LR = 0.002
LOG_INTERVAL = 10
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(198, 256, bias=False),
            #nn.BatchNorm1d(256),
            nn.ReLU(True),
            nn.Linear(256, 128, bias=False), 
            #nn.BatchNorm1d(128),
            nn.ReLU(True),
            nn.Linear(128, 13),
            nn.Softmax(1)
        )

    def forward(self, x):
        #x = self.conv(x)
        #x = torch.flatten(x, 1)
        return self.net(x)

In [76]:
def train(model, device, train_loader, optimizer, epoch):
    model.train()
    criterion = nn.CrossEntropyLoss()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % LOG_INTERVAL == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()), end="\r")


def test(model, device, test_loader, results_list, epoch):
    model.eval()
    test_loss = 0
    correct = 0
    a = 0
    real = dataset2.onehot
    criterion = nn.CrossEntropyLoss(reduction='sum')
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            #correct += pred.eq(target.view_as(pred)).sum().item()
            for i in pred:
                correct += real[a][i]
                a+=1
            

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.1f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    
    results_list.append([epoch, correct / len(test_loader.dataset), test_loss])

In [77]:
dataset1 = MNIST_Polygon(train_tensor)
dataset2 = MNIST_Polygon(test_tensor)
train_loader = torch.utils.data.DataLoader(dataset1,batch_size=BATCH_SIZE, shuffle = True)
test_loader = torch.utils.data.DataLoader(dataset2, batch_size=TEST_BATCH_SIZE)

model = Net().to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LR)
results_list = []

for epoch in range(1, EPOCHS + 1):
    train(model, DEVICE, train_loader, optimizer, epoch)
    test(model, DEVICE, test_loader, results_list, epoch)
    
        

Test set: Average loss: 2.4779, Accuracy: 2387.0/5189 (46.0%)

Test set: Average loss: 2.4487, Accuracy: 2471.0/5189 (47.6%)

Test set: Average loss: 2.4452, Accuracy: 2492.0/5189 (48.0%)

Test set: Average loss: 2.4428, Accuracy: 2485.0/5189 (47.9%)

Test set: Average loss: 2.4386, Accuracy: 2477.0/5189 (47.7%)

Test set: Average loss: 2.4391, Accuracy: 2493.0/5189 (48.0%)

Test set: Average loss: 2.4336, Accuracy: 2502.0/5189 (48.2%)

Test set: Average loss: 2.4430, Accuracy: 2428.0/5189 (46.8%)

Test set: Average loss: 2.4321, Accuracy: 2489.0/5189 (48.0%)

Test set: Average loss: 2.4374, Accuracy: 2515.0/5189 (48.5%)

Test set: Average loss: 2.4310, Accuracy: 2514.0/5189 (48.4%)

Test set: Average loss: 2.4302, Accuracy: 2555.0/5189 (49.2%)

Test set: Average loss: 2.4313, Accuracy: 2556.0/5189 (49.3%)

Test set: Average loss: 2.4290, Accuracy: 2556.0/5189 (49.3%)

Test set: Average loss: 2.4354, Accuracy: 2559.0/5189 (49.3%)

Test set: Average loss: 2.4287, Accuracy: 2536.0/5189 (

In [None]:
results_df = pd.DataFrame(results_list)
results_df.columns = ['epoch', 'accuracy', 'loss']
results_df

In [95]:
X = []
y = []
a = 0
for element in results_list:
    
    X.append(element['properties'])
    y.append(element['algorithm_top5'])
X = np.array(X)
y = np.array(y)
X = np.nan_to_num(X)
y = np.nan_to_num(y)

a=0
#for a in range(X.shape[1]):
#    X[:,a] = (X[:,a] - X[:,a].mean()) / X[:,a].std()
print(X.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

y_onehottrain = make_one_hot(y_train, 13)
#y_onehottrain = to_categorical(y_train)
train_tensor = []
b=0
for a in X_train:
    d = y_onehottrain[b]
    if isinstance(y_train[b], list) == True:
        c = y_train[b][0]
    else:
        c = y_train[b]
    train_tensor.append([a,c,d]) 
    b = b+1

y_onehottest = make_one_hot(y_test, 13)
#y_onehottest = to_categorical(y_test)
test_tensor = []
b=0
for a in X_test:
    d = y_onehottest[b]
    if isinstance(y_test[b], list) == True:
        c = y_test[b][0]
    else:
        c = y_test[b]
    test_tensor.append([a,c,d])
    b = b+1

(51882, 10)


In [96]:
class MNIST_Polygon2(Dataset):
    
    def __init__(self, tensor, transform=None):
        data = [x for x, y, z in tensor]
        self.data = torch.Tensor(data).float()
        #self.data = torch.reshape(torch.from_numpy(data).float(), (data.shape[0], data.shape[1]*data.shape[2]))
        self.targets = [torch.tensor(y).long() for x, y, z in tensor]
        self.onehot = [z for x, y, z in tensor]
                
    def __getitem__(self, index):
        x = self.data[index]
        y = self.targets[index]
        return x, y
    
    def __len__(self):
        return len(self.data)
    
MNIST_Polygon2(train_tensor).data.shape

torch.Size([46693, 10])

In [97]:
BATCH_SIZE = 64
TEST_BATCH_SIZE = 64
EPOCHS = 50
LR = 0.002
LOG_INTERVAL = 10
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(10, 64, bias=False),
            nn.BatchNorm1d(64),
            nn.ReLU(True),
            nn.Linear(64, 32, bias=False), 
            nn.BatchNorm1d(32),
            nn.ReLU(True),
            nn.Linear(32, 13),
            nn.Softmax(1)
        )

    def forward(self, x):
        #x = self.conv(x)
        #x = torch.flatten(x, 1)
        return self.net(x)

In [98]:
def train(model, device, train_loader, optimizer, epoch):
    model.train()
    criterion = nn.CrossEntropyLoss()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % LOG_INTERVAL == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()), end="\r")


def test(model, device, test_loader, results_list, epoch):
    model.eval()
    test_loss = 0
    correct = 0
    a = 0
    real = dataset2.onehot
    criterion = nn.CrossEntropyLoss(reduction='sum')
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            #correct += pred.eq(target.view_as(pred)).sum().item()
            for i in pred:
                correct += real[a][i]
                a+=1
            

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.1f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    
    results_list.append([epoch, correct / len(test_loader.dataset), test_loss])

In [99]:
dataset1 = MNIST_Polygon2(train_tensor)
dataset2 = MNIST_Polygon2(test_tensor)
train_loader = torch.utils.data.DataLoader(dataset1,batch_size=BATCH_SIZE, shuffle = True)
test_loader = torch.utils.data.DataLoader(dataset2, batch_size=TEST_BATCH_SIZE)

model = Net().to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LR)
results_list = []

for epoch in range(1, EPOCHS + 1):
    train(model, DEVICE, train_loader, optimizer, epoch)
    test(model, DEVICE, test_loader, results_list, epoch)
        

Test set: Average loss: 2.4243, Accuracy: 2683.0/5189 (51.7%)

Test set: Average loss: 2.4184, Accuracy: 2660.0/5189 (51.3%)

Test set: Average loss: 2.4300, Accuracy: 2765.0/5189 (53.3%)

Test set: Average loss: 2.4115, Accuracy: 2719.0/5189 (52.4%)

Test set: Average loss: 2.4149, Accuracy: 2757.0/5189 (53.1%)

Test set: Average loss: 2.4082, Accuracy: 2700.0/5189 (52.0%)

Test set: Average loss: 2.4063, Accuracy: 2778.0/5189 (53.5%)

Test set: Average loss: 2.4071, Accuracy: 2813.0/5189 (54.2%)

Test set: Average loss: 2.4106, Accuracy: 2705.0/5189 (52.1%)

Test set: Average loss: 2.4074, Accuracy: 2763.0/5189 (53.2%)

Test set: Average loss: 2.4019, Accuracy: 2773.0/5189 (53.4%)

Test set: Average loss: 2.4115, Accuracy: 2811.0/5189 (54.2%)

Test set: Average loss: 2.4032, Accuracy: 2808.0/5189 (54.1%)

Test set: Average loss: 2.4040, Accuracy: 2825.0/5189 (54.4%)

Test set: Average loss: 2.4096, Accuracy: 2862.0/5189 (55.2%)

Test set: Average loss: 2.4012, Accuracy: 2786.0/5189 (

In [None]:
BATCH_SIZE = 64
TEST_BATCH_SIZE = 64
EPOCHS = 14
LR = 0.002
LOG_INTERVAL = 10
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(10, 64, bias=False),
            nn.BatchNorm1d(64),
            nn.ReLU(True),
            nn.Linear(64, 32, bias=False), 
            nn.BatchNorm1d(32),
            nn.ReLU(True),
            nn.Linear(32, len(y_onehot[0])),
            nn.Softmax(1)
        )

    def forward(self, x):
        #x = self.conv(x)
        #x = torch.flatten(x, 1)
        return self.net(x)

In [None]:
def train(model, device, train_loader, optimizer, epoch):
    model.train()
    criterion = nn.CrossEntropyLoss()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % LOG_INTERVAL == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()), end="\r")


def test(model, device, test_loader, results_list, epoch):
    model.eval()
    test_loss = 0
    correct = 0
    criterion = nn.CrossEntropyLoss(reduction='sum')
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.1f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    
    results_list.append([epoch, correct / len(test_loader.dataset), test_loss])

In [None]:
dataset1 = MNIST_Polygon2(train_tensor)
dataset2 = MNIST_Polygon2(test_tensor)
train_loader = torch.utils.data.DataLoader(dataset1,batch_size=BATCH_SIZE, shuffle = True)
test_loader = torch.utils.data.DataLoader(dataset2, batch_size=TEST_BATCH_SIZE, shuffle = True)

model = Net().to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LR)
results_list = []

for epoch in range(1, EPOCHS + 1):
    train(model, DEVICE, train_loader, optimizer, epoch)
    test(model, DEVICE, test_loader, results_list, epoch)
        

## Approach 3

In [213]:
class MNIST_Polygon3(Dataset):
    
    def __init__(self, tensor, transform=None):
        data1 = [x1 for x1, x2, x3, y, z in tensor]
        data1 = np.asarray(data1)
        self.data1 = torch.reshape(torch.from_numpy(data1).float(), (data1.shape[0], data1.shape[1]*data1.shape[2]))
        data2 = [x2 for x1, x2, x3, y, z in tensor]
        self.data2 = torch.Tensor(data2).float()
        self.data3 = np.asarray([x3 for x1, x2, x3, y, z in tensor])
        self.targets = torch.Tensor([torch.tensor(y).long() for x1, x2, x3, y, z in tensor])
        self.onehot = [z for x1, x2, x3, y, z in tensor]
                
    def __getitem__(self, index):
        x1 = self.data1[index]
        x2 = self.data2[index]
        y = self.targets[index]
        return x1, x2, y
    
    def __len__(self):
        return len(self.data1)
    
MNIST_Polygon3(train_tensor).data1.shape

torch.Size([46017, 198])

In [236]:
BATCH_SIZE = 64
TEST_BATCH_SIZE = 5113
EPOCHS = 10
LR = 0.002
LOG_INTERVAL = 10
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

class SymmetryPlusData(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.poly_net = nn.Sequential(
            nn.Linear(198, 256, bias=False),
            #nn.BatchNorm1d(256),
            nn.ReLU(True),
            nn.Linear(256, 128, bias=False), 
        )
        
        
        self.feature_net = nn.Sequential(
            nn.Linear(10, 64, bias=False),
            #nn.BatchNorm1d(64),
            nn.ReLU(True),
            nn.Linear(64, 32, bias=False), 
        )
        
        self.classifier = nn.Sequential(
            #nn.BatchNorm1d(160),
            nn.ReLU(True),
            nn.Linear(160, 64),
            #nn.BatchNorm1d(64),
            nn.ReLU(True),
            nn.Linear(64, 13),
            nn.Softmax(1)
        )
        
    def forward(self, poly, features):
        #print("poly: " + str(poly))
        #print("features: " + str(features))
        y1 = self.poly_net(poly)
        y2 = self.feature_net(features)
        y = torch.cat([y1, y2], -1)
        return self.classifier(y)

In [237]:
def train(model, device, train_loader, optimizer, epoch):
    model.train()
    criterion = nn.CrossEntropyLoss()
    for batch_idx, (data1, target , data2) in enumerate(train_loader):
        data1, data2, target = data1.to(device), data2.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data1, data2)
        loss = criterion(output, target.long())
        loss.backward()
        optimizer.step()
        if batch_idx % LOG_INTERVAL == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data1), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()), end="\r")


def test(model, device, test_loader, results_list, epoch):
    model.eval()
    test_loss = 0
    correct = 0
    a = 0
    real = datasetb.onehot
    criterion = nn.CrossEntropyLoss(reduction='sum')
    with torch.no_grad():
        for data1, target, data2 in test_loader:
            data1, target, data2 = data1.to(device), target.to(device), data2.to(device)
            output = model(data1, data2)
            test_loss += criterion(output, target.long()).item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            #correct += pred.eq(target.view_as(pred)).sum().item()
            for i in pred:
                correct += real[a][i]
                a+=1

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.1f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    
    results_list.append([epoch, correct / len(test_loader.dataset), test_loss])
    preds.append(pred)

In [238]:
dataseta = MNIST_Polygon3(train_tensor)
dataset1 = torch.utils.data.TensorDataset(dataseta.data1, dataseta.targets, dataseta.data2)
datasetb = MNIST_Polygon3(test_tensor)
dataset2 = torch.utils.data.TensorDataset(datasetb.data1, datasetb.targets, datasetb.data2)
train_loader = torch.utils.data.DataLoader(dataset1,batch_size=BATCH_SIZE, shuffle = True)
test_loader = torch.utils.data.DataLoader(dataset2, batch_size=TEST_BATCH_SIZE)

model = SymmetryPlusData().to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LR)
results_list = []
preds = []

for epoch in range(1, EPOCHS + 1):
    train(model, DEVICE, train_loader, optimizer, epoch)
    test(model, DEVICE, test_loader, results_list, epoch)
        

Test set: Average loss: 2.4428, Accuracy: 3571.0/5113 (69.8%)

Test set: Average loss: 2.4310, Accuracy: 3720.0/5113 (72.8%)

Test set: Average loss: 2.4289, Accuracy: 4049.0/5113 (79.2%)

Test set: Average loss: 2.4237, Accuracy: 3716.0/5113 (72.7%)

Test set: Average loss: 2.4222, Accuracy: 3790.0/5113 (74.1%)

Test set: Average loss: 2.4218, Accuracy: 3904.0/5113 (76.4%)

Test set: Average loss: 2.4231, Accuracy: 3859.0/5113 (75.5%)

Test set: Average loss: 2.4188, Accuracy: 3893.0/5113 (76.1%)

Test set: Average loss: 2.4180, Accuracy: 3868.0/5113 (75.7%)

Test set: Average loss: 2.4205, Accuracy: 3856.0/5113 (75.4%)



In [240]:
original_point_counter = 0
new_point_counter = 0
new_coords = []
a = 0
for geom in datasetb.data1:
    geom = datasetb.data3[a]
    pred = preds[2][a]
    original_point_counter += len(geom)
    if pred == 0:
        new_point_counter += len(geom)
    elif (pred > 0) & (pred < 12):
        simp = simplify_possibilities[pred]
        if simp[0] == 'D-P':
            # Simplification function Douglas-Peucker
            simplified_coordinates = simplify_coords(geom, simp[1])
            new_point_counter += len(simplified_coordinates)
            new_coords.append(simplified_coordinates)

        if simp[0] == 'V-W':
            # Simplification function Visvalingam-Whyatt
            simplified_coordinates = simplify_coords_vw(geom, simp[1])
            new_point_counter += len(simplified_coordinates)
            new_coords.append(simplified_coordinates)
    else:
        pass
        
    a+=1
    
print(original_point_counter)
print(new_point_counter)

107243
91874


In [247]:
np.asarray(new_coords)

pickle.dump(datasetb.data3, open( "/Users/davemeijdam/Documents/Data Science/Master/Master Thesis/Scripts/data/temp/original_coords.p", "wb" ) )
pickle.dump(np.asarray(new_coords), open( "/Users/davemeijdam/Documents/Data Science/Master/Master Thesis/Scripts/data/temp/simplified_coords.p", "wb" ) )

In [249]:
o = 2761.915
n = 2298.348

print(n / o)

o = 107243
n = 91874

print(n / o)

0.8321573980372314
0.8566899471294164


# Experiments & Code that doesn't get used

# Keras

In [None]:
input_shape = X[0].shape
print(input_shape)
print(len(y_onehot[0]))
model = Sequential()

model.add(layers.Conv1D(filters=32, kernel_size=(5,), activation='relu', padding='SAME', input_shape=input_shape))
model.add(layers.MaxPooling1D(3))
model.add(layers.Conv1D(filters=64, kernel_size=(5,), activation='relu', padding='SAME', strides=2))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(len(y_onehot[0]), activation='softmax'))

model.summary()

In [None]:
model.compile(loss='categorical_crossentropy',
                optimizer=Adam(lr=0.001), metrics=['accuracy'])

BATCH_SIZE = 99
EPOCHS = 3

history = model.fit(X,
                    y_onehot,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_split=0.2,
                    verbose=1)

In [None]:
conn_pand_centrum = create_connection("/Users/davemeijdam/Documents/Data Science/Master/Master Thesis/Data/SQLite/Pand_26116_centrum.db")

cur = conn_pand_centrum.cursor()
cur.execute("SELECT data FROM tiles;")

rows = cur.fetchall()
pand_centrum_data = []
for row in rows:
    pand_centrum_data.append(mapbox_vector_tile.decode(row[0]))
    #print(row[0])
print(len(pand_centrum_data))

## Wegdeel Buiten

conn_wegdeel_buiten = create_connection("/Users/davemeijdam/Documents/Data Science/Master/Master Thesis/Data/SQLite/Wegdeel_23770_buitengebied.db")

cur = conn_wegdeel_buiten.cursor()
cur.execute("SELECT data FROM tiles;")

rows = cur.fetchall()
wegdeel_buiten_data = []
for row in rows:
    wegdeel_buiten_data.append(mapbox_vector_tile.decode(row[0]))


Lines = []
Polygons = []
MultiPolygons = []
a=0
for row in pand_centrum_data[:10000]:
    print(str(a) + " / " + str(len(pand_centrum_data)), end="\r")
    a = a + 1
    keys = row.keys()
    
    for key in keys:
        for element in row[key]['features']:
            
            if element['geometry']['type'] == 'LineString': 
                Lines.append(element['geometry']['coordinates'])
            
            if element['geometry']['type'] == 'Polygon':
                Polygons.append(element['geometry']['coordinates'][0])
                
            #if element['geometry']['type'] == 'MultiPolygon':
                #MultiPolygons.append(element['geometry']['coordinates'])
    
    

#test = lvl10_data[0]['spoor.se_fld12_lijngeometrie2d']['features'][0]['geometry']['coordinates']
#print(Polygons)

In [None]:
import shapely.geometry as sg
import shapely.ops as so
import matplotlib.pyplot as plt

ls = []
#for a in wegdeeljson['features'][:5]:
#    ls.append(geometry.Polygon(a['geometry']['coordinates'][0]))

new_shape = so.cascaded_union(ls)
fig, axs = plt.subplots()
axs.set_aspect('equal', 'datalim')

for geom in new_shape.geoms:    
    xs, ys = geom.exterior.xy    
    axs.fill(xs, ys, alpha=1, fc='r', ec='none')

plt.show()

In [None]:
import shapely.geometry as sg
import shapely.ops as so
import matplotlib.pyplot as plt


ls = []
for element in wegdeel_buiten_data[3]['wegdeel.se_fld15_vlakgeometrie2d']['features']:
    
    #print(element['geometry']['coordinates'][0])
    #geometry.Polygon(element['geometry']['coordinates'][0])
    element2 = element['geometry']
    
    if element2['type'] == 'MultiPolygon':
        if element2['coordinates']:
            for poly in element2['coordinates'][0]:
                print(poly)
                ls.append(geometry.Polygon(poly))
    
    else:
        ls.append(geometry.Polygon(element['geometry']['coordinates'][0]))

#r1 = sg.Polygon([[243, 2760], [242, 2760], [242, 2761], [243, 2760]])
#r2 = sg.Polygon([[243, 2759], [243, 2760], [244, 2760], [244, 2759], [243, 2759]])
#r3 = sg.Polygon([[244, 2759], [243, 2759], [243, 2760], [244, 2760], [244, 2759]])
#r4 = sg.Polygon([[243, 2759], [242, 2759], [242, 2760], [243, 2760], [243, 2759]])
#r5 = sg.Polygon([[241, 2759], [241, 2760], [242, 2759], [241, 2759]])

new_shape = so.cascaded_union(ls)
fig, axs = plt.subplots()
axs.set_aspect('equal', 'datalim')

for geom in new_shape.geoms:    
    xs, ys = geom.exterior.xy    
    axs.fill(xs, ys, alpha=1, fc='r', ec='none')

plt.show()

In [None]:
# Select index of simplification possibility
INDEX = 6


possibility = simplify_possibilities[INDEX]

if possibility[0] == 'D-P':
    # Simplification function Douglas-Peucker
    simplified_coordinates = simplify_coords(coordinates, possibility[1])

if possibility[0] == 'V-W':
    # Simplification function Visvalingam-Whyatt
    simplified_coordinates = simplify_coords_vw(coordinates, possibility[1])

old_xs, old_ys = zip(*coordinates)
new_xs, new_ys = zip(*simplified_coordinates)

print(len(simplified_coordinates))
print(len(coordinates))

In [None]:
Lines = []
Polygons = []
MultiPolygons = []
a=0
for row in wegdeel_buiten_data:
    print(str(a) + " / " + str(len(wegdeel_buiten_data)), end="\r")
    a = a + 1
    keys = row.keys()
    
    for key in keys:
        for element in row[key]['features']:
            
            if element['geometry']['type'] == 'LineString': 
                Lines.append(element['geometry']['coordinates'])
            
            if element['geometry']['type'] == 'Polygon':
                Polygons.append(element['geometry']['coordinates'][0])
                
            if element['geometry']['type'] == 'MultiPolygon':
                if element['geometry']['coordinates']:
                    for poly in element['geometry']['coordinates'][0]:
                        MultiPolygons.append(poly)
    
    

#test = lvl10_data[0]['spoor.se_fld12_lijngeometrie2d']['features'][0]['geometry']['coordinates']
#print(Polygons)

#print(len(Lines))
print(len(Polygons))
#print(len(MultiPolygons))

ls = []
for a in Polygons:
    ls.append(len(a))
    
pd.DataFrame({'lengths':Counter(ls).keys(),
              'freq':Counter(ls).values()})

In [None]:
results_list = []
a=0

for element in Polygons[:100]:
    results_dict = {}
    poly1 = geometry.Polygon(element)
    results = []
    
    for possibility in simplify_possibilities:
        
        if possibility[0] == 'D-P':
            # Simplification function Douglas-Peucker
            time_start = time()
            simplified_coordinates = simplify_coords(element, possibility[1])
            time_end = time()
            process_time = time_end - time_start

        if possibility[0] == 'V-W':
            # Simplification function Visvalingam-Whyatt
            time_start = time()
            simplified_coordinates = simplify_coords_vw(element, possibility[1])
            time_end = time()
            process_time = time_end - time_start
        
        
        if len(simplified_coordinates) >= 3:
            poly2 = geometry.Polygon(simplified_coordinates)
            #length_deficit = (poly2.length - poly1.length) / poly1.length
        
            # If the length deficit of the polygon is smaller(greater) than the provided MAX_LENGTH_DEFICIT, 
            # the score gets saved
            #if length_deficit > MAX_LENGTH_DEFICIT:
            
            #if length_deficit == 0:
            #    score = ScoreFormula(len(element[0]), len(simplified_coordinates), process_time)
            #    results.append(score)
            #    continue
                
            #try:
            #    if CheckSameIntersections(element[0], simplified_coordinates, grid, ROUNDING) > MIN_INTERSECTIONS_PERC:
            #        score = ScoreFormula(len(element[0]), len(simplified_coordinates), process_time)
            #        results.append(score)
            #except Exception:
            #    continue
            
            if np.isnan(check_pixel_similarity(element, simplified_coordinates, 17)) == True:
                results.append('Remove')
                break
                
                
            if check_pixel_similarity(element, simplified_coordinates, 17) == 1:
                score = ScoreFormula(len(element), len(simplified_coordinates), process_time)
                results.append(score)
        
    results_dict['index'] = a
    results_dict['algorithm'] = results.index(max(results))
    results_list.append(results_dict)
    a = a + 1
    
results_list

In [None]:
RESULTS_INDEX = 85

algorithm = simplify_possibilities[results_list[RESULTS_INDEX]['algorithm']]
print(algorithm)
points = len(Polygons[results_list[RESULTS_INDEX]['index']])
o_xs, o_ys = zip(*Polygons[results_list[RESULTS_INDEX]['index']])
#geometry.Polygon(Polygons[results_list[RESULTS_INDEX]['index']])

if algorithm[0] == 'D-P':
    simplified_poly = geometry.Polygon(simplify_coords(Polygons[results_list[RESULTS_INDEX]['index']], algorithm[1]))
    simplified_points = len(simplify_coords(Polygons[results_list[RESULTS_INDEX]['index']], algorithm[1]))
    s_xs, s_ys = zip(*simplify_coords(Polygons[results_list[RESULTS_INDEX]['index']], algorithm[1]))
    
if algorithm[0] == 'V-W':
    simplified_poly = geometry.Polygon(simplify_coords_vw(Polygons[results_list[RESULTS_INDEX]['index']], algorithm[1]))
    simplified_points = len(simplify_coords_vw(Polygons[results_list[RESULTS_INDEX]['index']], algorithm[1]))
    s_xs, s_ys = zip(*simplify_coords_vw(Polygons[results_list[RESULTS_INDEX]['index']], algorithm[1]))

print(str(simplified_points) + " / " + str(points))
geometry.Polygon(Polygons[results_list[RESULTS_INDEX]['index']])
simplified_poly    

plt.subplot(1, 2, 1)
plt.plot(o_xs,o_ys)

plt.subplot(1, 2, 2)
plt.plot(s_xs,s_ys)

plt.show()

In [None]:
99*5


In [None]:
plt.imshow(check_pixel_similarity(Polygons[4], Polygons[0], 20))

In [None]:
original = geometry.Polygon(Polygons[5])
simplified = geometry.Polygon(simplify_coords(Polygons[62],0.5))
simplified

In [None]:
figure, axis = plt.subplots(3, 3)

axis[0,0].imshow(check_pixel_similarity(Polygons[8], Polygons[0], 20))
axis[0,1].imshow(check_pixel_similarity(Polygons[2], Polygons[0], 19))
axis[0,2].imshow(check_pixel_similarity(Polygons[4], Polygons[0], 17))

axis[1,0].imshow(check_pixel_similarity(Polygons[10], Polygons[0], 19))
axis[1,1].imshow(check_pixel_similarity(Polygons[12], Polygons[0], 18))
axis[1,2].imshow(check_pixel_similarity(Polygons[16], Polygons[0], 18))

axis[2,0].imshow(check_pixel_similarity(Polygons[32], Polygons[0], 19))
axis[2,1].imshow(check_pixel_similarity(Polygons[26], Polygons[0], 18))
axis[2,2].imshow(check_pixel_similarity(Polygons[24], Polygons[0], 20))