In [1]:
import os
# NOTE(review): hard-coded, machine-specific absolute path. The relative paths
# used further down ('./data/...', './models/...') are resolved against this
# working directory; presumably the `src.tools` import below relies on it too.
# Consider making the project root configurable.
os.chdir('/home/chapmaca/Projects/pointcloudclassifier')
from joblib import dump, load
import pandas as pd
import pprint
from sklearn.preprocessing import StandardScaler
from src.tools.pc_tools import dbscan_cluster

In [2]:
# Load the processed point data; each row carries X/Y/Z coordinates, a class
# `label`, a `conf` column and the remaining feature columns.
data = pd.read_csv('./data/processed/pole_data_full.txt')

# Label codes that are treated as "pole" when building the binary target.
poles = [1110, 1302, 1303]

# Binary target: 1 where the row's label is one of the pole codes, else 0.
y = data['label'].isin(poles).astype(int)

# Feature matrix: everything except coordinates, label and confidence,
# standardised column-wise and wrapped back into a DataFrame.
feature_frame = data.drop(columns=['X', 'Y', 'Z', 'label', 'conf'])
scaled_features = StandardScaler().fit_transform(feature_frame)
X = pd.DataFrame(scaled_features)

In [3]:
# Load the previously fitted pole classifier from disk (presumably a k-NN
# model, judging by the file name -- confirm). joblib.load unpickles the file,
# so only load model files from a trusted source.
pole_find = load('./models/pole_knn_v1.0.joblib')

In [4]:
# Run the classifier on the scaled features and turn its output into a boolean
# row mask. Note that this rebinds `poles`, which previously held the list of
# pole label codes.
poles = pole_find.predict(X).astype(bool)

In [5]:
# Cluster the coordinates of the points the classifier flagged as poles.
pole_clusters = dbscan_cluster(data.loc[poles, ['X', 'Y', 'Z']], eps=15)

Found 20 clusters in the data. With 19 noise points.


In [6]:
# Same clustering, but on the points whose label marks them as poles, to serve
# as the reference to compare the predicted clusters against.
test = y.astype(bool)
test_clusters = dbscan_cluster(data.loc[test, ['X', 'Y', 'Z']], eps=15)

Found 22 clusters in the data. With 0 noise points.


# `shape` is an attribute (a (rows, columns) tuple), not a method; calling it
# raises "TypeError: 'tuple' object is not callable".
data.shape

In [7]:
# Scratch frame with one row per row of `data`; its single column (named 0)
# holds a 1-based running number. Cluster ids are attached to it afterwards.
df = pd.DataFrame(range(1, len(data)+1))

In [8]:
# Attach a cluster id to every row: -1 means "not part of any cluster", which
# covers both rows outside the respective mask and DBSCAN noise points.
df['poles'] = -1
df['test'] = -1
# Use a single .loc assignment instead of chained indexing
# (df['test'][test] = ...). The chained form triggers SettingWithCopyWarning
# and silently does nothing when pandas copy-on-write is active.
df.loc[test, 'test'] = test_clusters.labels_
df.loc[poles, 'poles'] = pole_clusters.labels_

In [9]:
# Overlap between predicted and labelled pole clusters.
# Each cluster id found by the algorithm is a key of `found`. Its value is a
# one-element list holding the set of labelled ("test") cluster ids that occur
# among that predicted cluster's points. A -1 in a set means at least one point
# of the predicted cluster is not in any labelled cluster (the set does not say
# how many); a number >= 0 is a cluster id from the labelled set. The noise key
# -1 is kept with an empty list.
# Author's reading of the result: all the poles are being found.
# TODO: it would be helpful to know how many poles there actually are --
# might be worth investigating at a later time.
found = {
    cluster_id: (
        [set(df.loc[df['poles'] == cluster_id, 'test'])]
        if cluster_id != -1
        else []
    )
    for cluster_id in set(pole_clusters.labels_)
}
pprint.pprint(found)

{-1: [],
 0: [{0, 13, -1}],
 1: [{1, -1}],
 2: [{2, -1}],
 3: [{5, -1}],
 4: [{3, -1}],
 5: [{4, -1}],
 6: [{6, -1}],
 7: [{-1, 7}],
 8: [{8, -1}],
 9: [{9}],
 10: [{10, -1}],
 11: [{11, -1}],
 12: [{12, -1}],
 13: [{14, -1}],
 14: [{15}],
 15: [{16}],
 16: [{18, -1}],
 17: [{17, -1}],
 18: [{19, 20, -1}],
 19: [{21, -1}]}


In [10]:
from src.tools.pc_tools import geometric_median, point_to_line_dist

In [18]:
from itertools import combinations
import numpy as np
# For every point, compute the smallest distance to any line through the
# centres of two predicted pole clusters and store it as data['pole_dist'].
#
# 'save' holds the running minimum and 'hold' the distances for the pair that
# is currently being processed. Because 'save' starts at 1000, the result is
# effectively capped at 1000, and stays 1000 everywhere if fewer than two pole
# clusters exist.
dists = pd.DataFrame({'save':np.full(len(data), 1000), 'hold':np.full(len(data), 1000)})

# Predicted cluster ids without the "no cluster" marker. Filtering (rather than
# list.remove) does not fail when -1 happens to be absent.
pole_set = [pole for pole in set(df['poles']) if pole != -1]

# The centre of a pole does not depend on which pair it takes part in, so
# compute each geometric median once instead of once per pair.
pole_centers = {
    pole: geometric_median(data[df['poles']==pole][['X','Y']].to_numpy())
    for pole in pole_set
}

# Find the distance of each point to a line between two identified poles.
# TODO: check whether point_to_line_dist measures to the line segment or to
# the line stretching into infinity.
for i, j in combinations(pole_set, 2):
    dists['hold'] = point_to_line_dist(pole_centers[i], pole_centers[j], data[['X', 'Y']])
    dists['save'] = dists[['save','hold']].min(axis=1)
data['pole_dist'] = dists['save']

In [12]:
import numpy as np
# Persist the augmented table (including the new 'pole_dist' column) as a
# NumPy record array.
data_records = data.to_records()
np.save('./data/processed/data_full_w_poles.npy', data_records)

In [13]:
#None of the following should be needed. I had to fix the column names in the data file.
#data = pd.read_csv('./data/processed/pole_data_full.txt')
#data.head()

In [14]:
#import numpy as np
#new = pd.DataFrame(np.load('data/processed/data_full_w_poles.npy'))


In [15]:
#new.columns = names
#new.head()

In [16]:
#names = list(data.columns)

In [17]:
#names.append("pole_dist")
#names