In [4]:
# Star import stays first so the explicit imports below take precedence over
# any same-named objects it may export.
from creds import *

import json
from collections import Counter

import joblib
import numpy as np
import pandas as pd
import psycopg2
import sklearn
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV

from src.resources.resources import test_pipe, count_hash

In [5]:
# Connection settings for the local PostgreSQL instance; `username` and
# `password` are expected to come from the `creds` star import.
conn_info = dict(
    host="localhost",
    port=5432,
    database="pointclouds",
    user=username,
    password=password,
)

conn = psycopg2.connect(**conn_info)
# Passing a name asks psycopg2 for a named (server-side) cursor.
curs = conn.cursor(name="named")

In [6]:
# Columns pulled from the table; the same list later names the DataFrame columns.
cols = [
    'label',
    'linearity',
    'planarity',
    'scattering',
    'verticality',
]
query = f"SELECT {','.join(cols)} FROM oakland_points"
curs.execute(query)

In [7]:
# Materialise the query result and cast every column to a plain numeric dtype
# in one chained expression, then release the cursor.
df = (
    pd.DataFrame(curs.fetchall(), columns=cols)
    .astype(dict(
        label=int,
        linearity=float,
        planarity=float,
        scattering=float,
        verticality=float,
    ))
)
curs.close()

df.head()

Unnamed: 0,label,linearity,planarity,scattering,verticality
0,1200,0.798333,0.136767,0.0649,0.351273
1,1200,0.798333,0.136767,0.0649,0.351273
2,1200,0.798333,0.136767,0.0649,0.351273
3,1200,0.798333,0.136767,0.0649,0.351273
4,1200,0.798333,0.136767,0.0649,0.351273


In [8]:
# Feature matrix: the four descriptor columns.
X = df.loc[:, ['linearity', 'planarity', 'scattering', 'verticality']]
# Binary target as a plain list of ints: 1 where the label equals 1103, else 0.
y = (df['label'] == 1103).astype(int).tolist()

# Hold out 33% of the rows with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# DMatrix wrappers around the two partitions.
train_data = xgb.DMatrix(X_train, label=y_train)
test_data = xgb.DMatrix(X_test, label=y_test)

In [9]:
# Class-imbalance weight: number of negatives per positive in the binary target.
# The positives are counted once, and an all-negative target is reported
# explicitly instead of surfacing as a bare ZeroDivisionError.
n_pos = sum(y)
if n_pos == 0:
    raise ValueError("y contains no positive samples; cannot derive scale_pos_weight")
pos_weight = (len(y) - n_pos) / n_pos
print(pos_weight)

# Candidate weights; only consumed by the disabled grid search further down.
weights = [1,100, 150, 200, 500]
param_grid = dict(scale_pos_weight=weights)

# 10-fold stratified CV repeated 3 times with a fixed seed (30 fits per evaluation).
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

model = xgb.XGBClassifier(scale_pos_weight=pos_weight)

# Disabled grid search over `param_grid`:
#grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')

#grid_result = grid.fit(X, y)

# ROC AUC of the weighted model for every CV fold, evaluated on the full X, y.
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)

218.47192666557538


In [10]:
# Disabled: `grid_result` only exists after a GridSearchCV fit, which is
# commented out in this notebook as shown.
#print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# Per-fold ROC AUC values from cross_val_score (10 splits x 3 repeats = 30 entries).
print(scores)

[0.96775496 0.96625193 0.96344889 0.96284503 0.96431488 0.96557586
 0.96353408 0.96560652 0.9617543  0.96892699 0.96351185 0.96582142
 0.96095768 0.96171466 0.9650831  0.96738493 0.96308455 0.96547268
 0.96279047 0.96453597 0.96373284 0.96702459 0.9641713  0.968369
 0.96607845 0.9620562  0.96073355 0.96548048 0.96552418 0.96723771]


In [11]:
# `scores` already holds the per-fold ROC AUC values from the cross_val_score
# call in the earlier cell. The original repeated that identical 30-fit
# evaluation here (same model, data and seeded cv), so the existing result is
# summarised instead of being recomputed.
print(f'Mean ROC AUC: {np.mean(scores)}')

Mean ROC AUC: 0.9646926345373286


In [12]:
# Fit the weighted classifier on the training partition only; the fitted
# estimator is the cell's displayed value.
model.fit(X=X_train, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=218.47192666557538,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [None]:
# Class balance of the binary target: how many 0s and 1s `y` contains.
# `Counter` is imported in the notebook's top import cell.
count = Counter(y)
print(count)

In [13]:
# Persist the fitted classifier; `joblib` is imported in the notebook's top
# import cell. joblib.dump returns the list of written file names, which is
# the cell's displayed value.
joblib.dump(model, 'src/models/xgboostv1.1.dat')

['src/models/xgboostv1.1.dat']