In [1]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict,cross_val_score
from sklearn.model_selection import KFold
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.feature_selection import RFE,RFECV

In [89]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error,mean_squared_error

### Wifi signals 

In [2]:
data = pd.read_csv('Data/UJIndoorLoc/trainingData.csv')

In [3]:
data.head()

Unnamed: 0,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,WAP520,LONGITUDE,LATITUDE,FLOOR,BUILDINGID,SPACEID,RELATIVEPOSITION,USERID,PHONEID,TIMESTAMP
0,100,100,100,100,100,100,100,100,100,100,...,100,-7541.2643,4864921.0,2,1,106,2,2,23,1371713733
1,100,100,100,100,100,100,100,100,100,100,...,100,-7536.6212,4864934.0,2,1,106,2,2,23,1371713691
2,100,100,100,100,100,100,100,-97,100,100,...,100,-7519.1524,4864950.0,2,1,103,2,2,23,1371714095
3,100,100,100,100,100,100,100,100,100,100,...,100,-7524.5704,4864934.0,2,1,102,2,2,23,1371713807
4,100,100,100,100,100,100,100,100,100,100,...,100,-7632.1436,4864982.0,0,0,122,2,11,13,1369909710


In [4]:
data.shape

(19937, 529)

### Extracting building 0

In [5]:
data_zero = data[data["BUILDINGID"]==0]

In [6]:
data_zero.shape

(5249, 529)

### Extracting X and y 

In [7]:
y_floor = data_zero["FLOOR"]

In [8]:
y_floor.shape

(5249,)

In [9]:
latitude_scaler = MinMaxScaler()
longitude_scaler = MinMaxScaler()

y_long = longitude_scaler.fit_transform(data_zero[["LONGITUDE"]])[:,0]
y_lat = longitude_scaler.fit_transform(data_zero[["LATITUDE"]])[:,0]

In [10]:
X = data_zero.iloc[:,0:520]

In [11]:
X.shape

(5249, 520)

In [12]:
X.head()

Unnamed: 0,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519,WAP520
4,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
7119,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
7120,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
7121,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
7122,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100


### No-signal change 100 into -150

In [13]:
X[X == 100] = -150

In [14]:
X.head()

Unnamed: 0,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519,WAP520
4,-150,-150,-150,-150,-150,-150,-150,-150,-150,-150,...,-150,-150,-150,-150,-150,-150,-150,-150,-150,-150
7119,-150,-150,-150,-150,-150,-150,-150,-150,-150,-150,...,-150,-150,-150,-150,-150,-150,-150,-150,-150,-150
7120,-150,-150,-150,-150,-150,-150,-150,-150,-150,-150,...,-150,-150,-150,-150,-150,-150,-150,-150,-150,-150
7121,-150,-150,-150,-150,-150,-150,-150,-150,-150,-150,...,-150,-150,-150,-150,-150,-150,-150,-150,-150,-150
7122,-150,-150,-150,-150,-150,-150,-150,-150,-150,-150,...,-150,-150,-150,-150,-150,-150,-150,-150,-150,-150


## Floor classification

### Splitting into train and test dataset

In [15]:
X_train,X_test,y_train,y_test = train_test_split(X,y_floor,test_size=0.2)

### Pipeline
* Przeskalowanie danych
* Wyodrebnienie cech
* Standaryzacja danych

In [16]:
preprocessing_pipeline = Pipeline([
    ('scaler',MinMaxScaler()),
    ('zero_variance',VarianceThreshold(0.0)),
    ('feature_selection',RFE(RandomForestClassifier(),step=10)),
    ('model',KNeighborsClassifier())
])

In [18]:
param_grid = {'feature_selection__n_features_to_select':[500,400,240,120],
             'model__n_neighbors':[3,5]
             }

In [21]:
X_train.shape

(4199, 520)

In [22]:
y_train.shape

(4199,)

In [23]:
grid = GridSearchCV(preprocessing_pipeline,
                   param_grid,
                   cv=KFold(n_splits=5,shuffle=True),
                   n_jobs = -1)
grid.fit(X_train,y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise',
       estimator=Pipeline(steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('zero_variance', VarianceThreshold(threshold=0.0)), ('feature_selection', RFE(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,...owski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'feature_selection__n_features_to_select': [500, 400, 240, 120], 'model__n_neighbors': [3, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [24]:
grid.grid_scores_



[mean: 0.97357, std: 0.00485, params: {'feature_selection__n_features_to_select': 500, 'model__n_neighbors': 3},
 mean: 0.96928, std: 0.00752, params: {'feature_selection__n_features_to_select': 500, 'model__n_neighbors': 5},
 mean: 0.97357, std: 0.00485, params: {'feature_selection__n_features_to_select': 400, 'model__n_neighbors': 3},
 mean: 0.96928, std: 0.00752, params: {'feature_selection__n_features_to_select': 400, 'model__n_neighbors': 5},
 mean: 0.97357, std: 0.00485, params: {'feature_selection__n_features_to_select': 240, 'model__n_neighbors': 3},
 mean: 0.96928, std: 0.00752, params: {'feature_selection__n_features_to_select': 240, 'model__n_neighbors': 5},
 mean: 0.97738, std: 0.00465, params: {'feature_selection__n_features_to_select': 120, 'model__n_neighbors': 3},
 mean: 0.96904, std: 0.00635, params: {'feature_selection__n_features_to_select': 120, 'model__n_neighbors': 5}]

In [48]:
grid.best_params_

{'feature_selection__n_features_to_select': 120, 'model__n_neighbors': 3}

In [25]:
best_pipieline = grid.best_estimator_

In [29]:
best_pipieline.steps

[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
 ('zero_variance', VarianceThreshold(threshold=0.0)),
 ('feature_selection',
  RFE(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
              verbose=0, warm_start=False),
    n_features_to_select=120, step=10, verbose=0)),
 ('model',
  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
             metric_params=None, n_jobs=1, n_neighbors=3, p=2,
             weights='uniform'))]

### Variance threshold step

In [47]:
variance_columns = X_train.columns[best_pipieline.steps[1][1].get_support()]
variance_columns.shape

(196,)

### Recursive feature extraction

In [50]:
rfe_columns = variance_columns[best_pipieline.steps[2][1].get_support()]

In [60]:
rfe_columns

Index(['WAP007', 'WAP008', 'WAP009', 'WAP013', 'WAP014', 'WAP017', 'WAP018',
       'WAP019', 'WAP020', 'WAP023',
       ...
       'WAP402', 'WAP404', 'WAP409', 'WAP410', 'WAP413', 'WAP415', 'WAP434',
       'WAP447', 'WAP452', 'WAP494'],
      dtype='object', length=120)

In [52]:
rfe = best_pipieline.steps[2][1]

In [53]:
rfe.n_features_

120

In [62]:
knn = best_pipieline.steps[3][1]

### Test data floor

In [71]:
accuracy_score(y_true=y_test,y_pred=best_pipieline.predict(X_test))

0.97809523809523813

In [72]:
pd.value_counts(y_test)

2    302
3    296
1    250
0    202
Name: FLOOR, dtype: int64

In [117]:
preprocessing_pipeline_reg = Pipeline([
    ('scaler',MinMaxScaler()),
    ('zero_variance',VarianceThreshold(0.0)),
    ('feature_selection',RFE(RandomForestRegressor(),step=10)),
    ('model',KNeighborsRegressor())
])

## Latitude estimator

In [82]:
X_train,X_test,y_train,y_test = train_test_split(X,y_lat,test_size=0.2)

In [83]:
preprocessing_pipeline_regression = Pipeline([
    ('data scaling', MinMaxScaler()),
    ('remove zero variance', VarianceThreshold()),
    ('feature selection', RFE(RandomForestRegressor(), step=20)),
    ('model', KNeighborsRegressor())
])

In [84]:
param_grid_regression = {
    'feature selection__n_features_to_select': [500, 400, 300, 240, 120],
    'model__n_neighbors': [3, 5, 8]
}

In [85]:
grid_regression = GridSearchCV(preprocessing_pipeline_regression,
                              param_grid = param_grid_regression,
                              cv=KFold(5,shuffle=True),
                              n_jobs=-1)
grid_regression.fit(X_train,y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise',
       estimator=Pipeline(steps=[('data scaling', MinMaxScaler(copy=True, feature_range=(0, 1))), ('remove zero variance', VarianceThreshold(threshold=0.0)), ('feature selection', RFE(estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
        ...nkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform'))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'feature selection__n_features_to_select': [500, 400, 300, 240, 120], 'model__n_neighbors': [3, 5, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [86]:
grid_regression.best_score_

0.98743841248010855

In [87]:
grid_regression.best_params_

{'feature selection__n_features_to_select': 120, 'model__n_neighbors': 3}

In [88]:
best_latitude = grid_regression.best_estimator_

### Test data latitude

In [90]:
mean_absolute_error(y_true=y_test,y_pred=best_latitude.predict(X_test))

0.01512090350036964

In [91]:
mean_squared_error(y_true=y_test,y_pred=best_latitude.predict(X_test))

0.00096548385888108707