# Modeling - data normalized "share"

## Importing libraries

In [59]:
import pandas as pd
import numpy as np
import scipy
from math import sqrt
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

Estimators: regressors

In [60]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn import linear_model

Estimators: classifiers

In [61]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

Metrics:

In [62]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import max_error

Cross-validation:

In [63]:
from sklearn.model_selection import train_test_split

## Reading dataset

In [64]:
trainset = pd.read_csv("C:/Users/edidd/Documents/Ubiqum/Data Analytics Course/Module_III_Wifi/data/train_clean_norm.csv", header = 0)

In [65]:
trainset.head()

Unnamed: 0,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,LONGITUDE,LATITUDE,FLOOR,BUILDINGID,SPACEID,RELATIVEPOSITION,USERID,PHONEID,TIMESTAMP,ref
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-7541.2643,4864921.0,2,1,106,2,2,23,1371713733,290
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-7536.6212,4864934.0,2,1,106,2,2,23,1371713691,299
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.024169,0.0,0.0,...,-7519.1524,4864950.0,2,1,103,2,2,23,1371714095,342
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-7524.5704,4864934.0,2,1,102,2,2,23,1371713807,328
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-7632.1436,4864982.0,0,0,122,2,11,13,1369909710,195


## Selecting the data for modeling

In [66]:
features = trainset.iloc[:,:520]

In [67]:
features.head()

Unnamed: 0,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519,WAP520
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.024169,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
dependent = trainset["LONGITUDE"]

In [69]:
dependent.head()

0   -7541.2643
1   -7536.6212
2   -7519.1524
3   -7524.5704
4   -7632.1436
Name: LONGITUDE, dtype: float64

In [70]:
features.shape

(19937, 520)

In [71]:
dependent.shape

(19937,)

## Creating model variables

In [72]:
modelSVR = SVR()

In [73]:
modelRF = RandomForestRegressor()

In [74]:
modelLR = LinearRegression()

In [75]:
model5NN = KNeighborsRegressor(n_neighbors=5, n_jobs= -1)

In [76]:
model3NN = KNeighborsRegressor(n_neighbors=3, n_jobs= -1)

## Training the models

### KNN

In [77]:
model5NN.fit(features, dependent)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                    weights='uniform')

In [21]:
model5NN.score(features, dependent)

0.9970240352647581

## Predicting

### Importing the validation set

In [78]:
validation = pd.read_csv("C:/Users/edidd/Documents/Ubiqum/Data Analytics Course/Module_III_Wifi/data/validation_clean_norm.csv", header = 0)

In [79]:
validation.tail()

Unnamed: 0,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,WAP520,LONGITUDE,LATITUDE,FLOOR,BUILDINGID,SPACEID,RELATIVEPOSITION,USERID,PHONEID,TIMESTAMP
1106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-7317.344231,4864796.0,3,2,0,0,0,13,1381156711
1107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-7313.73112,4864792.0,3,2,0,0,0,13,1381156730
1108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-7637.535798,4864903.0,0,0,0,0,0,13,1381247781
1109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-7636.654005,4864905.0,0,0,0,0,0,13,1381247807
1110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-7637.94412,4864904.0,0,0,0,0,0,13,1381247836


In [80]:
val_features = validation.iloc[:,:520]

In [81]:
val_features.head()

Unnamed: 0,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519,WAP520
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [82]:
val_dependent = validation["LONGITUDE"]

### Predicting 5NN

In [83]:
prediction_5nn = model5NN.predict(val_features)

In [29]:
model5NN.score(val_features, val_dependent)

0.9952141724274285

In [100]:
mean_absolute_error(val_dependent, prediction_5nn)

5.24477539908919

#### Plotting real LONGITUDE vs predicted LONGITUDE

In [84]:
y1 = validation.LONGITUDE
y2 = prediction_5nn

In [85]:
y1.head()

0   -7515.916799
1   -7383.867221
2   -7374.302080
3   -7365.824883
4   -7641.499303
Name: LONGITUDE, dtype: float64

In [86]:
y2 = pd.Series(prediction_5nn)

In [87]:
y2.shape

(1111,)

In [88]:
y1.shape

(1111,)

In [89]:
type(y1)

pandas.core.series.Series

In [90]:
type(y2)

pandas.core.series.Series

In [91]:
y2.head()

0   -7490.05486
1   -7385.20748
2   -7370.01070
3   -7372.24446
4   -7644.93706
dtype: float64

In [92]:
prediction_LONG = pd.concat([y1, y2], axis = 1)

In [93]:
prediction_LONG.tail()

Unnamed: 0,LONGITUDE,0
1106,-7317.344231,-7314.276376
1107,-7313.73112,-7313.73112
1108,-7637.535798,-7638.72752
1109,-7636.654005,-7639.54684
1110,-7637.94412,-7638.90632


In [94]:
type(prediction_LONG)

pandas.core.frame.DataFrame

In [95]:
prediction_LONG.dtypes

LONGITUDE    float64
0            float64
dtype: object

In [96]:
prediction_LONG = prediction_LONG.rename(columns={0 : "prediction"}).sort_values("LONGITUDE").reset_index()

In [97]:
prediction_LONG.head()

Unnamed: 0,index,LONGITUDE,prediction
0,422,-7695.938755,-7681.72488
1,416,-7695.931955,-7661.19456
2,589,-7692.25645,-7674.83018
3,665,-7691.740181,-7684.78208
4,61,-7691.1092,-7685.96518


In [98]:
LONG_plot = px.scatter(prediction_LONG, y = "LONGITUDE")
LONG_plot.show()

In [99]:
prediction_trace = go.Scatter(
    y=prediction_LONG["prediction"],
    mode="lines",
    line=go.scatter.Line(color="red"),
    showlegend=False)

In [46]:
LONG_plot.add_trace(prediction_trace)
LONG_plot.show()

In [47]:
sqrt(mean_squared_error(prediction_LONG["LONGITUDE"], prediction_LONG["prediction"]))

8.31229947513902

In [48]:
max_error(prediction_LONG["LONGITUDE"], prediction_LONG["prediction"])

72.91313461957998

In [49]:
mean_absolute_error(prediction_LONG["LONGITUDE"], prediction_LONG["prediction"])

5.24477539908919

### Clasification estomators for building and floor

Model variable function

In [50]:
modelRFC = RandomForestClassifier(n_estimators=50)

In [51]:
dependent = trainset["BUILDINGID"]

In [52]:
dependent.head()

0    1
1    1
2    1
3    1
4    0
Name: BUILDINGID, dtype: int64

In [53]:
val_dependent = validation["BUILDINGID"]

In [54]:
val_dependent.head()

0    1
1    2
2    2
3    2
4    0
Name: BUILDINGID, dtype: int64

In [55]:
features.head()

Unnamed: 0,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519,WAP520
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.024169,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
modelRFC.fit(features, dependent)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [57]:
prediction_rfc = modelRFC.predict(val_features)

In [58]:
modelRFC.score(val_features, val_dependent)

0.9990999099909991