In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
#import matplotlib as plt
import matplotlib.pyplot as plt
import sklearn.preprocessing as sp
%matplotlib inline
# Plot appearance: white grid background. Two contexts are applied in sequence;
# the second call passes the font scale and a thicker line width explicitly.
sns.set_style("whitegrid")
sns.set_context("poster")
sns.set_context("talk", font_scale=1.0, rc={"lines.linewidth": 3})

### pattern16のデータを分析
以下の図のように、4隅に受信機を設置しました。  
その後、1つのビーコンを図中の①〜⑨の地点へ15分ごとに移動させながら設置し、各地点で計測を行いました。  
今回はテストデータを多く取得したかったため、設置地点を前回より多い9地点としました。  
<img src="./RSSI取得実験図/パターン16の配置図.pdf" alt="パターン16">
<img src="./RSSI取得実験図/pattern16.jpg" alt="パターン16-1">

In [2]:
# Load the pattern-16 recordings: one CSV per receiver (receivers 1-4).
COLUMNS = ["UUIDs","Major","Minor","Accuracy","Proximity","TxPower","RSSI","device_name","timestamp","x","y"]

receiver_frames = []
for receiver_no in range(1, 5):
    frame = pd.read_csv("./取得データセット/受信機{0}/nkmr{0}-dataset16.csv".format(receiver_no))
    # Replace the header read from the file with the canonical column names.
    # NOTE(review): read_csv consumes the first line as a header — confirm the files have one.
    frame.columns = COLUMNS
    receiver_frames.append(frame)
df16_1, df16_2, df16_3, df16_4 = receiver_frames

# For this run the device name recorded by receiver 3 is overwritten with "rasp3".
df16_3["device_name"] = "rasp3"

# pd.concat replaces the chained DataFrame.append calls (append was removed in pandas 2.0).
# Like append, it keeps every file's own row labels, so labels repeat across receivers;
# set_time() sorts on those labels, so they are deliberately not reset here.
df16 = pd.concat(receiver_frames)

# UUID of the synchronisation beacon.
set_uuid = "1000000007ae1001b000001c4d8dffff"

# locate: placement number 1-9 derived from the beacon coordinates
# ((0,0)=1, (0,100)=2, (0,200)=3, (100,0)=4, ... (200,200)=9); 0 for any other (x, y).
GRID = (0, 100, 200)
df16["locate"] = 0
placement = 0
for grid_x in GRID:
    for grid_y in GRID:
        placement += 1
        df16["locate"] = df16["locate"].where((df16.x != grid_x) | (df16.y != grid_y), placement)

# device: numeric receiver id derived from device_name; 0 for an unrecognised name.
DEVICE_IDS = {"rasp1": 1, "nkmr-2": 2, "rasp3": 3, "nkmr-4": 4}
df16["device"] = df16["device_name"].map(DEVICE_IDS).fillna(0).astype("int64")

# One frame per placement, in placement order; the individual names are kept for later cells.
df_list = [df16[df16.locate == number] for number in range(1, 10)]
(df16_1s, df16_2s, df16_3s, df16_4s, df16_5s,
 df16_6s, df16_7s, df16_8s, df16_9s) = df_list

In [4]:
# Non-null value count per column of the combined frame (sanity check after loading).
df16.count()

UUIDs          541713
Major          541713
Minor          541713
Accuracy       541713
Proximity      541713
TxPower        541713
RSSI           541713
device_name    541713
timestamp      541713
x              541713
y              541713
locate         541713
device         541713
dtype: int64

UUIDs       : ビーコンを識別するために割り振られているID  
RSSI        : 受信機が受信したそのビーコンの電波強度  
device_name : 計測に使用した受信機に割り振った名前  
x,y         : そのビーコンの位置を示す、x座標、y座標  

### 時刻合わせ

In [33]:
#　オブジェクト型の目的変数を数値情報に変換
import sklearn.preprocessing as sp

# Declare the label-encoder instance intended for preprocessing.
le = sp.LabelEncoder()

def set_time(data,set_uuid):
    """
    Attach a ``time`` column to every reading, taken from the synchronisation beacon.

    The sync beacon's ``Minor`` value is used as the clock tick. Rows are ordered by
    index label, each sync row stamps its tick on the rows that follow it (forward
    fill), and the sync rows themselves are removed from the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Readings with at least the ``UUIDs`` and ``Minor`` columns.
    set_uuid : str
        UUID of the synchronisation beacon.

    Returns
    -------
    pandas.DataFrame
        The non-sync rows of ``data`` with an integer ``time`` column. Rows that come
        before the first sync row, or that still contain NaN after the forward fill,
        are dropped. ``data`` itself is not modified.

    Notes
    -----
    NOTE(review): when ``data`` holds several receivers whose index labels overlap,
    ``sort_index`` interleaves them by label and ``drop_duplicates`` keeps the first
    sighting of each tick regardless of receiver — confirm this is the intended
    alignment.
    """
    is_sync = data.UUIDs == set_uuid
    # Keep only the first sighting of every tick so each tick appears once.
    sync_rows = data[is_sync].drop_duplicates(['Minor'],keep = "first")
    # Everything that is not the sync beacon.
    readings = data[data.UUIDs != set_uuid]
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat([readings, sync_rows]).sort_index(ascending=True)
    # Only sync rows get a tick here; every other row is NaN until the forward fill.
    df["time"] = df["Minor"].where(df["UUIDs"] == set_uuid)
    # .ffill() replaces the deprecated fillna(method="ffill"); it fills every column,
    # exactly as before, and rows that are still incomplete are dropped.
    df = df.ffill().dropna()
    df["time"] = df["time"].astype(int)
    # The sync rows have served their purpose; return the real readings only.
    df = df[df["UUIDs"] != set_uuid]

    return df

# Time-align every per-location frame, replacing the list entries in place.
df_list[:] = [set_time(frame, set_uuid) for frame in df_list]

同期用ビーコンが発信している時刻を、各受信機が受信した他のビーコンのデータにも割り当てることで、受信機間の時刻を合わせる。

### 直接波と反射波を識別する

In [34]:
# KMeansライブラリをインポート
from sklearn.cluster import KMeans

In [35]:
def make_cluster(df_list,colum,*,N_CLUSTERS=2,random_state=None):
    """
    Label each receiver's readings at each location as direct or reflected wave.

    For every frame in ``df_list`` and every distinct ``device_name`` in it, the rows
    are clustered with KMeans on the ``colum`` columns. Labels 0 and 1 are then ordered
    by mean RSSI: 0 is the cluster with the higher mean RSSI (treated as the direct
    wave), 1 the other one (treated as the reflected wave). Nothing is plotted.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        One frame per location; each needs ``device_name``, ``RSSI`` and the
        ``colum`` columns.
    colum : list of str
        Columns used as clustering features.
    N_CLUSTERS : int, keyword-only
        Number of KMeans clusters. Only labels 0 and 1 are reordered; with more than
        two clusters the remaining labels are left as KMeans produced them.
    random_state : int or None, keyword-only
        Passed to KMeans; set it to make the clustering reproducible.

    Returns
    -------
    list of pandas.DataFrame
        One frame per input frame: the rows of all receivers concatenated with a fresh
        index and an added ``cluster`` column. An input without any receiver yields an
        empty frame.
    """
    clustered = []
    # Per location
    for df in df_list:
        parts = []
        # Per receiver
        for dev in df.device_name.unique():
            df_dev = df[df.device_name == dev].copy()
            labels = KMeans(n_clusters=N_CLUSTERS, random_state=random_state).fit_predict(np.array(df_dev[colum]))
            rssi = df_dev["RSSI"]
            mean_0 = rssi[labels == 0].mean()
            mean_1 = rssi[labels == 1].mean()
            # "not >" (rather than "<=") keeps the original behaviour when a mean is NaN.
            if not mean_0 > mean_1:
                # Swap labels 0 and 1 in one step; no sentinel value is needed, so
                # labels >= 2 (including 3) are never disturbed.
                labels = np.where(labels == 0, 1, np.where(labels == 1, 0, labels))
            df_dev["cluster"] = labels
            parts.append(df_dev)
        # Concatenate once per location instead of growing a frame inside the loop.
        clustered.append(pd.concat(parts, ignore_index=True) if parts else pd.DataFrame())

    return clustered

In [36]:
# Cluster on RSSI (together with the location id) and replace every per-location frame by its labelled version.
df_list = make_cluster(df_list,["RSSI","locate"]) 

生のデータでは、直接波と反射波が混ざっているので、後で学習しやすいように直接波には0、反射波には1とラベル付けを行う。

### 1秒ごとのRSSIの平均値をとる

In [37]:
def max_value(df_list):
    """
    Return the largest ``time`` value found in any frame of ``df_list``.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Frames that each have a ``time`` column.

    Returns
    -------
    The overall maximum of ``time``, or 0 when the list is empty or no frame has a
    value greater than 0.
    """
    # The accumulator must live outside the loop; resetting it per frame would
    # return the maximum of the last frame only.
    largest = 0
    for df in df_list:
        frame_max = df["time"].max()
        # A NaN maximum (empty frame) compares False and is skipped.
        if largest < frame_max:
            largest = frame_max
    return largest

#座標として、引数にx,yを追加
def get_mean(df,colum_name,locate,x,y,time_max):    
    """
    Average the RSSI per second, per receiver and per wave type for one location.

    For every second ``0 .. time_max`` (inclusive) two rows are produced, in this
    order: the direct-wave row (cluster 0) and the reflected-wave row (cluster 1).
    Each row holds the mean RSSI of every receiver for that second, followed by
    ``time, locate, x, y, cluster``. A receiver without readings gets NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Readings of one location with ``time``, ``device``, ``cluster`` and ``RSSI``.
        Receivers are expected to be numbered 1..n in ``device``.
    colum_name : list of str
        Output column names: one per receiver followed by the five metadata columns.
        The number of receivers is taken from this list (``len(colum_name) - 5``).
    locate, x, y :
        Location id and coordinates written into every row.
    time_max : int
        Largest second to emit.

    Returns
    -------
    pandas.DataFrame
        ``2 * (time_max + 1)`` rows, float columns named ``colum_name``, index 0..n-1.
    """
    # Five trailing columns are metadata, the rest are receivers. Deriving the count
    # from the column list keeps the row width and the header consistent and avoids
    # reading a module-level variable.
    n_devices = len(colum_name) - 5

    # One grouped pass over the data instead of a full-frame mask per
    # (second, receiver, cluster) combination.
    means = df.groupby(["time", "device", "cluster"])["RSSI"].mean().to_dict()
    missing = float("nan")

    rows = []
    # time_max itself is a valid second, hence the + 1.
    for time_num in range(time_max + 1):
        for cluster in (0, 1):
            row = [means.get((time_num, dev, cluster), missing) for dev in range(1, n_devices + 1)]
            row.extend([time_num, locate, x, y, cluster])
            rows.append(row)

    # Build the frame once, with a flat (not nested) column list; float dtype matches
    # what row-wise appending of numeric Series used to produce.
    return pd.DataFrame(rows, columns=list(colum_name), dtype=float)

# Build the per-second training table: one block of rows per location.
column_name = ["dev1","dev2","dev3","dev4","time","locate","x","y","cluster"]
time_max = max_value(df_list)

location_frames = []
for df in df_list:
    # A location without readings has no id or coordinates to read; skip it.
    if df.empty:
        continue
    # locate, x and y are constant within a per-location frame, so the first row is
    # enough (position 1 would fail on a single-row frame).
    locate = df["locate"].iloc[0]
    x = df["x"].iloc[0]
    y = df["y"].iloc[0]
    location_frames.append(get_mean(df,column_name,locate,x,y,time_max))

# Concatenate once instead of growing the frame inside the loop.
train_df = pd.concat(location_frames) if location_frames else pd.DataFrame()

KeyboardInterrupt: 

それぞれの地点での1秒ごとのRSSIの平均値を計算し、加工する。

In [None]:
# Spot check: mean RSSI of the cluster-1 rows recorded by device 1 in the first per-location frame.
df_list[0][(df_list[0].device == 1) & (df_list[0].cluster == 1)].RSSI.mean()

In [None]:
# Per wave type (cluster 0 = direct, 1 = reflected): keep rows with at least 4 non-null
# values, interpolate the remaining gaps, then drop rows that still contain NaN.
# NOTE(review): rows built by get_mean always carry five non-null metadata values, so
# thresh=4 looks like it never drops a row — confirm the intended threshold.
direct, reflect = (
    train_df[train_df.cluster == label].dropna(thresh=4).interpolate().dropna()
    for label in (0, 1)
)
train = pd.concat([direct,reflect])

# Metadata columns are whole numbers; store them as integers.
for name in ("locate", "time", "x", "y", "cluster"):
    train[name] = train[name].astype("int")
train.count()

In [None]:
# Split into training and test data by location: locations 2, 4, 5, 6 and 8 are held
# out for testing, all other rows are used for training.
# (Hold-out sets tried earlier: {2, 4, 6, 8} and {4, 6}.)
held_out = train.locate.isin([2, 4, 5, 6, 8])
train_s = train[~held_out]
test_s = train[held_out]

### 加工データの読み込み

In [3]:
# Load the pre-processed per-second dataset.
train = pd.read_csv("./加工データセット/pattarn16.csv")

# Drop the "Unnamed: 0" column that comes with the CSV. The result has to be assigned:
# drop() returns a new frame and leaves `train` untouched otherwise.
train = train.drop("Unnamed: 0",axis = 1)

# Metadata columns are whole numbers; store them as integers.
for name in ("locate", "time", "x", "y", "cluster"):
    train[name] = train[name].astype("int")

train.head(10)

Unnamed: 0,dev1,dev2,dev3,dev4,time,locate,x,y,cluster
0,-79.000000,-71.000000,-85.000000,-72.500000,9,1,0,0,0
1,-78.000000,-73.000000,-85.250000,-71.400000,10,1,0,0,0
2,-78.000000,-73.250000,-85.000000,-69.750000,11,1,0,0,0
3,-77.800000,-73.500000,-85.000000,-73.000000,12,1,0,0,0
4,-79.250000,-70.333333,-85.666667,-71.166667,13,1,0,0,0
5,-80.500000,-73.000000,-85.333333,-74.800000,14,1,0,0,0
6,-78.833333,-70.666667,-86.000000,-73.666667,15,1,0,0,0
7,-80.000000,-78.000000,-85.500000,-71.666667,16,1,0,0,0
8,-78.333333,-69.500000,-85.500000,-72.833333,17,1,0,0,0
9,-78.000000,-73.000000,-85.750000,-72.666667,18,1,0,0,0


In [None]:
# List the columns of `train` that contain the value -79.25 anywhere.
train.columns[train[train == -79.250000].any()]

上記の表の説明をすると、x,yが0,0の時のそれぞれの受信機が計測した1秒ごとのRSSIの値が並んでいる。

### 学習用データセット1

In [None]:
# Dataset 1 — split by location: locations 2, 4, 5, 6 and 8 are held out for testing,
# all other rows are used for training.
# (Variants tried earlier: hold out {2, 4, 6, 8}; hold out {1, 2, 4, 6, 8}; train on everything.)
held_out = train.locate.isin([2, 4, 5, 6, 8])
train_s = train[~held_out]
test_s = train[held_out]

### 学習用データセット2

In [4]:
# Dataset 2 — random split: shuffle all rows, use the first 3/4 for training and the
# rest for testing. The shuffle is seeded so the split (and every score derived from
# it) is the same on each run.
SPLIT_SEED = 123
shuffled = train.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
n_train = len(shuffled) * 3 // 4
train_s = shuffled.iloc[:n_train]
test_s = shuffled.iloc[n_train:]

### 機械学習

In [5]:
from sklearn.utils import shuffle

# Seed for the row shuffles so results are reproducible.
SHUFFLE_SEED = 123

# Feature columns fed to the models (dev3 is left out here; feature sets tried earlier:
# all four receivers + cluster, and dev1 + dev3 + cluster).
FEATURES = ["dev1","dev2","dev4","cluster"]

# Feature matrices and two-column (x, y) targets.
train_x = np.array(train_s[FEATURES])
train_y = np.array(train_s[["x","y"]])
test_x = np.array(test_s[FEATURES])
test_y = np.array(test_s[["x","y"]])

# Features and targets are shuffled together so that row i still describes one sample.
train_x,train_y = shuffle(train_x,train_y,random_state=SHUFFLE_SEED)
test_x,test_y = shuffle(test_x,test_y,random_state=SHUFFLE_SEED)

# Per-axis targets for the single-output models. They are sliced from the arrays that
# were shuffled with the features; shuffling them separately would pair every feature
# row with the coordinates of a different sample.
train_y_x,train_y_y = train_y[:, 0],train_y[:, 1]
test_y_x,test_y_y = test_y[:, 0],test_y[:, 1]

In [None]:
# Display the training feature matrix.
train_x

### ランダムフォレスト

In [6]:
import sklearn
from sklearn.ensemble import RandomForestRegressor

# Small random forest predicting (x, y) jointly.
model = RandomForestRegressor(
    n_estimators=7,
    max_depth=4,
    max_leaf_nodes=3,
    bootstrap=True,
    random_state=123,
)

reg = model.fit(train_x,train_y)
pred = reg.predict(test_x)

# Predictions next to the true coordinates, row by row.
hoge = pd.concat(
    [
        pd.DataFrame(pred, columns=["pred_x","pred_y"]),
        pd.DataFrame(test_y, columns=["answer_x","answer_y"]),
    ],
    axis=1,
)
# Distinct (truncated) predictions versus distinct true values, per axis.
for axis_name in ("x", "y"):
    print(hoge["pred_" + axis_name].astype(int).unique(),hoge["answer_" + axis_name].unique())
hoge

[128   9 104 111  94  50 124  63] [100   0 200]
[ 82 131 120  89  96 126  87 125] [200 100   0]


Unnamed: 0,pred_x,pred_y,answer_x,answer_y
0,128.188931,82.322696,100,200
1,9.265714,131.772584,0,100
2,104.667751,120.878171,0,200
3,128.188931,82.322696,100,0
4,104.667751,120.878171,0,100
5,128.188931,82.322696,100,200
6,128.188931,82.322696,200,100
7,9.265714,131.772584,0,0
8,104.667751,120.878171,0,200
9,9.265714,131.772584,0,200


In [None]:
# Mean absolute error per axis, and the Euclidean norm of those two means.
# NOTE(review): the last figure is not the mean of the per-sample distances — confirm
# which metric is intended.
ave1 = (hoge.pred_x - hoge.answer_x).abs().mean()
ave2 = (hoge.pred_y - hoge.answer_y).abs().mean()
yuku = np.sqrt(ave1**2 + ave2**2)
for template, value in (
    ("xの誤差の平均値:%s", ave1),
    ("yの誤差の平均値:%s", ave2),
    ("ユークリッド距離:%s", yuku),
):
    print(template % value)

In [None]:
# Per-sample distance between prediction and answer, truncated to whole units;
# print every distinct distance with the number of samples that have it.
b = np.sqrt((hoge.pred_x - hoge.answer_x)**2 + (hoge.pred_y - hoge.answer_y)**2).astype(int)
a = np.sort(b.unique())
for dist in a:
    print(dist,(b == dist).sum())

In [None]:
# Number of samples with an error below 100, then with an error of 100 or more.
print((b < 100).sum())
print((b >= 100).sum())

### 線形回帰

In [None]:
from sklearn import linear_model

# Ordinary least squares predicting (x, y) jointly.
model = linear_model.LinearRegression()
reg = model.fit(train_x,train_y)
pred = reg.predict(test_x)

# Predictions next to the true coordinates, row by row.
hoge = pd.concat(
    [
        pd.DataFrame(pred, columns=["pred_x","pred_y"]),
        pd.DataFrame(test_y, columns=["answer_x","answer_y"]),
    ],
    axis=1,
)
hoge

In [None]:
# Mean absolute error per axis, and the Euclidean norm of those two means.
# NOTE(review): the last figure is not the mean of the per-sample distances — confirm
# which metric is intended.
ave1 = (hoge.pred_x - hoge.answer_x).abs().mean()
ave2 = (hoge.pred_y - hoge.answer_y).abs().mean()
yuku = np.sqrt(ave1**2 + ave2**2)
for template, value in (
    ("xの誤差の平均値:%s", ave1),
    ("yの誤差の平均値:%s", ave2),
    ("ユークリッド距離:%s", yuku),
):
    print(template % value)

### カーネルリッジ

In [None]:
from sklearn.kernel_ridge import KernelRidge

# Kernel ridge regression (RBF kernel) predicting (x, y) jointly.
model = KernelRidge(kernel='rbf', alpha=1.0)

reg = model.fit(train_x,train_y)
pred = reg.predict(test_x)

# Predictions next to the true coordinates, row by row.
hoge = pd.concat(
    [
        pd.DataFrame(pred, columns=["pred_x","pred_y"]),
        pd.DataFrame(test_y, columns=["answer_x","answer_y"]),
    ],
    axis=1,
)
hoge

In [None]:
# Mean absolute error per axis, and the Euclidean norm of those two means.
# NOTE(review): the last figure is not the mean of the per-sample distances — confirm
# which metric is intended.
ave1 = (hoge.pred_x - hoge.answer_x).abs().mean()
ave2 = (hoge.pred_y - hoge.answer_y).abs().mean()
yuku = np.sqrt(ave1**2 + ave2**2)
for template, value in (
    ("xの誤差の平均値:%s", ave1),
    ("yの誤差の平均値:%s", ave2),
    ("ユークリッド距離:%s", yuku),
):
    print(template % value)

### xgboost

In [None]:
# Predict the x coordinate with gradient-boosted trees.
import xgboost as xgb
model = xgb.XGBRegressor(max_depth=2)
reg = model.fit(train_x,train_y_x)
pred = reg.predict(test_x)

# Predicted x next to the true x, row by row.
hoge1 = pd.concat(
    [pd.DataFrame(pred, columns=["pred_x"]), pd.DataFrame(test_y_x, columns=["answer_x"])],
    axis=1,
)

In [None]:
# Predict the y coordinate with gradient-boosted trees.
import xgboost as xgb
model = xgb.XGBRegressor(max_depth=4)
reg = model.fit(train_x,train_y_y)
pred = reg.predict(test_x)

# Predicted y next to the true y, row by row.
hoge2 = pd.concat(
    [pd.DataFrame(pred, columns=["pred_y"]), pd.DataFrame(test_y_y, columns=["answer_y"])],
    axis=1,
)

In [None]:
# Display the x predictions next to the true x values.
hoge1

In [None]:
# Mean absolute error per axis (x from hoge1, y from hoge2), and the Euclidean norm of
# those two means.
# NOTE(review): the last figure is not the mean of the per-sample distances — confirm
# which metric is intended.
ave1 = (hoge1.pred_x - hoge1.answer_x).abs().mean()
ave2 = (hoge2.pred_y - hoge2.answer_y).abs().mean()
yuku = np.sqrt(ave1**2 + ave2**2)
for template, value in (
    ("xの誤差の平均値:%s", ave1),
    ("yの誤差の平均値:%s", ave2),
    ("ユークリッド距離:%s", yuku),
):
    print(template % value)

In [None]:
# Per-sample distance between prediction and answer (x from hoge1, y from hoge2),
# truncated to whole units; print every distinct distance with its sample count.
b = np.sqrt((hoge1.pred_x - hoge1.answer_x)**2 + (hoge2.pred_y - hoge2.answer_y)**2).astype(int)
a = np.sort(b.unique())
for dist in a:
    print(dist,(b == dist).sum())

In [None]:
# Number of samples with an error below 100, then with an error of 100 or more.
print((b < 100).sum())
print((b >= 100).sum())

### ロジスティック回帰

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a classifier on the x coordinate (the coordinate values act as class labels).
model = LogisticRegression()
reg = model.fit(train_x,train_y_x)
pred = reg.predict(test_x)

# Predicted x next to the true x, row by row.
hoge = pd.concat(
    [pd.DataFrame(pred, columns=["pred_x"]), pd.DataFrame(test_y_x, columns=["answer_x"])],
    axis=1,
)
hoge

In [None]:
# Mean absolute error of the x prediction.
ave1 = (hoge.pred_x - hoge.answer_x).abs().mean()
print("xの誤差の平均値:%s" % (ave1,))

### SVR

In [None]:
from sklearn import svm

# Support vector regression (RBF kernel) on the x coordinate.
model = svm.SVR(kernel="rbf")
reg = model.fit(train_x,train_y_x)
pred = reg.predict(test_x)

# Predicted x next to the true x, row by row.
hoge = pd.concat(
    [pd.Series(pred, name="pred_x"), pd.Series(test_y_x, name="answer_x")],
    axis=1,
)
hoge

In [None]:
# Mean absolute error of the x prediction.
ave1 = (hoge.pred_x - hoge.answer_x).abs().mean()
print("xの誤差の平均値:%s" % (ave1,))

### リッジ回帰

In [None]:
# Ridge regression predicting (x, y) jointly.
model = linear_model.Ridge(alpha=1.0)
reg = model.fit(train_x,train_y)
pred = reg.predict(test_x)

# Predictions next to the true coordinates, row by row.
hoge = pd.concat(
    [
        pd.DataFrame(pred, columns=["pred_x","pred_y"]),
        pd.DataFrame(test_y, columns=["answer_x","answer_y"]),
    ],
    axis=1,
)
hoge

In [None]:
# Mean absolute error per axis, and the Euclidean norm of those two means.
# NOTE(review): the last figure is not the mean of the per-sample distances — confirm
# which metric is intended.
ave1 = (hoge.pred_x - hoge.answer_x).abs().mean()
ave2 = (hoge.pred_y - hoge.answer_y).abs().mean()
yuku = np.sqrt(ave1**2 + ave2**2)
for template, value in (
    ("xの誤差の平均値:%s", ave1),
    ("yの誤差の平均値:%s", ave2),
    ("ユークリッド距離:%s", yuku),
):
    print(template % value)

In [None]:
from sklearn.neural_network import MLPRegressor
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search module
# was removed in scikit-learn 0.20.
from sklearn.model_selection import GridSearchCV

# Candidate hidden-layer layouts for the neural network.
parameters = {'hidden_layer_sizes' : [(100,), (100, 10), (100, 100, 10), (100, 100, 100, 10)]}

# Search for the best hidden-layer layout, then predict with the best model found.
model = GridSearchCV(MLPRegressor(), parameters)
reg = model.fit(train_x,train_y)
pred = reg.predict(test_x)

# Predictions next to the true coordinates, row by row.
hoge = pd.concat([pd.DataFrame(pred),pd.DataFrame(test_y)],axis=1)
hoge.columns = ["pred_x","pred_y","answer_x","answer_y"]
hoge.pred_x

In [None]:
# Mean absolute error per axis, and the Euclidean norm of those two means.
# NOTE(review): the last figure is not the mean of the per-sample distances — confirm
# which metric is intended.
ave1 = (hoge.pred_x - hoge.answer_x).abs().mean()
ave2 = (hoge.pred_y - hoge.answer_y).abs().mean()
yuku = np.sqrt(ave1**2 + ave2**2)
for template, value in (
    ("xの誤差の平均値:%s", ave1),
    ("yの誤差の平均値:%s", ave2),
    ("ユークリッド距離:%s", yuku),
):
    print(template % value)