In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

plt.rcParams['font.family'] = 'IPAexGothic' #全体のフォントを設定
plt.rcParams["figure.figsize"] = [20, 12] # グラフのサイズを指定
plt.rcParams['font.size'] = 20 #フォントサイズを設定 default : 12
plt.rcParams['xtick.labelsize'] = 15 # 横軸のフォントサイズ
plt.rcParams['ytick.labelsize'] = 15 # 縦軸のフォントサイズ

### データの前処理

In [2]:
# パターン13の各受信機データを取得
df13_1 = pd.read_csv("./取得データセット/受信機1/rasp1_pattern13.csv")
df13_2 = pd.read_csv("./取得データセット/受信機2/nkmr2-dataset13.csv")
df13_3 = pd.read_csv("./取得データセット/受信機3/rasp3_pattern13.csv")
df13_4 = pd.read_csv("./取得データセット/受信機4/nkmr4-dataset13.csv")
df13_2 = df13_2.drop("\\N",axis=1)

# それぞれのカラム名の指定
df13_1.columns = ["UUIDs","Major","Minor","Accuracy","Proximity","TxPower","RSSI","device_name","timestamp"]
df13_2.columns = ["UUIDs","Major","Minor","Accuracy","Proximity","TxPower","RSSI","device_name","timestamp"]
df13_3.columns = ["UUIDs","Major","Minor","Accuracy","Proximity","TxPower","RSSI","device_name","timestamp"]
df13_4.columns = ["UUIDs","Major","Minor","Accuracy","Proximity","TxPower","RSSI","device_name","timestamp"]

# データの結合
df = df13_1.append(df13_2).append(df13_3).append(df13_4)

# 時刻合わせ用ビーコン
set_uuid = "1000000007ae1001b000001c4d8dffff"

# 発信機ごとにデータ分け
df_1 = df[(df['UUIDs']  == "10000000aaaa41441111110000000000") | (df['UUIDs'] == set_uuid)]
df_2 = df[(df['UUIDs']  == "20000000aaaa41441111110000000000") | (df['UUIDs'] == set_uuid)]
df_3 = df[(df['UUIDs']  == "30000000aaaa41441111110000000000") | (df['UUIDs'] == set_uuid)]
df_4 = df[(df['UUIDs']  == "40000000aaaa41441111110000000000") | (df['UUIDs'] == set_uuid)]
df_5 = df[(df['UUIDs']  == "50000000aaaa41441111110000000000") | (df['UUIDs'] == set_uuid)]
df_6 = df[(df['UUIDs']  == "60000000aaaa41441111110000000000") | (df['UUIDs'] == set_uuid)]
df_7 = df[(df['UUIDs']  == "70000000aaaa41441111110000000000") | (df['UUIDs'] == set_uuid)]

# リストに保存
df_list = [df_1,df_2,df_3,df_4,df_5,df_6,df_7]

### 時刻合わせを行う

In [3]:
#　オブジェクト型の目的変数を数値情報に変換
import sklearn.preprocessing as sp

# 前処理用のインスタンスを宣言
le = sp.LabelEncoder()

def set_time(data,set_uuid):
    """
    時刻合わせを行う
    第一引数はdataframe
    第２引数は時刻合わせを行うUUID
    """
    # 時刻合わせ用のビーコンを1秒間隔に設定
    set_time_df = data[data.UUIDs == set_uuid].drop_duplicates(['Minor'],keep = "first")
    # 時刻合わせ用ビーコン以外を定義
    df = data[data.UUIDs != set_uuid]
    # データを結合してインデックスでソート
    df = df.append(set_time_df).sort_index(ascending=True)
    # 時刻合わせ用ビーコンを取得した行にtimeカラムを設定
    df["time"] = df["Minor"].where(df["UUIDs"] == set_uuid)
    # 時刻合わせを行い欠損値がある部分は削除
    df = df.fillna(method="ffill").dropna()
    # int型に変換
    df["time"] = df["time"].astype(int)
    # 時刻合わせ用のビーコンデータ以外
    df = df[df["UUIDs"] != set_uuid]
    
    return df

# 前処理を各データフレームに行う
for (i,df) in enumerate(df_list):
    df_list[i] = set_time(df,set_uuid)
    # device_nameを変換
    df_list[i]["locate"] = i + 1
    # device_nameを変換
    df_list[i]["device"] = le.fit_transform(df_list[i][df_list[i].columns[7]]) + 1

In [4]:
df_list[0]

Unnamed: 0,UUIDs,Major,Minor,Accuracy,Proximity,TxPower,RSSI,device_name,timestamp,time,locate,device
10,10000000aaaa41441111110000000000,2,38,1.154145,near,-78,-81,rasp1,2017/12/06 10:19:51,1,1,3
17,10000000aaaa41441111110000000000,2,38,1.465642,near,-78,-86,nkmr-2,2017/12/06 10:08:45,1,1,1
17,10000000aaaa41441111110000000000,2,38,1.154145,near,-78,-81,rasp1,2017/12/06 10:19:51,1,1,3
22,10000000aaaa41441111110000000000,2,38,1.465642,near,-78,-86,rasp1,2017/12/06 10:19:52,1,1,3
26,10000000aaaa41441111110000000000,2,38,1.465642,near,-78,-86,nkmr-2,2017/12/06 10:08:46,1,1,1
29,10000000aaaa41441111110000000000,2,38,1.691564,near,-78,-89,rasp3,2017/12/06 10:39:54,1,1,4
30,10000000aaaa41441111110000000000,2,38,1.691564,near,-78,-89,nkmr-4,2017/12/06 10:39:57,1,1,2
39,10000000aaaa41441111110000000000,2,38,1.691564,near,-78,-89,nkmr-4,2017/12/06 10:39:57,2,1,2
42,10000000aaaa41441111110000000000,2,38,1.154145,near,-78,-81,rasp1,2017/12/06 10:19:52,2,1,3
44,10000000aaaa41441111110000000000,2,38,1.774360,near,-78,-90,rasp3,2017/12/06 10:39:54,2,1,4


### 入射波と反射波を判別する

In [5]:
# KMeansライブラリをインポート
from sklearn.cluster import KMeans

In [6]:
def three_d_mc(df_list,column,*,N_CLUSTERS=2):
    """3次元の入力に対するクラスタリングを行う"""
    
    # 空のデータフレームを宣言
    df_cluster = pd.DataFrame()
    return_list = []
    
    for df in df_list: # 入力されたデータフレームの数だけ
        # 空のデータフレームを作成
        hoge = pd.DataFrame()
        for num in range(len(df["device"].unique())): # 受信機の数だけ
            # 受信機ごとにデータフレームを複製
            device_df = df[df["device"] == num+1]
            # クラスタリング
            pred = KMeans(n_clusters=N_CLUSTERS).fit_predict(np.array(device_df[column]))
            # clusterカラムを作成し予測結果を代入
            device_df.loc[:,("cluster")] = pred
            # RSSI値の平均値を比較して入射波と反射波の判定を行う
            if device_df[device_df["cluster"] == 0]["RSSI"].mean() > device_df[device_df["cluster"] == 1]["RSSI"].mean():
                pass
            else:
                device_df[device_df["cluster"] == 0]["device"] = 3
                device_df[device_df["cluster"] == 1]["device"] = 0
                device_df[device_df["cluster"] == 3]["device"] = 1
            
            # データを連結
            hoge = pd.concat([hoge,device_df],ignore_index=True)
        # 予測結果をリストにして返す
        return_list.append(hoge)
    return return_list

In [7]:
df_list = three_d_mc(df_list,["RSSI","device"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs

In [8]:
df_list[1]

Unnamed: 0,UUIDs,Major,Minor,Accuracy,Proximity,TxPower,RSSI,device_name,timestamp,time,locate,device,cluster
0,20000000aaaa41441111110000000000,2,40,1.691564,near,-78,-89,nkmr-2,2017/12/06 10:08:45,1,2,1,0
1,20000000aaaa41441111110000000000,2,40,1.332051,near,-78,-84,nkmr-2,2017/12/06 10:08:45,1,2,1,1
2,20000000aaaa41441111110000000000,2,40,1.269894,near,-78,-83,nkmr-2,2017/12/06 10:08:45,1,2,1,1
3,20000000aaaa41441111110000000000,2,40,1.612630,near,-78,-88,nkmr-2,2017/12/06 10:08:46,1,2,1,0
4,20000000aaaa41441111110000000000,2,40,1.691564,near,-78,-89,nkmr-2,2017/12/06 10:08:46,1,2,1,0
5,20000000aaaa41441111110000000000,2,40,1.952310,near,-78,-92,nkmr-2,2017/12/06 10:08:46,1,2,1,0
6,20000000aaaa41441111110000000000,2,40,1.612630,near,-78,-88,nkmr-2,2017/12/06 10:08:46,2,2,1,0
7,20000000aaaa41441111110000000000,2,40,1.952310,near,-78,-92,nkmr-2,2017/12/06 10:08:47,2,2,1,0
8,20000000aaaa41441111110000000000,2,40,1.691564,near,-78,-89,nkmr-2,2017/12/06 10:08:47,2,2,1,0
9,20000000aaaa41441111110000000000,2,40,1.332051,near,-78,-84,nkmr-2,2017/12/06 10:08:48,3,2,1,1


### 時刻毎に平均値をとり新たにデータフレームを作成

In [9]:
def get_mean(df,colum_name,locate):    
    """
    処理時間かかります。
    時刻毎の中央値をとる
    第一引数、データフレーム
    第２引数、作成したいカラム名
    """
    # 空のデータフレームとリストを用意
    Direct_list = []
    Reflect_list = []
    mid_df = pd.DataFrame()
    
    # timeのユニーク数だけ
    for time_num in range(len(df_list[0]["time"].unique())): 
        # 受信機の数だけ
        for i in range(len(df_list[0]["device"].unique())): 
            # timeラベルが同じ値に対して平均値をとる
            Direct = df[(df["time"] == time_num) & (df["device"] == (i+1)) & (df["cluster"] == 0)]["RSSI"].mean()
            Reflect = df[(df["time"] == time_num) & (df["device"] == (i+1)) & (df["cluster"] == 1)]["RSSI"].mean()
            # 平均値をリストに追加する
            Direct_list.append(Direct)
            Reflect_list.append(Reflect)
        # 平均値を追加したリストに(時間、地点、クラスタ番号を追加)
        Direct_list.extend([time_num,locate,0])
        Reflect_list.extend([time_num,locate,1])
        # データフレームに各要素を追加
        mid_df = mid_df.append(pd.Series(Direct_list),ignore_index=True)
        mid_df = mid_df.append(pd.Series(Reflect_list),ignore_index=True)
        # 平均値のリストを初期化
        Direct_list = []
        Reflect_list = []
    # カラム名を定義
    mid_df.columns = [colum_name]
    
    return mid_df

#前処理を各データフレームと地点毎に行う
train_df = pd.DataFrame()
column_name = ["dev1","dev2","dev3","dev4","time","locate","cluster"]
for (locate,df) in enumerate(df_list):
    mid_df = get_mean(df,column_name,(locate+1))
    train_df = pd.concat([train_df,mid_df])

In [10]:
train_df

Unnamed: 0,dev1,dev2,dev3,dev4,time,locate,cluster
0,,,,,0.0,1.0,0.0
1,,,,,0.0,1.0,1.0
2,,-89.0,-81.000000,-89.000000,1.0,1.0,0.0
3,-86.0,,-86.000000,,1.0,1.0,1.0
4,,-89.0,-81.000000,-90.000000,2.0,1.0,0.0
5,-87.0,,-86.000000,,2.0,1.0,1.0
6,,,-81.000000,-90.000000,3.0,1.0,0.0
7,,,-86.000000,,3.0,1.0,1.0
8,,-89.0,-81.000000,-90.000000,4.0,1.0,0.0
9,-87.0,-90.0,-86.000000,,4.0,1.0,1.0


この方法だと欠損値の数が多すぎる。また、保管も難しい。

### 欠損値の補完

In [12]:
# 欠損値を削除
mid_df = mid_df.dropna()
mid_df

Unnamed: 0,dev1,dev2,dev3,dev4,time,locate,cluster
9,-86.000000,-86.000000,-93.000000,-88.333333,4.0,7.0,1.0
10,-84.500000,-81.400000,-91.000000,-82.500000,5.0,7.0,0.0
15,-86.000000,-88.000000,-92.000000,-88.000000,7.0,7.0,1.0
20,-84.500000,-82.000000,-90.250000,-82.000000,10.0,7.0,0.0
22,-84.500000,-82.166667,-90.000000,-83.000000,11.0,7.0,0.0
24,-85.000000,-81.750000,-89.500000,-83.000000,12.0,7.0,0.0
26,-84.000000,-82.000000,-88.000000,-83.000000,13.0,7.0,0.0
51,-86.000000,-87.500000,-93.000000,-89.000000,25.0,7.0,1.0
64,-85.000000,-82.250000,-90.000000,-83.000000,32.0,7.0,0.0
67,-86.000000,-87.000000,-92.000000,-89.000000,33.0,7.0,1.0


欠損値がある行を削除するとデータ数は500以下。どうにかして欠損値の保管をする必要がある？

In [81]:
train_df[train_df.iloc[:,6] == 0]

Unnamed: 0,dev1,dev2,dev3,dev4,time,locate,cluster
0,,,,,0.0,1.0,0.0
2,,-89.000000,-81.000000,-89.000000,1.0,1.0,0.0
4,,-89.000000,-81.000000,-90.000000,2.0,1.0,0.0
6,,,-81.000000,-90.000000,3.0,1.0,0.0
8,,-89.000000,-81.000000,-90.000000,4.0,1.0,0.0
10,,-89.000000,-81.000000,-90.333333,5.0,1.0,0.0
12,,-89.000000,-81.250000,-89.333333,6.0,1.0,0.0
14,,-89.000000,-82.500000,-90.333333,7.0,1.0,0.0
16,,-89.000000,-82.000000,-89.500000,8.0,1.0,0.0
18,,-89.000000,-83.200000,-90.000000,9.0,1.0,0.0


クラスタが直接波のデータを見る。欠損値の数は相変わらず多い。

In [85]:
train_df.dropna(thresh=4)

Unnamed: 0,dev1,dev2,dev3,dev4,time,locate,cluster
2,,-89.0,-81.000000,-89.000000,1.0,1.0,0.0
3,-86.0,,-86.000000,,1.0,1.0,1.0
4,,-89.0,-81.000000,-90.000000,2.0,1.0,0.0
5,-87.0,,-86.000000,,2.0,1.0,1.0
6,,,-81.000000,-90.000000,3.0,1.0,0.0
7,,,-86.000000,,3.0,1.0,1.0
8,,-89.0,-81.000000,-90.000000,4.0,1.0,0.0
9,-87.0,-90.0,-86.000000,,4.0,1.0,1.0
10,,-89.0,-81.000000,-90.333333,5.0,1.0,0.0
11,-87.0,-90.0,-87.000000,,5.0,1.0,1.0


欠損していない列が4つの行を表示。データ数は確保できるが相変わらず欠損値が多い。

In [89]:
train_df.dropna(thresh=6)

Unnamed: 0,dev1,dev2,dev3,dev4,time,locate,cluster
2,,-89.000000,-81.000000,-89.000000,1.0,1.0,0.0
4,,-89.000000,-81.000000,-90.000000,2.0,1.0,0.0
8,,-89.000000,-81.000000,-90.000000,4.0,1.0,0.0
9,-87.000000,-90.000000,-86.000000,,4.0,1.0,1.0
10,,-89.000000,-81.000000,-90.333333,5.0,1.0,0.0
11,-87.000000,-90.000000,-87.000000,,5.0,1.0,1.0
12,,-89.000000,-81.250000,-89.333333,6.0,1.0,0.0
14,,-89.000000,-82.500000,-90.333333,7.0,1.0,0.0
15,-87.000000,-90.000000,-86.666667,,7.0,1.0,1.0
16,,-89.000000,-82.000000,-89.500000,8.0,1.0,0.0


何パターンか試した結果欠損値を許す列の値が１つの場合がデータ数、欠損値の量を考慮すると良い。

In [92]:
direct = train_df[train_df.iloc[:,6] == 0].dropna(thresh=6).interpolate().dropna()
reflect = train_df[train_df.iloc[:,6] == 1].dropna(thresh=6).interpolate().dropna()
train = pd.concat([direct,reflect])
train

Unnamed: 0,dev1,dev2,dev3,dev4,time,locate,cluster
22,-88.000000,-89.000000,-83.000000,-90.500000,11.0,1.0,0.0
24,-88.000000,-89.000000,-84.000000,-90.500000,12.0,1.0,0.0
26,-88.000000,-88.500000,-83.000000,-90.000000,13.0,1.0,0.0
28,-88.000000,-88.750000,-82.200000,-89.000000,14.0,1.0,0.0
30,-88.000000,-89.000000,-82.000000,-90.500000,15.0,1.0,0.0
32,-88.000000,-89.000000,-82.333333,-90.000000,16.0,1.0,0.0
34,-88.000000,-89.000000,-81.000000,-90.000000,17.0,1.0,0.0
36,-88.000000,-89.000000,-83.000000,-90.000000,18.0,1.0,0.0
38,-88.250000,-89.000000,-81.000000,-90.000000,19.0,1.0,0.0
44,-88.500000,-89.000000,-81.600000,-90.500000,22.0,1.0,0.0


欠損値処理後のデータセット。前後の値を用いて線形補間を行った。

### 学習用のデータフレームを作成

In [94]:
train_x = np.array(train[["dev1","dev2","dev3","dev4","time","cluster"]])
train_y = np.array(train["locate"])

### 学習して予測まで

In [99]:
import xgboost as xgb
from sklearn.model_selection import KFold

# 検定を行うデータを定義
X = train_x
y = train_y.ravel()
kf = KFold(n_splits=5,shuffle=True)

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # オブジェクトを作成し学習
    mod = xgb.XGBClassifier()
    clf = mod.fit(X_train,y_train)
    score = clf.score(X_test,y_test)
    print("精度 : %s" % score)

精度 : 0.99945622621
精度 : 1.0
精度 : 0.998911860718
精度 : 0.998367791077
精度 : 0.997823721436


非常に高い精度が出た！！。でも何か怪しい、、