## Préparation des données CGU-BES Dataset

In [60]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join   
import csv                         # csv files
import warnings                    # Warnings Library


warnings.filterwarnings('ignore')  # delete all warnings in python


def read_filenames(path):
    '''
     Read all names files in folder
    :path: file path
    :return: files names
    '''
    files = sorted([f for f in listdir(path) if isfile(join(path, f))])
    filenames = [files[k] for k in range(1,len(files)-1)]
    return filenames

def read_names(filenames,sep,col):
    '''
     Read the names without the extention .txt or others
    :params: filenames 
    :return: names
    '''
    names = [filenames[k].split(sep)[col] for k in range(len(filenames))]
    return names

def file_to_csv(files, path_in, path_out):
    '''
     Convert any file to file.csv
    '''
    
    with open(path_in+'/'+files + '.txt', 'r') as in_file:                  
        stripped = (line.strip() for line in in_file)
        lines = (line.split(",") for line in stripped if line)
        
        with open(path_out+'/'+files + '.csv', 'w') as out_file:
            writer = csv.writer(out_file)
            writer.writerows(lines)

            

In [4]:
path_in = "/Users/davidjeannette/Desktop/Alex_Project/CGU-BES Dataset" 
path_out = "/Users/davidjeannette/Desktop/Alex_Project/CGU-BES Dataset CSV" 

filenames = read_filenames(path_in)
files_ = read_names(filenames,sep='.',col=0)

for file in files_:
    file_to_csv(file, path_in, path_out)

files = list(dict.fromkeys(files_)) # remove dupliucate names of file


In [5]:
doc1= [pd.read_csv(path_out+'/'+ file + '.csv',';') for file in files]  # list of dataframe


In [6]:
# 1. Récupération de index des colonnes du dataframe

cols = doc1[0].loc[2][0].split(',')  # ['AccX', ' AccY', ' AccZ', ' GyroX', ' GyroY', ' GyroZ']
cols

['AccX', ' AccY', ' AccZ', ' GyroX', ' GyroY', ' GyroZ']

In [7]:
# 2. Création d'une lieste de dataframe de valeurs des senseurs 

docv = []
y_ = []
for l in range(len(doc1)):
    for s in range(4,len(doc1[l])):
        docv.append([float(doc1[l].loc[s][0].split(',')[i]) for i in range(len(cols))])
        y_.append(doc1[l].loc[1][0].split(',')[0])

In [8]:
# 3. visualisation du dataframe

df1= pd.DataFrame(data=docv, columns=cols)
y = pd.DataFrame(data=y_, columns=['Y'])
df1.head()

Unnamed: 0,AccX,AccY,AccZ,GyroX,GyroY,GyroZ
0,-0.084249,0.875458,0.533578,0.238891,-0.022853,-0.052175
1,-0.076923,0.8779,0.52381,0.226904,-0.030845,-0.064162
2,-0.074481,0.89011,0.528694,0.230899,-0.030845,-0.056171
3,-0.069597,0.894994,0.521368,0.218912,-0.03484,-0.060167
4,-0.072039,0.90232,0.52381,0.218912,-0.038836,-0.060167


In [9]:
# 4. Convertir la liste de listes de dataframe en un seul dataFrame

df = pd.concat([df1, y], axis=1)      # concatenate dataframe target Y and sensors
df.head()                             # final dataframe

Unnamed: 0,AccX,AccY,AccZ,GyroX,GyroY,GyroZ,Y
0,-0.084249,0.875458,0.533578,0.238891,-0.022853,-0.052175,BackwardFall
1,-0.076923,0.8779,0.52381,0.226904,-0.030845,-0.064162,BackwardFall
2,-0.074481,0.89011,0.528694,0.230899,-0.030845,-0.056171,BackwardFall
3,-0.069597,0.894994,0.521368,0.218912,-0.03484,-0.060167,BackwardFall
4,-0.072039,0.90232,0.52381,0.218912,-0.038836,-0.060167,BackwardFall


In [10]:
# Convertir le dataframe en fichier csv

path="/Users/davidjeannette/Desktop/Alex_Project/notebooks/clean_data.csv"
df.to_csv(path) # DataFrame to csv
print('--- done ---')

--- done ---


## Preparation des données SisFall datatset


Dans cette section, nous expliquons comment préparer les données du jeu de données: 

---- SisFall: A Fall and Movement Dataset ----

Créé par:
A. Sucerquia, J.D. LÛpez, J.F. Vargas-Bonilla
SISTEMIC, Faculty of Engineering, Universidad de Antiquia UDEA
josedavid@udea.edu.co, February 2016 - Version 1.0

### 1. Liste des noms des dossiers d'enregistrements

In [1]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join   
import csv                         # csv files
import warnings                    # Warnings Library


warnings.filterwarnings('ignore')  # delete all warnings in python

# Lecture des dossiers  

path_in = "/Users/davidjeannette/Desktop/Alex_Project/SisFall_dataset" 

# liste des noms des dossiers ordonnés 

foldernames = sorted(listdir(path_in))

### 2. Creation des dossiers vides pour le stockage des fichires d'enregistrements 

In [4]:
# importing os module
import os

# Parent Directories
parent_dir = "/Users/davidjeannette/Desktop/Alex_Project"
 
# Leaf directory
directory = "SisFall_dataset_CSV"

# join Path
path = os.path.join(parent_dir, directory)

# Create the directory

os.makedirs(path)
print("Directory '% s' created" % directory)

# Folders in the leaf directory 

for i in range(len(foldernames)):
    path_ = os.path.join(path,foldernames[i]+'_'+'csv')
    os.makedirs(path_)
    
print(" All folders have been created ")

Directory 'SisFall_dataset_CSV' created
 All folders have been created 


### 3. Convertir les fichiers .txt en fichires .csv

In [5]:
# Fonction pour la lecture des fichiers .txt et convertir en fichiers .csv

def txt_to_csv(path_in,path_out,folder_in,folder_out,file_in, file_out):
    
    with open(path_in+"/"+folder_in+"/"+file_in,'r') as in_file:
        
        for txt in in_file:
            s = (txt.strip() for txt in in_file)                          
            lines = (txt.split(",") for txt in in_file)                       
            
            with open(path_out+"/"+folder_out+"/"+file_out, 'w') as out_file:
                writer = csv.writer(out_file)
                writer.writerows(lines)


In [6]:
# recupération des noms des fichiers à l'interieur des dossiers

filenames = [sorted(listdir(path_in+'/'+foldernames[k])) for k in range(len(foldernames))]

no_folder = len(foldernames)
no_file = len(filenames)

In [7]:
path_in = "/Users/davidjeannette/Desktop/Alex_Project/SisFall_dataset"
path_out = "/Users/davidjeannette/Desktop/Alex_Project/SisFall_dataset_CSV"


for folder_value in range(len(foldernames)):
    
    folder_in = foldernames[folder_value]
    folder_out = folder_in + '_' + 'csv'
    
    for file_value in range(len(filenames[folder_value])):
        
        file_in = filenames[folder_value][file_value]
        file_out = file_in.split(".")[0]+".csv"
    
        txt_to_csv(path_in,path_out,folder_in,folder_out,file_in,file_out)
        
print("-- done --")

-- done --


### 4. Convertir un fichier .csv en dataframe

In [8]:
# Chargement du fichier .csv en dataframe

path_in = "/Users/davidjeannette/Desktop/Alex_Project/SisFall_dataset_CSV/SA01_csv/D01_SA01_R01.csv"

df1 = pd.read_csv(path_in,
                 names=['Acc_X', 'Acc_Y', 'Acc_Z', 'Gyr_X', 'Gyr_Y', 'Gyr_Z', 'Mag_X', 'Mag_Y', 'Mag_Z'])

In [9]:
# Visualisation du dataframe

df1.head()

Unnamed: 0,Acc_X,Acc_Y,Acc_Z,Gyr_X,Gyr_Y,Gyr_Z,Mag_X,Mag_Y,Mag_Z
0,15,-174,-90,-53,-568,-306,48,-675,-254;\n
1,1,-176,-81,-84,-613,-271,-2,-668,-221;\n
2,-10,-180,-77,-104,-647,-227,-34,-697,-175;\n
3,-21,-191,-63,-128,-675,-191,-74,-741,-133;\n
4,-37,-225,-59,-146,-700,-159,-110,-840,-103;\n


**commentaires:**

- Nous pouvons remarquer que la dernière colonne Mag_Z doit être nettoyer et transformer en un entier.

In [10]:
# Nettoyage de caractères inutiles de la colonne  Mag_Z

df1['Mag_Z'] = df1['Mag_Z'].str.split(";\n").str.get(0)

In [11]:
# Visualisation du dataframe

df1.head()

Unnamed: 0,Acc_X,Acc_Y,Acc_Z,Gyr_X,Gyr_Y,Gyr_Z,Mag_X,Mag_Y,Mag_Z
0,15,-174,-90,-53,-568,-306,48,-675,-254
1,1,-176,-81,-84,-613,-271,-2,-668,-221
2,-10,-180,-77,-104,-647,-227,-34,-697,-175
3,-21,-191,-63,-128,-675,-191,-74,-741,-133
4,-37,-225,-59,-146,-700,-159,-110,-840,-103


In [12]:
# Convertir les éléments de la colonne Mag_Z en entier

df1['Mag_Z'] = df1['Mag_Z'].astype(int)

In [13]:
# Ou convertir tous les éléments du dataframe en entier

df1=df1.astype(int)

In [14]:
# Visualisation du dataframe

df1.head()

Unnamed: 0,Acc_X,Acc_Y,Acc_Z,Gyr_X,Gyr_Y,Gyr_Z,Mag_X,Mag_Y,Mag_Z
0,15,-174,-90,-53,-568,-306,48,-675,-254
1,1,-176,-81,-84,-613,-271,-2,-668,-221
2,-10,-180,-77,-104,-647,-227,-34,-697,-175
3,-21,-191,-63,-128,-675,-191,-74,-741,-133
4,-37,-225,-59,-146,-700,-159,-110,-840,-103


In [15]:
# Visualisation des types de éléments du dataframe

df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19998 entries, 0 to 19997
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Acc_X   19998 non-null  int64
 1   Acc_Y   19998 non-null  int64
 2   Acc_Z   19998 non-null  int64
 3   Gyr_X   19998 non-null  int64
 4   Gyr_Y   19998 non-null  int64
 5   Gyr_Z   19998 non-null  int64
 6   Mag_X   19998 non-null  int64
 7   Mag_Y   19998 non-null  int64
 8   Mag_Z   19998 non-null  int64
dtypes: int64(9)
memory usage: 1.4 MB


### 5. Concatenation de plusieurs dataframes

In [16]:
# Chargement du fichier .csv en dataframe

path_in = "/Users/davidjeannette/Desktop/Alex_Project/SisFall_dataset_CSV/SA01_csv/D01_SA01_R01.csv"

df2 = pd.read_csv(path_in,
                 names=['Acc_X', 'Acc_Y', 'Acc_Z', 'Gyr_X', 'Gyr_Y', 'Gyr_Z', 'Mag_X', 'Mag_Y', 'Mag_Z'])

df2['Mag_Z'] = df2['Mag_Z'].str.split(";\n").str.get(0)
df2['Mag_Z'] = df2['Mag_Z'].astype(int)

In [17]:
df2.head()

Unnamed: 0,Acc_X,Acc_Y,Acc_Z,Gyr_X,Gyr_Y,Gyr_Z,Mag_X,Mag_Y,Mag_Z
0,15,-174,-90,-53,-568,-306,48,-675,-254
1,1,-176,-81,-84,-613,-271,-2,-668,-221
2,-10,-180,-77,-104,-647,-227,-34,-697,-175
3,-21,-191,-63,-128,-675,-191,-74,-741,-133
4,-37,-225,-59,-146,-700,-159,-110,-840,-103


In [18]:
# Lecture des dossiers  

path_in = "/Users/davidjeannette/Desktop/Alex_Project/SisFall_dataset_CSV" 

# liste des noms des dossiers ordonnés 

folder_ = sorted(listdir(path_in))

In [19]:
# Initialisation du dataframe
df = pd.DataFrame()

# Index des colonnes
names=['Acc_X', 'Acc_Y', 'Acc_Z', 'Gyr_X', 'Gyr_Y', 'Gyr_Z', 'Mag_X', 'Mag_Y', 'Mag_Z']

# Chemin du dossier principal
path_in = "/Users/davidjeannette/Desktop/Alex_Project/SisFall_dataset_CSV" 

# liste des noms des dossiers ordonnés 
foldernames = sorted(listdir(path_in))

# recupération des noms des fichiers à l'interieur des dossiers
filenames = [sorted(listdir(path_in+'/'+foldernames[k])) for k in range(len(foldernames))]

In [21]:

for folder_value in range(len(foldernames)):
     
    folder = foldernames[folder_value]
    print(folder)
    
    for file_value in range(len(filenames[folder_value])):
        #Chargement du fichier csv
        file_csv = filenames[folder_value][file_value]
        path_ = path_in+'/'+folder+'/'+file_csv
        df_ = pd.read_csv(path_, names=names)
        df_['Mag_Z'] = df_['Mag_Z'].str.split(";\n").str.get(0)
        df_['Mag_Z'] = df_['Mag_Z'].astype(int)
        
        # Genérer le vecteur de valeurs à prédire (i.e. catégorielles) -- Fall (F) or Daily (D)
        df_["y"] = pd.DataFrame([file_csv.split("_")[0][0] for k in range(df_.shape[0])], columns=["y"])
        
        # Concatenation de df et df_
        df = pd.concat([df,df_])

print("-- done --")

SA01_csv
SA02_csv
SA03_csv
SA04_csv
SA05_csv
SA06_csv
SA07_csv
SA08_csv
SA09_csv
SA10_csv
SA11_csv
SA12_csv
SA13_csv
SA14_csv
SA15_csv
SA16_csv
SA17_csv
SA18_csv
SA19_csv
SA20_csv
SA21_csv
SA22_csv
SA23_csv
SE01_csv
SE02_csv
SE03_csv
SE04_csv
SE05_csv
SE06_csv
SE07_csv
SE08_csv
SE09_csv
SE10_csv
SE11_csv
SE12_csv
SE13_csv
SE14_csv
SE15_csv
-- done --


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15854424 entries, 0 to 4998
Data columns (total 10 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   Acc_X   int64 
 1   Acc_Y   int64 
 2   Acc_Z   int64 
 3   Gyr_X   int64 
 4   Gyr_Y   int64 
 5   Gyr_Z   int64 
 6   Mag_X   int64 
 7   Mag_Y   int64 
 8   Mag_Z   int64 
 9   y       object
dtypes: int64(9), object(1)
memory usage: 1.3+ GB


**Commentraires:**

- Dataframe qui a environ 16 millions d'éléments et qui occupe 1.3 gigaoctets.
- 8 variables numériques et 1 variable catégorielle (variable à prédire Fall (F) or Daily (D))

### 6. Statistiques

In [23]:
df.describe()

Unnamed: 0,Acc_X,Acc_Y,Acc_Z,Gyr_X,Gyr_Y,Gyr_Z,Mag_X,Mag_Y,Mag_Z
count,15854420.0,15854420.0,15854420.0,15854420.0,15854420.0,15854420.0,15854420.0,15854420.0,15854420.0
mean,-1.239098,-178.7246,-27.72918,-9.714939,35.17121,-4.465268,-18.90055,-700.4533,-59.38631
std,105.0205,149.9506,124.8955,590.9351,489.5622,398.1916,408.6846,582.8072,506.1571
min,-4096.0,-4096.0,-4096.0,-32303.0,-15491.0,-13150.0,-8192.0,-8192.0,-8192.0
25%,-22.0,-253.0,-92.0,-76.0,-18.0,-33.0,-97.0,-990.0,-317.0
50%,2.0,-230.0,-31.0,-11.0,38.0,-5.0,1.0,-905.0,-68.0
75%,26.0,-54.0,33.0,65.0,95.0,23.0,82.0,-209.0,185.0
max,4094.0,4092.0,4091.0,32767.0,15819.0,13798.0,8191.0,8191.0,8191.0


**Commentaires:**

- Écarts significatifs entre les différentes moyennes --> normalisation des données pour la modélisation 


In [24]:
## Convertir le dataframe en fichier csv

df.to_csv("/Users/davidjeannette/Desktop/Alex_Project/notebooks/out1.csv") 

print("--- done ---")

--- done ---


In [25]:
## Visualisation du Dataframe final

df.head()

Unnamed: 0,Acc_X,Acc_Y,Acc_Z,Gyr_X,Gyr_Y,Gyr_Z,Mag_X,Mag_Y,Mag_Z,y
0,15,-174,-90,-53,-568,-306,48,-675,-254,D
1,1,-176,-81,-84,-613,-271,-2,-668,-221,D
2,-10,-180,-77,-104,-647,-227,-34,-697,-175,D
3,-21,-191,-63,-128,-675,-191,-74,-741,-133,D
4,-37,-225,-59,-146,-700,-159,-110,-840,-103,D


In [26]:
# Visualisation des classes à prédire

pd.DataFrame(df['y'].value_counts())

Unnamed: 0,y
D,10462508
F,5391916


**Commentaires:**

- Classe à prédire non-équilibré --> ne pas utiliser la métrique accuracy


Prochaine étape la modélisation

In [51]:
df = pd.read_csv("/Users/davidjeannette/Desktop/Alex_Project/notebooks/out1.csv")

In [54]:
df['y'] = pd.get_dummies(df['y']).drop('D',axis=1)

# D = 0 et F = 1

In [55]:
df.head()

Unnamed: 0.1,Unnamed: 0,Acc_X,Acc_Y,Acc_Z,Gyr_X,Gyr_Y,Gyr_Z,Mag_X,Mag_Y,Mag_Z,y
0,0,15,-174,-90,-53,-568,-306,48,-675,-254,0
1,1,1,-176,-81,-84,-613,-271,-2,-668,-221,0
2,2,-10,-180,-77,-104,-647,-227,-34,-697,-175,0
3,3,-21,-191,-63,-128,-675,-191,-74,-741,-133,0
4,4,-37,-225,-59,-146,-700,-159,-110,-840,-103,0


In [56]:
pd.DataFrame(df['y'].value_counts())

Unnamed: 0,y
0,10462508
1,5391916


In [57]:
df = df.drop('Unnamed: 0',axis=1)

In [58]:
df.head()

Unnamed: 0,Acc_X,Acc_Y,Acc_Z,Gyr_X,Gyr_Y,Gyr_Z,Mag_X,Mag_Y,Mag_Z,y
0,15,-174,-90,-53,-568,-306,48,-675,-254,0
1,1,-176,-81,-84,-613,-271,-2,-668,-221,0
2,-10,-180,-77,-104,-647,-227,-34,-697,-175,0
3,-21,-191,-63,-128,-675,-191,-74,-741,-133,0
4,-37,-225,-59,-146,-700,-159,-110,-840,-103,0


In [59]:
## Convertir le dataframe en fichier csv

df.to_csv("/Users/davidjeannette/Desktop/Alex_Project/notebooks/out2.csv") 

print("--- done ---")

--- done ---


In [None]:
# We create two lists to keep training and test accuracies. 
training_accuracy = []
test_accuracy = []

# We define a range of 1 to 19 (included) neighbors that will be tested
neighbors_settings = range(3,11)


for n_neighbors in neighbors_settings:
    
    print("n_neighbors",n_neighbors)
    
    # creating the KNN classifier
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    
    # fitting the model
    knn.fit(X_train, y_train)
    
    #recording the accuracy of the training set
    training_accuracy.append(knn.score(X_train, y_train))
    
    #recording the accuracy of the test set
    test_accuracy.append(knn.score(X_test, y_test))