In [None]:
"""
Overall things to think about/modify:
(in no particular order)
1) How does the PIR data work? Comments about this are throughout the functions, but need to understand how it works for various functions
2) How are the functions going to be called? Will the classification be run across a specific time period (say an hour), and then the data from that be passed onto all various functions?
The structure of the programme will impact the functions - so the below functions WILL change


"""

In [1]:
import csv
import datetime
import random
import glob
import os
import tensorflow as tf
import tensorflow_hub as hub
from IPython.display import Audio
import numpy as np
import scipy
from scipy.io import wavfile
import soundfile as sf
import resampy
import librosa

  # NOTE(review): the stray fragment '"class": algorithms.Blowfish,' was removed from here; it is not valid Python and `algorithms` is not imported in any visible cell.


In [None]:
"""
Data is in epoch timestamp; should be in dataframe
"""
def format_timestamp(time):
    """Convert an epoch timestamp in seconds into a naive local ``datetime``.

    ``time`` is passed through ``int`` first, so integer strings (as read
    from a CSV) are accepted and floats are truncated to whole seconds.
    """
    epoch_seconds = int(time)
    return datetime.datetime.fromtimestamp(epoch_seconds)

In [None]:
# Read every matching pir.csv into memory: pir_data holds one list per file,
# each list containing (datetime, float reading) tuples in file order.
files = glob.glob("../..//../Desktop/archive/KETI/*/pir.csv")
pir_data = [] #Format of this will depend on how much data we are collecting, what we need it for
for file in files:
    # The context manager closes each CSV as soon as it has been read,
    # instead of leaving one open handle per file behind.
    with open(file, newline="") as f:
        day = [(format_timestamp(line[0]), float(line[1])) for line in csv.reader(f)]
    pir_data.append(day)

In [None]:
"""
Timedelta to minutes (for checking against alerts)
"""
def get_minutes(td):
    """Return the full length of a ``datetime.timedelta`` in (fractional) minutes.

    ``total_seconds()`` is used instead of ``td.seconds`` because the latter
    only holds the sub-day remainder: a delta of one day and five minutes
    would otherwise be reported as 5 minutes, and negative deltas would wrap
    around to almost a full day.
    """
    return td.total_seconds() / 60

In [None]:
"""
If you only want to look at the data between certain times
"""
def get_modified_data(data,start,end=None):
    """Keep only the rows whose time of day lies in ``[start, end]``.

    Parameters:
        data:  iterable of rows whose first element is a ``datetime``.
        start: ``datetime.time`` lower bound (inclusive).
        end:   optional ``datetime.time`` upper bound (inclusive); when
               omitted every row at or after ``start`` is kept.

    Only the time of day is compared (the date is ignored), so a window with
    ``start > end`` (one that would wrap past midnight) selects nothing.
    """
    if end is None:
        return [el for el in data if el[0].time() >= start]
    return [el for el in data if start <= el[0].time() <= end]


In [None]:
# Tunable thresholds shared by the PIR checks in the cells below.
NIGHT_HOUR = 21 # hour (24h clock) from which leftAtNight treats a door event as night-time; should change/be personalised?
MORNING_HOUR = 8 # hour (24h clock) used for the wake-up time and as the end of the night window in leftAtNight
ALERT_TIME = 30 # minutes a stay in one room may last before checkRoomLength alerts; should be changed
LIM_TIMESTAMP_STREAK = 10  # Number of no-movement timestamps after which checkRoomLength assumes the room is empty

In [None]:
"""
Check if person is in one room for too long
"Too long" is room-dependent, but this is just assuming a basic rule of 30 minutes (for bathrooms and stuff)

The first timestamp of movement is stored, and every subsequent timestamp is compared to this one. If the timedelta exceeds the max time (30 minutes), an alert is raised.
If there has been a period of (10) consecutive timestamps with no motion, there is an assumption FOR NOW that the person has left the room. If this is the case, all stored variables are reset.
The next timestamp of movement after this would be stored and the process would begin again. 

--
This function would also depend on what the room data looks like; if it's fed hours of data, this function would check whether someone has spent longer than X in the room



To check/change:
10 periods of no movement is arbitrary
What happens with the PIR data if someone is asleep/staying still but still in the room? 
30 minutes in the bathroom being too long is arbitrary
If the PIR sensor does give us non-binary data, what can I do with that?
Do I check other rooms as well?
"""

def checkRoomLength(room_data):
    """Print an alert if one continuous stay in a room exceeds ALERT_TIME minutes.

    Parameters:
        room_data: sequence of ``(datetime, value)`` PIR samples in
            chronological order; ``value == 0`` means no movement and any
            other value is treated as movement.

    A stay starts at the first movement sample. It ends once more than
    LIM_TIMESTAMP_STREAK *consecutive* no-movement samples have been seen;
    the next movement sample then starts a new stay. While a stay is active,
    the time elapsed since its first sample is checked on every sample
    (including no-movement ones), and when it exceeds ALERT_TIME the elapsed
    minutes and "Too long!" are printed and the function stops.

    Returns:
        None in every case; the alert is only printed.
    """
    start_idx = -1  # index of the first movement sample of the current stay; -1 = room empty
    no_motion_streak = 0

    for i, (ts, val) in enumerate(room_data):
        if val == 0:
            if start_idx == -1:  # Room is empty
                continue
            no_motion_streak += 1
            if no_motion_streak > LIM_TIMESTAMP_STREAK:  # assume at this point that nobody is in the room
                start_idx = -1
                no_motion_streak = 0
                continue
        else:
            # Movement breaks the streak. Without this reset, isolated
            # no-movement samples accumulate and eventually end the stay.
            no_motion_streak = 0
            if start_idx == -1:
                start_idx = i  # initialise beginning of movement period

        minutes = get_minutes(ts - room_data[start_idx][0])
        if minutes > ALERT_TIME:
            print(minutes)
            print("Too long!")  # raise alert
            return
    return

In [None]:
"""
While the wakeup settings for the individual person should be personalisable, they need to be set at something at first.
We need to consider if their routine is the same each day, or if there are days that they sleep in (i.e. should this function take day of the week into account)
How long do we allow before we start worrying? - if they tend to be up every day around 7, if there isn't any movement at 715, we probably shouldn't send an alarm. But what is that time period?

The base function assumes that they are normally up every day by 8, and we can give them 30 minutes of extra sleep. 
If movement is detected during the night, it checks for 5 consecutive timestamps of movement; if that condition is met, person is up. 
If time is after allowed time, checks for 5 consecutive timestamps of no movement. If this condition is met, returns an alert
Check for three consecutive movement timestamps, and then returns that the person is up

Assumptions/questions:
There is no PIR movement when they are asleep
What happens if they are moving around a lot before wakeup time? Do we care? What if they do this, and then go back to sleep and sleep late?
Question for later: how do we detect naps? This wouldn't necessarily be "in a room for too long" if their PIR movement is 0 when they're asleep

"""
# Allowed lie-in after the usual wake-up hour, stored as a time-of-day value (00:30 = 30 minutes).
MORNING_BUFFER = datetime.time(minute=30)
wakeup = datetime.time(hour=MORNING_HOUR)
# Latest time of day by which movement is expected (wake-up time + buffer).
# The sum is done with timedelta arithmetic: adding the hour and minute fields
# separately raises ValueError as soon as the minutes add up to 60 or more
# (e.g. a 07:45 wake-up with a 30 minute buffer).
morning = (
    datetime.datetime.combine(datetime.date.min, wakeup)
    + datetime.timedelta(hours=MORNING_BUFFER.hour, minutes=MORNING_BUFFER.minute)
).time()
def inBedTooLong(bedroomData):
    """Decide from bedroom PIR samples whether the person got up in time.

    Parameters:
        bedroomData: sequence of ``(datetime, value)`` samples in
            chronological order; ``value == 0`` means no movement and
            ``value > 0`` means movement.

    Samples are compared by time of day against the module-level ``morning``
    cut-off:

    * up to and including ``morning``: 5 consecutive movement samples mean
      the person is already up;
    * after ``morning``: 5 consecutive no-movement samples are an alert,
      3 consecutive movement samples mean the person is up. Both streaks are
      truly consecutive: a sample of the opposite kind resets the other.

    Returns:
        0    person was up before the cut-off (no issue),
        1    person got up after the cut-off,
        -1   alert: no movement after the cut-off,
        None the data ended before any of the conditions was met.

    NOTE(review): only the time of day is inspected, so evening samples also
    count as "after morning" - presumably callers pass night/morning data
    only; confirm.
    """
    awake = 0   # consecutive samples with movement
    asleep = 0  # consecutive samples without movement (only counted after the cut-off)

    for i, (ts, val) in enumerate(bedroomData):
        ts = ts.time()
        if ts <= morning:
            # Still inside the period in which they are allowed to sleep.
            if val == 0:
                awake = 0  # detected no movement, so reset to 0
            elif val > 0:
                awake += 1
            if awake >= 5:
                return 0  # No issue
        else:
            if i > 0 and bedroomData[i-1][0].time() <= morning:
                # First sample past the cut-off: start counting from scratch.
                awake = 0
                asleep = 0
            if val == 0:
                asleep += 1
                awake = 0  # stillness breaks a movement streak
                if asleep >= 5:
                    return -1  # return error
            elif val > 0:
                awake += 1  # person is moving
                asleep = 0  # movement breaks a no-movement streak
                if awake >= 3:
                    return 1  # person is up
    return None
            
            
                

    
    

In [None]:
"""
Will need to check multiple rooms and door audio (door audio I don't have)
This also currently assumes 24 hour time. 
How would this function be called? Upon hearing a door sound or upon there not being activity for a certain amount of time? (or just routinely check) 
The way this function would be written would depend on its purpose.

Currently: this assumes that a door sound has been heard. 
This code checks through all the rooms, and if movement is seen within a certain time buffer, doesn't raise an alert. 
Assumptions/choices to be made:
1) I need to decide at what point no activity within any room is concerning. This can't be too short of a time, but also can't be too long!
2) This code is assuming door sounds can be detected
3) What happens if someone has a cat/dog? 
"""
# How long after a door sound we look for movement before raising an alert.
BUFFER = datetime.timedelta(minutes=30)
def leftAtNight(door_timestamp, room_data):
    """Print an alert if a night-time door sound is not followed by movement.

    Parameters:
        door_timestamp: ``datetime`` at which the door sound was heard.
        room_data: iterable of rooms, each an iterable of
            ``(datetime, value)`` PIR samples; ``value > 0`` means movement.

    Door events with ``MORNING_HOUR <= hour < NIGHT_HOUR`` are ignored.
    Otherwise every room is searched for movement between ``door_timestamp``
    and ``door_timestamp + BUFFER`` (both inclusive). The comparison uses
    full datetimes, so a window that runs past midnight is handled. If no
    room shows movement, "Oh no, Alert" is printed.

    Returns:
        None in every case.
    """
    if MORNING_HOUR <= door_timestamp.hour < NIGHT_HOUR:
        return  # if the timestamp isn't in the period we care about, no need to check

    window_end = door_timestamp + BUFFER
    # Need to decide at what point lack of activity would be concerning.
    # For now a single movement sample in any room is enough to cancel the alert.
    movement_seen = any(
        val > 0
        for room in room_data
        for (ts, val) in room
        if door_timestamp <= ts <= window_end
    )
    if not movement_seen:
        print("Oh no, Alert")
    return
        
    
    

In [2]:
  """Returns list of class names corresponding to score vector."""
def class_names_from_csv(class_map_csv_text):
    """Return the list of class display names, in file order.

    Parameters:
        class_map_csv_text: path of the class-map CSV, opened with
            ``tf.io.gfile.GFile``. The file must have a header row that
            includes a ``display_name`` column.

    Returns:
        list of the ``display_name`` value of every data row.
    """
    with tf.io.gfile.GFile(class_map_csv_text) as csvfile:
        # Rows must be consumed while the file is still open, so the
        # iteration lives inside the ``with`` block.
        return [row['display_name'] for row in csv.DictReader(csvfile)]



In [3]:
  """Resample waveform if required."""
def ensure_sample_rate(original_sample_rate, waveform, desired_sample_rate=16000):
    """Resample ``waveform`` to ``desired_sample_rate`` if required.

    Parameters:
        original_sample_rate: sample rate of ``waveform``.
        waveform: array of samples; its length along the first axis is used
            to compute the resampled length.
        desired_sample_rate: target rate (default 16000).

    Returns:
        ``(desired_sample_rate, waveform)``. The waveform is returned
        unchanged (same object) when the rates already match.
    """
    # Imported explicitly: ``import scipy`` alone does not guarantee that the
    # ``scipy.signal`` submodule is loaded on SciPy versions without lazy
    # submodule loading.
    from scipy import signal

    if original_sample_rate != desired_sample_rate:
        desired_length = int(round(float(len(waveform)) / original_sample_rate * desired_sample_rate))
        waveform = signal.resample(waveform, desired_length)
    return desired_sample_rate, waveform

In [4]:
def getModel():
    """Load the YAMNet model from TensorFlow Hub together with its class names.

    Returns:
        ``(model, class_names)`` where ``class_names`` is the list produced
        by ``class_names_from_csv`` from the model's class-map file.
    """
    # setdefault keeps a TFHUB_CACHE_DIR that is already set in the
    # environment; the machine-specific path below is only a fallback.
    os.environ.setdefault("TFHUB_CACHE_DIR", "\\Users\\chloe\\Documents\\tensorflow")
    model = hub.load('https://tfhub.dev/google/yamnet/1')
    class_map_path = model.class_map_path().numpy()
    class_names = class_names_from_csv(class_map_path)
    return model, class_names

In [5]:
"""
split on silence doesn't seem to be possible if I combine librosa and soundfile (look further into this!)
Currently, just splitting on sound length
Returns samplerate for later
"""

def splitFile(file_name, segment_dur):
    """Read an audio file and cut it into consecutive fixed-length segments.

    Parameters:
        file_name: path handed to ``soundfile.read`` (samples are read as int16).
        segment_dur: segment length in seconds; may be fractional.

    Returns:
        ``(sections, sr)``: the list of segments in order (the last one may
        be shorter than the others; an empty file yields an empty list) and
        the file's sample rate.

    Raises:
        ValueError: if ``segment_dur`` is too small to cover a single sample.
    """
    data, sr = sf.read(file_name, dtype=np.int16)
    # Slice bounds must be integers, so a fractional duration is rounded to a
    # whole number of samples.
    segment_length = int(round(sr * segment_dur))
    if segment_length <= 0:
        raise ValueError("segment_dur must cover at least one sample")
    sections = [data[start:start + segment_length] for start in range(0, len(data), segment_length)]
    return sections, sr

In [6]:
# Sample rate (Hz) that getClassifications resamples audio to before calling the model.
SAMPLE_RATE = 16000.0
"""
Classifies audio data according to the AudioSet Yamnet Data
Code is adapted from the inference.py code from the official documentation 
If the wave is not in the right shape or not sampled in the right sampling rate, the wave is fixed so it fits the data
Then the wave is classified.

The output of the model is a matrix of (# time frames, # classes) classifier scores; the documentation recommends taking the mean across the 0th axis to get an average across time
Thus, we can see the average classifier score across time, not just for a specific timeframe. 

Returned are the means and standard deviations.
"""
def getClassifications(model,wav_data,sr):
    """Score one audio segment with the model and summarise the scores over time.

    Parameters:
        model: callable returning ``(scores, embeddings, spectrogram)`` for a
            mono float32 waveform; ``scores`` has one row per time frame and
            one column per class.
        wav_data: int16 samples, either 1-D (mono) or 2-D with channels on
            axis 1.
        sr: sample rate of ``wav_data``.

    Returns:
        ``(prediction, std)``: per-class mean and per-class standard
        deviation of the scores across time frames.
    """
    waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
    waveform = waveform.astype('float32')
    if len(waveform.shape) > 1:
        waveform = np.mean(waveform, axis=1)  # down-mix channels to mono
    if sr != SAMPLE_RATE:
        waveform = resampy.resample(waveform, sr, SAMPLE_RATE)

    scores, embeddings, spectrogram = model(waveform)
    prediction = np.mean(scores, axis=0)  # Averaged across time
    std = np.std(scores, axis=0)  # spread across time (this was np.mean, duplicating `prediction`)
    return prediction,std

In [7]:
"""
After the model scores all the classes, this function gets the top five classifications and matches them to the class names.
The returned result is a flat list that alternates class name and probability for the top five: [name1, probability1, name2, probability2, ...].
"""
def getTopFive(predictions):
    """Return the five highest-scoring classes as a flat list.

    The result alternates name and score, best first:
    ``[name1, score1, name2, score2, ...]``. Names are looked up in the
    notebook-level ``class_names`` list by index. Fewer than five pairs are
    returned when ``predictions`` has fewer than five entries.
    """
    best_first = np.argsort(predictions)[::-1][:5]
    flat = []
    for idx in best_first:
        flat.extend((class_names[idx], predictions[idx]))
    return flat
    

In [8]:

# Load the model and its class-name list once; later cells read `model` and `class_names` as notebook globals.
model,class_names= getModel()

In [None]:
# Cut one example recording into 2-second segments; the commented-out lines are alternative input files.
#split,sr = splitFile("../..//../Desktop/Tallie Sounds/door1.wav",2)
#split,sr = splitFile( "../..//../Desktop/Trainingdata/Snoring Dataset/Snoring/1_46.wav",2)
split,sr = splitFile("../..//../Desktop/audio/0-146774-A-4.wav",2)

In [None]:
# Build ids: recording id -> label, taken from the second column of
# audio_mapping.csv, whose values look like "<label>/<id>_<rest>".
ids = {}
# The context manager closes the mapping file once it has been read.
with open("../../../Desktop/audio_mapping.csv", newline="") as f:
    f.readline()  # discard the first line (presumably a header row)
    reader = csv.reader(f)
    for line in reader:
        path = line[1]
        label,path = path.split("/")
        if label in ["book","eat"]: continue  # these two labels are excluded
        id_ = path.split("_")[0]
        ids[id_] = label


In [None]:
# Show the distinct labels that survived the filtering above.
set(ids.values())

In [None]:
# Classify every 2-second segment of each mapped recording and count how often
# the top prediction is one of the labels in `names`.
# NOTE(review): `names` is built in a later cell (from the ontology), so that
# cell must have been run before this one.
files = glob.glob("../..//../Desktop/audio/*.wav")

false = 0    # segments whose top label is in `names`
total = 0    # all segments classified
others = []  # top labels that are not in `names`
dd = {}      # top label -> files that produced it (one entry per segment)
for file in files:
    # basename copes with whichever path separator glob returns on this platform
    id_ = os.path.basename(file).split("-")[1]
    if id_ not in ids: continue
    split, sr = splitFile(file,2)
    for el in split:
        p,s = getClassifications(model,el,sr)
        prediction = getTopFive(p)
        if prediction[0] in names: false +=1
        else:
            others.append(prediction[0])
        dd.setdefault(prediction[0], []).append(file)
        total +=1
# Guard the ratios: with no matching recordings `total` is 0 and the division would raise.
(false, total, false/total, (total-false)/total) if total else (false, total, None, None)

In [None]:
from collections import Counter
# Tally the top labels collected in `others` and list them most frequent first.
x = Counter(others)
x.most_common()

In [None]:
# Files with at least one segment whose top prediction was 'Water' (one entry per such segment).
dd['Water']


In [None]:
# Print the top-five classification of every segment currently held in `split`.
count = 0
total = 0
for section in split:
    p, s = getClassifications(model, section, sr)
    print(getTopFive(p))

In [None]:
"""
Need index of Cough ID in class names (to get probability) for cough profile. I assume this won't change from run to run, but just in case, don't want to hardcode it
Also, if we decide to add more categories as labels to cough (throat clearing, for example), this will come in handy
"""
def getCoughIndex(label="Cough"):
    """Return the position of ``label`` in the notebook-level ``class_names`` list.

    Parameters:
        label: class display name to look up; defaults to "Cough" so existing
            calls keep working, and other labels (e.g. "Throat clearing") can
            be resolved with the same helper.

    Returns:
        Index of the first entry equal to ``label``.

    Raises:
        ValueError: if the label is missing. Returning ``None`` here would be
            dangerous: indexing a score vector with ``None`` silently adds an
            axis instead of failing.
    """
    try:
        return class_names.index(label)
    except ValueError:
        raise ValueError("{!r} is not in class_names".format(label)) from None
        

In [None]:
"""
Takes a list of audio segments and returns the cough probability for each one

TODO: Future versions would have timestamps, but I don't have enough coughs with timestamps to create that kind of data
INCORPORATE standard deviations somehow
"""
# Looked up once when this cell runs; `class_names` must already be loaded.
COUGH_INDEX = getCoughIndex()
def getCoughProfile(audiodata, sample_rate=None):
    """Return the cough score of every audio segment.

    Parameters:
        audiodata: iterable of audio segments, each accepted by
            ``getClassifications``.
        sample_rate: sample rate of the segments. When omitted, the
            notebook-level ``sr`` is used, which keeps existing calls working
            but depends on whichever file was split last - pass it explicitly
            where possible.

    Returns:
        list with one value per segment: the time-averaged score at
        ``COUGH_INDEX``.
    """
    if sample_rate is None:
        sample_rate = sr  # backward-compatible fallback to the notebook global
    results = []
    for frame in audiodata:
        p, s = getClassifications(model, frame, sample_rate)
        results.append(p[COUGH_INDEX])
    return results

In [None]:
# Take up to 25 cough recordings, cut each into 2-second segments and score every segment for cough.
files  = glob.glob("../..//../Desktop/Trainingdata/audio_data/cough/*.wav")[0:25]
data = []
for file in files:
    split,sr = splitFile(file,2)
    data.extend(split)
# NOTE(review): `sr` is the rate of the last file read and getCoughProfile picks it up as a global - assumes all files share one sample rate; TODO confirm.
getCoughProfile(data)

In [2]:
import json
# Load the ontology; the context manager closes the file, and JSON is read as
# UTF-8 explicitly instead of relying on the platform's default encoding.
with open("Datasets/ontology.json", encoding="utf-8") as ontology_file:
    jsondata = json.load(ontology_file)

In [3]:
# Index the ontology by node id: d[node_id] = (child_ids, name).
# A comprehension is used so that no loop variables leak into the notebook
# namespace; the earlier loop rebound the builtin `id` for every later cell.
d = {el['id']: (el['child_ids'], el['name']) for el in jsondata}
    

In [11]:
# Collect the names of every node below "/m/0dgw9r" (the parent-lookup cell prints this id as "Human sounds").
# NOTE(review): get_subordinates is defined in a later cell, so this cell fails on a fresh top-to-bottom run.
children = d["/m/0dgw9r"][0]
names = []
for el in children:
    names.extend(get_subordinates(d,el))
    
names

['Human voice',
 'Speech',
 'Male speech, man speaking',
 'Female speech, woman speaking',
 'Child speech, kid speaking',
 'Conversation',
 'Narration, monologue',
 'Babbling',
 'Speech synthesizer',
 'Shout',
 'Bellow',
 'Whoop',
 'Yell',
 'Battle cry',
 'Children shouting',
 'Screaming',
 'Whispering',
 'Laughter',
 'Baby laughter',
 'Giggle',
 'Snicker',
 'Belly laugh',
 'Chuckle, chortle',
 'Crying, sobbing',
 'Baby cry, infant cry',
 'Whimper',
 'Wail, moan',
 'Sigh',
 'Singing',
 'Choir',
 'Yodeling',
 'Chant',
 'Mantra',
 'Male singing',
 'Female singing',
 'Child singing',
 'Synthetic singing',
 'Rapping',
 'Humming',
 'Groan',
 'Grunt',
 'Yawn',
 'Whistling',
 'Wolf-whistling',
 'Respiratory sounds',
 'Breathing',
 'Wheeze',
 'Snoring',
 'Gasp',
 'Pant',
 'Snort',
 'Cough',
 'Throat clearing',
 'Sneeze',
 'Sniff',
 'Human locomotion',
 'Run',
 'Shuffle',
 'Walk, footsteps',
 'Digestive',
 'Chewing, mastication',
 'Biting',
 'Gargling',
 'Stomach rumble',
 'Burping, eructation'

In [8]:
def get_subordinates(dictionary, label):
    """Return the name of ``label`` followed by the names of all its descendants.

    ``dictionary`` maps a node id to ``(child_ids, name)``. The result is in
    depth-first pre-order: the node itself, then each child's subtree in the
    order the children are listed.
    """
    child_ids, own_name = dictionary[label]
    collected = [own_name]
    for child_id in child_ids:
        collected += get_subordinates(dictionary, child_id)
    return collected
                
        
    

In [10]:
# Print the id and name of every ontology node that lists "/m/09hlz4" among its direct children.
for key, vals in d.items():
    child_ids, label = vals
    if "/m/09hlz4" in child_ids:
        print(key, label)

/m/0dgw9r Human sounds
