<a href="https://colab.research.google.com/github/charlotteVDD/neuroForest/blob/main/Preprocessing_parameters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade gspread


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gspread
  Downloading gspread-5.4.0-py3-none-any.whl (37 kB)
Installing collected packages: gspread
  Attempting uninstall: gspread
    Found existing installation: gspread 3.4.2
    Uninstalling gspread-3.4.2:
      Successfully uninstalled gspread-3.4.2
Successfully installed gspread-5.4.0


In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import csv
import functools 
import scipy
 
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow,Flow
from google.auth.transport.requests import Request

import os
from scipy.spatial import ConvexHull
from scipy.stats import entropy
from scipy.spatial.distance import pdist, squareform

from google.colab import auth
auth.authenticate_user()
 
import gspread
from oauth2client.client import GoogleCredentials
from json import JSONEncoder

from sklearn.neighbors import NearestNeighbors
import math

Mounted at /content/drive


In [None]:
class PreprocessingData(object):
  """Compute per-recording trajectory parameters from one decoded JSON file.

  The constructor reads the recording dictionary ``data`` and fills
  ``self.parameters`` (a flat dict, one entry per output column) with the
  nickname, ADHD score/label, total gain, and the trajectory statistics
  computed by the methods below.
  """

  # Number of frames kept when a long recording is cut down.
  MAX_FRAMES = 15001
  # Value of data['Temps'][0] that marks a 10-minute experiment to be cut.
  LONG_EXPERIMENT_TIME = 616

  def __init__(self,data,scores_dict):
    """Extract every parameter of one recording.

    Parameters
    ----------
    data : dict
        Decoded recording. Reads 'Positions' and 'Nom'; optionally 'Q1'...,
        'Score', 'Champignons dans champ de vision', 'Temps', and
        'Champignons ramassés' / 'ChampignonsRamassés'.
    scores_dict : dict
        Maps a nickname to a ``[ADHD score, ADHD label]`` pair.
    """
    self.parameters = {}

    # extract positions
    positions_frame = pd.DataFrame(data['Positions'])
    print(positions_frame.columns)
    print(positions_frame.head())
    self.positions = positions_frame.to_numpy() #column 0 - x, column 1 - y, column 2 - z
    print(self.positions[0:5])

    # extract nickname (there are some files with nickname of type list)
    if isinstance(data['Nom'], list):
      self.parameters['Nickname'] = data['Nom'][0]
    else:
      self.parameters['Nickname'] = data['Nom']
    print(self.parameters['Nickname'])

    # extract ADHD score
    print(scores_dict.get(self.parameters['Nickname']))
    try:
      self.parameters['ADHD score'], self.parameters['ADHD label'] = scores_dict.get(self.parameters['Nickname'])
    except (TypeError, ValueError):
      # Unknown nickname (``get`` returned None) or an entry that is not a pair.
      self.parameters['ADHD score'], self.parameters['ADHD label'] = None, None
    # Questionnaire answers stored in the file take precedence over the table.
    if "Q1" in data.keys():
      self.parameters['ADHD score'] = self.TDAH_score_in_built(data)
      self.parameters['ADHD label'] = self.TDAH_score_label(data)
    print("ADHD score label: " + str(self.parameters['ADHD label']))

    # extract score
    if "Score" in data.keys():
      self.scores = np.array(data['Score'])
      self.parameters['Total gain'] = self.scores[-1]
      if "Champignons dans champ de vision" in data.keys():
        # NOTE: greedy_visible also truncates self.scores to MAX_FRAMES.
        self.parameters['Variance with greedy strategy'] = self.greedy_visible(data)
        print(self.parameters['Variance with greedy strategy'])
      else:
        self.parameters['Variance with greedy strategy'] = None
    else:
      self.scores = []
      self.parameters['Variance with greedy strategy'] = None
      # Two spellings of the collected-mushrooms key exist in the files.
      try:
        self.parameters['Total gain'] = len(data['Champignons ramassés'])
      except (KeyError, TypeError):
        self.parameters['Total gain'] = len(data['ChampignonsRamassés'])

    # NOTE(review): the parameters from here down to the diffusion block are
    # computed on the full recording, i.e. BEFORE the long-experiment cut
    # below. Confirm whether they should use the truncated trajectory instead.

    # total distance
    self.parameters['Total distance'] = self.total_distance(self.positions)

    # Convex envelope of the (x, z) projection; for 2-D input ``hull.volume``
    # is the enclosed area.
    points = self.positions [:,[0,2]]
    hull = ConvexHull(points)
    self.parameters['Convex envelope volume'] = hull.volume

    # Shape of data
    self.shape = np.shape(self.positions)[0]

    # Diffusion coefficient variance
    self.T = 100  # large enough that the mean does not depend on T
    self.tau_max = 100
    self.parameters['Mean diffusion coefficient'], self.parameters['Diffusion coefficient variance'] = self.mean_and_variance_diff(self.positions )
    self.parameters['Mean alpha (diffusion model)'], self.parameters['Alpha std(diffusion model)']  =  self.var_alpha(self.positions) #returns mean value and fluctuation

    # extract time
    has_time = "Temps" in data.keys()
    if has_time:
      self.time = int(data['Temps'][0])
      self.parameters['Time'] = self.time

    # Time-dependent parameters stay None unless a positive duration is known.
    for key in ('Persistence time begining', 'Persistence time end',
                'Curvature', 'Trajectory volatility'):
      self.parameters[key] = None

    if has_time:
      # Frame count of the whole recording: the duration in ``self.time``
      # covers all of these frames, so dt must be derived from this count and
      # not from the truncated one.
      recorded_frames = self.shape

      #if there was an experiment of 10 minutes, we cut it
      if self.time == self.LONG_EXPERIMENT_TIME:
        self.positions = self.positions[:self.MAX_FRAMES]
        self.scores = self.scores[:self.MAX_FRAMES]
        self.shape = np.shape(self.positions)[0]

      if self.time > 0:
        self.dt = self.time / recorded_frames
        # Velocity autocorrelation
        self.tau = np.arange(100)
        half = len(self.positions)//2
        self.parameters['Persistence time begining'] = self.time_of_persistence(self.positions[:half], self.tau)
        self.parameters['Persistence time end'] = self.time_of_persistence(self.positions[half:], self.tau)

        # Curvature
        k = 20 # number of frames to use for calculating derivatives
        self.parameters['Curvature'] = self.mean_curvature(self.positions,self.dt,k)

        if np.shape(self.positions)[0] == len(self.scores):
          self.parameters['Trajectory volatility'] = self.trajectory_volatility(data)

    self.parameters['Diameter'] = self.diameter(self.positions)

  def total_distance(self,positions):
    """Return the length of the 3-D path through ``positions``, rounded to 3 decimals."""
    steps = np.diff(np.asarray(positions, dtype=float)[:, :3], axis=0)
    return round(float(np.sum(np.sqrt(np.sum(steps**2, axis=1)))), 3)

  def TDAH_score_in_built(self, reponses):
    """Return the ADHD score computed from the questionnaire answers.

    For every question present in ``reponses`` the first element of its answer
    contributes ``max(0, answer - threshold) * 5``; the threshold is 0.4 for
    the "long scale" questions and 0.6 for the others. Answers that cannot be
    read as a number are skipped.
    """
    thresholds = {q: 0.4 for q in ["Q1","Q2","Q3","Q9","Q12","Q16","Q18"]} #questions that are more important
    # "Q8" belongs here: the list used to repeat "Q18", which counted Q18 on
    # both scales and never counted Q8.
    thresholds.update({q: 0.6 for q in ["Q4","Q5","Q6","Q7","Q8","Q10","Q11","Q13","Q14","Q15","Q17"]})
    TDAH_score = 0

    for q in reponses.keys():
      threshold = thresholds.get(q)
      if threshold is None:
        continue
      try:
        TDAH_score += np.max([0,reponses[q][0]-threshold])*5
      except (TypeError, IndexError, KeyError, ValueError):
        # Missing or non-numeric answer: contributes nothing.
        pass
    return TDAH_score

  def TDAH_score_label(self, reponses):
    """Return 1 when more than 3 of Q1-Q6 exceed their threshold, else 0.

    The threshold is 0.4 for Q1-Q3 and 0.6 for Q4-Q6; unreadable answers are
    not counted.
    """
    thresholds = {"Q1": 0.4, "Q2": 0.4, "Q3": 0.4, #questions that are more important
                  "Q4": 0.6, "Q5": 0.6, "Q6": 0.6}
    TDAH_symptoms = 0

    for q in reponses.keys():
      threshold = thresholds.get(q)
      if threshold is None:
        continue
      try:
        if (reponses[q][0] > threshold):
          TDAH_symptoms += 1
      except (TypeError, IndexError, KeyError, ValueError):
        pass
    if (TDAH_symptoms > 3):
      return 1
    return 0

  def mean_and_variance_diff(self, points):
    """Return (mean, std/mean) of the diffusion coefficient over all windows.

    One coefficient is fitted per window start ``t0`` in
    ``range(self.shape - self.T - self.tau_max)``. Returns ``(nan, nan)`` when
    the recording is too short to hold a single window.
    """
    t0  = np.arange(0,self.shape-self.T-self.tau_max)
    if len(t0) == 0:
      return np.nan, np.nan
    diff = [self.diffusion(points, t) for t in t0]
    mean_diff = np.mean(diff)
    return mean_diff,np.sqrt(np.var(diff))/mean_diff

  def diffusion(self, points, t0):
    """Return slope/4 of a linear fit of the mean squared displacement vs lag."""
    tau = np.arange(self.tau_max)
    y = [self.dqm(points, i, t0) for i in tau]
    coef = np.polyfit(tau,y,1)
    return coef[0]/4

  def dqm(self, data, tau, t0):
    """Mean squared (x, z) displacement at lag ``tau`` over the window starting at ``t0``.

    The window holds ``self.T`` frames; columns 0 and 2 of ``data`` are used.
    """
    xdata = data[:, 0]
    ydata = data[:, 2]
    x1 = xdata[t0 : t0 + self.T]
    y1 = ydata[t0 : t0 + self.T]
    x2 = xdata[t0 + tau : t0 + self.T + tau]
    y2 = ydata[t0 + tau : t0 + self.T + tau]
    xdiff = (x2 - x1)**2
    ydiff = (y2 - y1)**2
    return 1/self.T*(np.sum(xdiff) + np.sum(ydiff))

  def alpha_fit(self, points, t0):
    """Fit the mean squared displacement as ``A * tau**alpha``; return ``(A, alpha)``.

    The fit is linear in log-log space, weighted by ``sqrt(msd)``. Returns
    ``(nan, nan)`` when any displacement is non-positive or non-finite, since
    its logarithm cannot be fitted.
    """
    tau = np.arange(1,self.tau_max)
    y = np.array([self.dqm(points, i, t0) for i in tau])
    if not (np.all(np.isfinite(y)) and np.all(y > 0)):
      return np.nan, np.nan
    coef = np.polyfit(np.log(tau), np.log(y), 1, w=np.sqrt(y)) #power law fit
    return np.exp(coef[1]), coef[0] #returns A and alpha

  def var_alpha(self, points):
    """Return (mean, std/mean) of the fitted exponent alpha over all windows.

    Only the alpha column of the ``(A, alpha)`` fits is used; averaging the
    whole array would mix the prefactor A into the result. NaN fits are
    ignored. Returns ``(nan, nan)`` when there is no window to fit.
    """
    t0  = np.arange(0,self.shape-self.T-self.tau_max)
    fits = np.array([self.alpha_fit(points, t) for t in t0])
    if fits.size == 0:
      return np.nan, np.nan
    alpha = fits[:, 1]
    mean_alpha = np.nanmean(alpha)
    return mean_alpha, np.sqrt(np.nanvar(alpha))/mean_alpha #returns mean value and fluctuation

  def time_of_persistence(self, points, tau):
    """Return the decay time of the velocity autocorrelation, in lag units.

    Fits ``log(autocorrelation)`` linearly against ``tau`` (weights
    ``sqrt(autocorrelation)``) and returns ``-1/slope``; returns None when the
    fit fails.
    """
    y = [self.moyenne_autocorrelation(points, i) for i in tau]
    try:
      coef = np.polyfit(tau, np.log(y), 1, w=np.sqrt(y)) #exponential fit
      time_persistence = 1/(-coef[0])
    except Exception:
      # e.g. the least-squares solve rejecting NaNs from negative correlations
      time_persistence = None

    return time_persistence

  def moyenne_autocorrelation(self, points, tau):
    """Mean over t of ``v(t) . v(t + tau)`` for t in ``range(len(points) - tau - 1)``.

    Vectorised form of averaging ``self.autocorrelation``; the per-frame
    arithmetic is kept in the same order. Returns nan when no pair exists.
    """
    n_pairs = len(points) - tau - 1
    if n_pairs <= 0:
      return np.nan
    velocities = np.diff(points[:, :3], axis=0)/self.dt
    first = velocities[:n_pairs]
    second = velocities[tau:tau + n_pairs]
    return np.mean(first[:, 0]*second[:, 0] + first[:, 1]*second[:, 1] + first[:, 2]*second[:, 2])

  def autocorrelation(self, points, t, tau):
    """Return the dot product of the velocities at frames ``t`` and ``t + tau``."""
    v_now = self.velocity(points, t)
    v_later = self.velocity(points, t+tau)
    return v_now[0]*v_later[0] + v_now[1]*v_later[1] + v_now[2]*v_later[2]

  def velocity(self, data, t):
    """Return the forward-difference velocity ``(vx, vy, vz)`` at frame ``t``, using ``self.dt``."""
    xdata = data[:, 0]
    ydata = data[:, 1]
    zdata = data[:, 2]
    vx = (xdata[t + 1] - xdata[t])/self.dt
    vy = (ydata[t + 1] - ydata[t])/self.dt
    vz = (zdata[t + 1] - zdata[t])/self.dt
    return vx, vy, vz

  def mean_curvature(self,points,dt,k):
    """Return the mean curvature of the (x, z) trajectory.

    Derivatives are finite differences over ``k`` frames. Frames whose squared
    first derivative is not above 1e-6, and frames yielding NaN, are left out
    of the mean.
    """
    x = points[:,0]
    y = points[:,2]
    # NOTE(review): ``/ k*dt`` evaluates as ``(... / k) * dt``, not
    # ``... / (k*dt)``. The curvature ratio is unchanged by that rescaling,
    # but the 1e-6 threshold below is applied to the rescaled values; left
    # as-is because changing it would alter which frames are kept.
    dx = (x[:(len(x)-k)] - x[k:])/ k*dt # differentiate each coordinate once with respect to time
    dy = (y[:(len(y)-k)] - y[k:])/ k*dt
    dx_2 = (dx[:(len(dx) - k)] - dx[k:])/ k*dt # differentiate a second time
    dy_2 = (dy[:(len(dy) - k)] - dy[k:])/ k*dt

    n = len(dx_2)
    vx = dx[:n]
    vy = dy[:n]
    speed_sq = vx**2 + vy**2
    moving = speed_sq > 1e-6

    curvature = np.abs(vx[moving]*dy_2[moving] - vy[moving]*dx_2[moving])/speed_sq[moving]**1.5
    curvature = curvature[~np.isnan(curvature)]

    return np.mean(curvature)

  def distance_droite(self, indice, points, p1, p2):
    """Distance from ``points[indice]`` to the line through ``p1`` and ``p2``.

    Returns 0 when ``p1`` and ``p2`` coincide (no line is defined).
    """
    if np.array_equal(p1, p2):
      return 0
    p3 = points[indice]
    return np.linalg.norm(np.cross(p2-p1, p1-p3))/np.linalg.norm(p2-p1)

  def _pickup_frames(self, scores, n_frames):
    """Return the frame indices (< n_frames) at which the score differs from the previous frame."""
    changes = np.diff(np.asarray(scores)[:n_frames])
    return np.flatnonzero(changes != 0) + 1

  def last_mushroom(self, data):
    """For each frame, the position at the most recent score change.

    Frames before the first score change get the starting position.
    Returns an ``(n, 3)`` array.
    """
    positions = pd.DataFrame(data['Positions']).to_numpy()
    n = np.shape(positions)[0]
    last = np.zeros((n,3))
    last[:] = positions[0]
    for frame in self._pickup_frames(data['Score'], n):
      last[frame:] = positions[frame]
    return last

  def next_mushroom(self, data):
    """For each frame, the position at the next score change.

    Frames from the last score change onwards get the final position.
    Returns an ``(n, 3)`` array.

    Score changes are detected exactly as in ``last_mushroom`` so both arrays
    describe the same events. The previous version looked one frame too early
    (``last[j]`` at the frame before the pickup), which gave most frames
    ``next == last`` and therefore a zero distance in trajectory_volatility.
    """
    positions = pd.DataFrame(data['Positions']).to_numpy()
    n = np.shape(positions)[0]
    upcoming = np.zeros((n,3))
    start = 0
    for frame in self._pickup_frames(data['Score'], n):
      upcoming[start:frame] = positions[frame]
      start = frame
    upcoming[start:] = positions[-1] #for the last indices where there is no actual next mushroom
    return upcoming

  def trajectory_volatility(self, data):
    """RMS distance of each position to the line joining the last and next score change.

    Reads 'Positions' and 'Score' straight from ``data`` (the untruncated
    recording), not from ``self.positions``.
    """
    res = 0
    positions = pd.DataFrame(data['Positions']).to_numpy()
    last = self.last_mushroom(data)
    upcoming = self.next_mushroom(data)
    n = np.shape(positions)[0]
    for i in range(n):
      res += self.distance_droite(i, positions, last[i], upcoming[i])**2
    return np.sqrt(res/n)

  def diameter(self, positions):
    """Return the largest pairwise distance between positions, ignoring NaNs.

    Reduces the condensed distance vector directly instead of expanding it
    into an N x N matrix. ``initial=0.0`` stands in for the zero diagonal of
    that matrix, so a single point (or all-NaN input) still yields 0.0.
    """
    distances = pdist(positions)
    return np.fmax.reduce(distances, initial=0.0)

  def greedy_visible(self, data):
    """Mean dot product between the actual move and the move to the nearest visible mushroom.

    For each pair of consecutive frames where the score rises by exactly 1,
    takes ``last`` = position at the first frame and ``target`` = position at
    the second, finds the mushroom in the field of view at the first frame
    that is nearest to ``last``, and records
    ``dot(target - last, nearest - last)``. Frames with nothing in view are
    skipped. Returns nan when nothing was recorded.

    Side effect: ``self.scores`` is truncated to ``MAX_FRAMES`` entries.
    """
    positions = pd.DataFrame(data['Positions']).to_numpy()

    #if there was an experiment of 10 minutes, we cut it
    positions = positions[:self.MAX_FRAMES]
    self.scores = self.scores[:self.MAX_FRAMES]

    MushroomsFieldView = data["Champignons dans champ de vision"]
    mushroom_indices = np.argwhere(np.diff(self.scores) == 1)
    scal = []
    for i in range(len(mushroom_indices)-1):
      frame = mushroom_indices[i][0]
      last = positions[frame]
      target = positions[mushroom_indices[i+1][0]]
      # Only the frames actually used are converted to arrays.
      in_view = MushroomsFieldView[frame]
      visible = pd.DataFrame(in_view, index = np.arange(len(in_view))).to_numpy()
      if len(visible) == 0:
        continue
      neigh = NearestNeighbors(n_neighbors=1)
      neigh.fit(visible)
      nearest_idx = neigh.kneighbors(np.array([last]), return_distance=False)[0][0]
      nearest = visible[nearest_idx]
      scal.append(np.dot(target-last, nearest-last))
    print(scal)
    if not scal:
      return np.nan
    return np.mean(np.array(scal))
    

In [None]:
RawDataDirectory = '/content/drive/My Drive/raw_Data2'
# NOTE(review): this is the same folder as RawDataDirectory, so the os.rename
# below leaves every file where it is and a re-run processes it (and appends
# its row) again. Point this at a separate folder to avoid duplicate rows.
PreprocessedDataDirectory = '/content/drive/My Drive/raw_Data2'
SaveTableDirectory = '/content/drive/My Drive/'
ExperimentsTablePath = os.path.join(SaveTableDirectory, 'All_experiments_test_MIPT.csv')

# nickname -> [ADHD score, ADHD label]
TdahScoreDictionary = {'SDVZPP': [5,0], 'CEMMOE': [8,0], 'Emilia': [11.567459, 1]}

# Resume from the existing table when there is one. Only "no table yet" starts
# from scratch: any other read error must surface, because the table is
# rewritten after every file and would otherwise be overwritten.
try:
  DataToAppend = pd.read_csv(ExperimentsTablePath)
except (FileNotFoundError, pd.errors.EmptyDataError):
  DataToAppend = pd.DataFrame({})

# sorted() makes the processing (and row) order deterministic.
for filename in sorted(os.listdir(RawDataDirectory)):
  if not filename.endswith(".json"):
    continue

  #load data
  print(filename + " is loading.")
  try:
    with open(os.path.join(RawDataDirectory, filename), 'r') as myfile:
      data = json.load(myfile)
  except ValueError as error:
    # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
    print(filename + " could not be parsed and was skipped: " + str(error))
    continue
  print(filename + " is loaded.")

  #preprocess data
  RowToAppend = PreprocessingData(data,TdahScoreDictionary)
  # leftover debug output
  print(RowToAppend)
  print('lol')

  #append new row (DataFrame.append no longer exists in pandas >= 2.0)
  DataToAppend = pd.concat([DataToAppend, pd.DataFrame([RowToAppend.parameters])], ignore_index = True)
  os.rename(os.path.join(RawDataDirectory, filename), os.path.join(PreprocessedDataDirectory, filename))
  #save the table after every file so an interrupted run keeps its progress
  DataToAppend.to_csv(path_or_buf = ExperimentsTablePath,index=False)

# info() prints its report itself and returns None, so it is not wrapped in print().
DataToAppend.info()




MwEToo.json is loading.
MwEToo.json is loaded.
Index(['x', 'y', 'z'], dtype='object')
            x         y           z
0 -121.653000  7.640000  195.311996
1 -121.658539  7.738482  195.315689
2 -121.658539  7.738059  195.315689
3 -121.658539  7.738059  195.315689
4 -121.658539  7.738059  195.315689
[[-121.65299988    7.63999987  195.31199646]
 [-121.65853882    7.73848248  195.31568909]
 [-121.65853882    7.73805857  195.31568909]
 [-121.65853882    7.73805857  195.31568909]
 [-121.65853882    7.73805857  195.31568909]]
MwEToo
None
ADHD score label: 0




<__main__.PreprocessingData object at 0x7f3f3ba75b50>
lol
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66 entries, 0 to 65
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   ADHD label                      66 non-null     float64
 1   ADHD score                      66 non-null     float64
 2   Alpha std(diffusion model)      65 non-null     float64
 3   Convex envelope volume          66 non-null     float64
 4   Curvature                       66 non-null     float64
 5   Diameter                        66 non-null     float64
 6   Diffusion coefficient variance  65 non-null     float64
 7   Mean alpha (diffusion model)    65 non-null     float64
 8   Mean diffusion coefficient      65 non-null     float64
 9   Nickname                        66 non-null     object 
 10  Persistence time begining       65 non-null     float64
 11  Persistence time end            65 non-nu

In [None]:
# Drive folders for the raw recordings and for the files already processed.
RawDataDirectory, PreprocessedDataDirectory = (
    '/content/drive/My Drive/raw_data',
    '/content/drive/My Drive/preprocessed_data',
)

Un cercle:

In [None]:
# Preview the first five rows of the accumulated parameter table.
DataToAppend.head(n=5)

Unnamed: 0,ADHD label,ADHD score,Alpha std(diffusion model),Convex envelope volume,Curvature,Diameter,Diffusion coefficient variance,Mean alpha (diffusion model),Mean diffusion coefficient,Nickname,Persistence time begining,Persistence time end,Time,Total distance,Total gain,Trajectory volatility,Variance with greedy strategy
0,0.0,8.0,1.106972,15203.365081,0.077494,215.266731,0.979705,0.842415,0.038763,CEMMOE,95.342421,60.796695,300.0,521.21,100.0,0.035725,
1,0.0,8.0,1.131687,54604.462279,0.06606,331.035638,0.698525,0.923116,0.082,CEMMOE,189.763018,260.419435,300.0,762.815,59.0,0.265156,
2,0.0,8.0,1.193794,8285.114711,0.111584,179.979325,1.15573,0.82222,0.033888,ABCDEFG,83.430124,54.873569,300.0,461.116,73.0,0.189848,
3,1.0,12.559854,0.997498,26381.792238,0.263786,270.327591,0.477449,0.836362,0.090927,VYMNSI,68.468656,58.332587,300.0,1148.911,121.0,0.150781,
4,0.0,4.961971,1.115585,9330.913248,0.108585,168.840458,0.895122,0.859979,0.044678,QOEPNL,83.630162,85.039337,300.0,575.816,83.0,0.69648,


In [None]:
# List every nickname in the table, one per line (duplicates included).
for nickname in DataToAppend['Nickname'].tolist():
  print(nickname)



CEMMOE
CEMMOE
ABCDEFG
VYMNSI
QOEPNL
LDPTTB
OBBOIU
dcMTCA
R97Fys
jJHldO
XEai0l
ag6IG7
JiRz2t
89iYbS
pdYVv5
rztNhW
NtyRgW
YCQhDz
INl6hv
4Ely6y
fzlIKW
MOCb1L
nMxqz7
MQL1ks
qhcQh7
gUcsRM
zbdvjE
zbdvjE
2CXFjz
tjH67G
JITgja
CQodgV
W85ssB
ehkvif
KoEq9h
fjNfaJ
pNOOnW
Rk92cW
T0etzr
2fLG2s
fdqEH9
cIsQo3
I2lpxS
e1Pnin
ypDRoN
HzFjeo
NnRuIT
ngGAeh
NEsaXU
jJ3uIx
8AzimM
1KMzRM
7Ux6rn
S9RiE8
EB3Bba
Q0YMK4
orseDd
YxPvyy
IM8K39
85byh2
u8BQa0
INl6hv
INl6hv
YkWbcJ
