In [55]:
import glob
import pandas as pd
import datetime
import numpy.fft as fft
import matplotlib.pyplot as plt
import os
# Set to True to print each file name and plot its spectrum while the FFT loop runs.
plot = False

# Preprocessing data.
Each file holds one second of data sampled every millisecond (1000 measurements per file). The files are not evenly distributed in time: there are large gaps between them. We therefore need to preprocess the data before trying any anomaly detection algorithm.

In [None]:
def timestamper(timestamp):
    """Convert a Unix timestamp given in milliseconds to a ``datetime``.

    ``timestamp`` may be anything ``float()`` accepts (number or numeric
    string). The result is a naive ``datetime`` expressed in the local
    timezone, as produced by ``datetime.datetime.fromtimestamp``.
    """
    epoch_seconds = float(timestamp) / 1000
    return datetime.datetime.fromtimestamp(epoch_seconds)

## Reading raw data

In [52]:
# glob returns paths in arbitrary, OS-dependent order; sort them so every run
# processes the raw files in the same sequence.
files = sorted(glob.glob("data/raw/*.csv"))

## FFT
Instead of using the data in the time domain, we can transform it to the frequency domain and treat each file as one example. We can then use clustering to try to identify at least 3 clusters (off, working, malfunctioning).

In [60]:
# For every raw file: compute the real FFT of each column, optionally plot it,
# and write the spectrum to a sibling "fft" directory as "<name>.fft.csv".
for i, file in enumerate(files):
    df = pd.read_csv(file, index_col="tiempo", parse_dates=True, date_parser=timestamper)

    # NOTE(review): only the real part of the spectrum is kept, so the
    # imaginary component is discarded. Confirm this is intended (the
    # magnitude would be numpy.abs of the rfft output).
    spectra = {column: fft.rfft(df[column]).real for column in df.columns}
    df_fft = pd.DataFrame(spectra)

    if plot:
        print("File {}, filename: {}".format(i+1,file))
        df_fft.plot()
        plt.legend()
        plt.show()

    # Build the output path from path components instead of str.replace, which
    # would also rewrite any other "raw" or ".csv" substring in the path.
    raw_dir, raw_name = os.path.split(file)
    fft_dir = os.path.join(os.path.dirname(raw_dir), "fft")
    # to_csv does not create missing directories, so make sure it exists.
    os.makedirs(fft_dir, exist_ok=True)
    stem, ext = os.path.splitext(raw_name)
    df_fft.to_csv(os.path.join(fft_dir, stem + ".fft" + ext))

## Statistics
We can transform each file into a set of statistical values such as mean and variance; this gives us a time series with one point per file.

In [99]:
# Collect one dict of summary statistics per raw file. Each dict holds
# "<column>-<stat>" entries for every column plus the file's first timestamp.
df_results = []
for file in files:
    df = pd.read_csv(file,index_col="tiempo", parse_dates=True, date_parser=timestamper)
    df_stats = {}
    for column in df.columns:
        series = df[column]
        df_stats[column+"-mean"] = series.mean()
        df_stats[column+"-std"] = series.std()
        df_stats[column+"-min"] = series.min()
        # Previously this stored series.min() under the "-max" key.
        df_stats[column+"-max"] = series.max()
        df_stats[column+"-median"] = series.median()
        df_stats[column+"-kurt"] = series.kurt()
        # Series.mad() was removed in pandas 2.0; compute the mean absolute
        # deviation around the mean explicitly (same value on older versions).
        df_stats[column+"-mad"] = (series - series.mean()).abs().mean()
        df_stats[column+"-var"] = series.var()

    # The first index entry of the file serves as the timestamp of this row.
    df_stats["time"] = df.index[0]
    df_results.append(df_stats)

In [100]:
# Turn the list of per-file statistic dicts into a table with one row per file.
df_results = pd.DataFrame(df_results)

In [104]:
# Use each file's timestamp as the row index. The guard keeps the cell safe to
# re-run: once "time" has been moved into the index the column no longer exists.
if "time" in df_results.columns:
    df_results = df_results.set_index("time")
# Files are not read in chronological order, so sort to get a proper time series.
df_results = df_results.sort_index()

In [112]:
# Save the per-file statistics table as CSV; the index is written as the first column.
df_results.to_csv("data/vibration_statistics.csv")