In [1]:
import os
import os.path
import fnmatch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
import matplotlib.mlab as mlab
import matplotlib.dates as mdates
import datetime as dt

# plotting params
# %matplotlib inline
# plt.style.use('seaborn')
# plt.rcParams['font.size'] = 10
# plt.rcParams['axes.labelsize'] = 10
# plt.rcParams['axes.labelweight'] = 'bold'
# plt.rcParams['axes.titlesize'] = 10
# plt.rcParams['xtick.labelsize'] = 8
# plt.rcParams['ytick.labelsize'] = 8
# plt.rcParams['legend.fontsize'] = 10
# plt.rcParams['figure.titlesize'] = 12
plt.rcParams['figure.figsize'] = (15.0, 8.0)
# plt.rcParams['savefig.dpi'] = 100
sns.set_style("white")

# for reloading modules
%load_ext autoreload
%autoreload 2

# path params
root_dir = '/Users/Farah/Desktop/Matteo/data/'
plot_dir = '/Users/Farah/Desktop/Matteo/data/'

## Information

* Each folder is a different date range
* Each file in the folder is the same date range but different conditions

* Have a graph for same file across 7 samples. This is equivalent to seeing the same condition over 7 different date ranges.
* Have a graph for average of v1 and v2 of same file across all 7 samples.

### Todo

* Fix x-axis
* PDF of all
* Outliers??

## Helper Functions

In [2]:
def pad_zeros(s):
    if len(s) == 1:
        return s.zfill(2)
    return s

def pad_year(s):
    if s[2][0] != '2':
        s[2] = '20' + s[2]
    
def fix_dates(d):
    ret = d.split('/')
    ret = list(map(pad_zeros, ret))
    pad_year(ret)
    return "/".join(ret)

## Read Directory

In [3]:
# path params
data_dir = root_dir + 'sample1/'
pattern = '*.csv'

# crawl directory and grab filenames
names = []
for path, subdirs, files in os.walk(data_dir):
    for filename in files:
        if fnmatch.fnmatch(filename, pattern):
            names.append(os.path.join(path, filename))

# remove original files
names = [x for x in names if 'original' not in x]
num_files = len(names)
print("\nThere are {} files.".format(num_files))


There are 46 files.


In [4]:
unique_files = [name[48:] for name in names]

In [5]:
unique_files

['Fshl_iw_1.csv',
 'Fshl_iw_2.csv',
 'Fshl_ow_1.csv',
 'Fshl_ow_2.csv',
 'Fshm_iw_1.csv',
 'Fshm_iw_2.csv',
 'Fshm_ow_1.csv',
 'Fshm_ow_2.csv',
 'Fshs_iw_1.csv',
 'Fshs_iw_2.csv',
 'Fshs_ow_1.csv',
 'Fshs_ow_2.csv',
 'Fsul_iw_1.csv',
 'Fsul_iw_2.csv',
 'Fsul_ow_1.csv',
 'Fsum_iw_1.csv',
 'Fsum_iw_2.csv',
 'Fsum_ow_1.csv',
 'Fsum_ow_2.csv',
 'Fsus_iw_1.csv',
 'Fsus_iw_2.csv',
 'Fsus_ow_1.csv',
 'Fsus_ow_2.csv',
 'Hshl_iw_1.csv',
 'Hshl_iw_2.csv',
 'Hshl_ow_1.csv',
 'Hshl_ow_2.csv',
 'Hshm_iw_1.csv',
 'Hshm_iw_2.csv',
 'Hshm_ow_1.csv',
 'Hshm_ow_2.csv',
 'Hshs_iw_1.csv',
 'Hshs_iw_2.csv',
 'Hshs_ow_1.csv',
 'Hshs_ow_2.csv',
 'resting_1.csv',
 'resting_10.csv',
 'resting_11.csv',
 'resting_2.csv',
 'resting_3.csv',
 'resting_4.csv',
 'resting_5.csv',
 'resting_6.csv',
 'resting_7.csv',
 'resting_8.csv',
 'resting_9.csv']

## Determine length of files in each folder

In [6]:
lengths = []
for i in range(7):
    name = root_dir + 'sample{}/'.format(i+1) + unique_files[0]
    cols = ['date', 'unit', 'value']
    df = pd.read_csv(name, header=None, names=cols)
    lengths.append(len(df))

In [7]:
lengths

[1737, 2062, 683, 1571, 2062, 2062, 2062]

## Have a graph for same file across 7 samples (Merged)

In [9]:
file = unique_files[1]
name = root_dir + 'sample{}/'.format(7) + file

In [None]:
df = pd.read_csv(name, header=None, names=cols)
idx = df.index[df['date'] == 'Date/Time'][0]
df = df.iloc[(idx+1):, :]
df = df.reset_index(drop=True)
df.value = df.value.astype(float)

In [None]:
df.head()

In [None]:
date = list(df.date.values)
date = list(map(lambda i: i.split(' ')[0], date))

In [None]:
print(beg)
print(end)

In [None]:
def get_month_yr_beg(s):
    return s['date'][0]

In [None]:
def get_month_yr_end(s):
    return s['date'][len(s)-1]

In [17]:
def format_date(x, pos=None):
    thisind = np.clip(int(x + 0.5), 0, N - 1)
    return temp[thisind]

error_plots = []

for j, file in enumerate(unique_files):
    
    print("Working with {}".format(file))
    names = []
    
    # create names
    for i in range(7):
        name = root_dir + 'sample{}/'.format(i+1) + file
        names.append(name)
    
    # create dataframe
    data = []
    cols = ['date', 'unit', 'value']
    dates = []
    for i, name in enumerate(names):
        try:
            # read in csv
            df = pd.read_csv(name, header=None, names=cols)
            # remove metadata junk
            idx = df.index[df['date'] == 'Date/Time'][0]
            df = df.iloc[(idx+1):, :]
            df = df.reset_index(drop=True)
            # fix column formats
            df.value = df.value.astype(float)
            # grab dates
            date = list(df.date.values)
            date = list(map(lambda i: i.split(' ')[0], date))
            dates.append(date)
            # store
            data.append(df)
        except:
            print("\tFile did not exist in sample{}.".format(i+1))
            error_plots.append(file)
            zeros = pd.DataFrame(np.zeros((lengths[i], 3)))
            data.append(zeros)
            date = ['01/01/2999'] * lengths[i]
            dates.append(date)
            

    # concatenate into 1 big dataframe
    df = pd.concat(data)
    
    # create directory
    os_path = plot_dir + os.path.splitext(file)[0] + '/'
    if not os.path.exists(os_path):
        os.makedirs(os_path)
        
    # flatten dates
    dates = [item for sublist in dates for item in sublist]
    
    # plot
    fig, ax = plt.subplots()
    # x-axis
    temp = list(map(fix_dates, dates))
    N = len(temp)
    ind = np.arange(N)
    # y-axis
    y_vals = df.value
    # meta
    lbl = os.path.splitext(file)[0]
    ax.plot(ind, y_vals, label=lbl, alpha=0.9, linewidth=0.9, color='tab:red')
    ax.xaxis.set_major_formatter(ticker.FuncFormatter(format_date))
    plt.legend(loc='upper right')
    ax.set_xlabel('Time')
    ax.set_ylabel('Temperature [C]')
    ax.set_title(file)
    plt.grid(True)
    fig.autofmt_xdate()
    plt.savefig(os_path + '{}.eps'.format(lbl), format='eps', dpi=300)

Working with Fshl_iw_1.csv
Working with Fshl_iw_2.csv
	File did not exist in sample4.
	File did not exist in sample5.
	File did not exist in sample6.


  result = result.union(other)
  union = _union_indexes(indexes)


Working with Fshl_ow_1.csv
Working with Fshl_ow_2.csv
Working with Fshm_iw_1.csv
Working with Fshm_iw_2.csv
Working with Fshm_ow_1.csv
Working with Fshm_ow_2.csv
Working with Fshs_iw_1.csv
	File did not exist in sample2.
Working with Fshs_iw_2.csv
	File did not exist in sample2.
	File did not exist in sample7.
Working with Fshs_ow_1.csv
Working with Fshs_ow_2.csv
Working with Fsul_iw_1.csv
	File did not exist in sample6.
Working with Fsul_iw_2.csv
	File did not exist in sample6.
	File did not exist in sample7.
Working with Fsul_ow_1.csv
Working with Fsum_iw_1.csv
Working with Fsum_iw_2.csv
	File did not exist in sample2.
	File did not exist in sample6.
Working with Fsum_ow_1.csv
Working with Fsum_ow_2.csv
Working with Fsus_iw_1.csv
	File did not exist in sample4.
	File did not exist in sample5.
	File did not exist in sample6.
Working with Fsus_iw_2.csv
	File did not exist in sample4.
	File did not exist in sample5.
	File did not exist in sample6.




Working with Fsus_ow_1.csv
	File did not exist in sample4.
	File did not exist in sample5.
	File did not exist in sample6.
Working with Fsus_ow_2.csv
	File did not exist in sample4.
	File did not exist in sample5.
	File did not exist in sample6.
	File did not exist in sample7.
Working with Hshl_iw_1.csv
Working with Hshl_iw_2.csv
	File did not exist in sample3.
Working with Hshl_ow_1.csv
Working with Hshl_ow_2.csv
Working with Hshm_iw_1.csv
Working with Hshm_iw_2.csv
	File did not exist in sample3.
Working with Hshm_ow_1.csv
Working with Hshm_ow_2.csv
Working with Hshs_iw_1.csv
	File did not exist in sample6.
Working with Hshs_iw_2.csv
	File did not exist in sample2.
	File did not exist in sample6.
Working with Hshs_ow_1.csv
	File did not exist in sample6.
Working with Hshs_ow_2.csv
	File did not exist in sample5.
	File did not exist in sample6.
Working with resting_1.csv
Working with resting_10.csv
Working with resting_11.csv
Working with resting_2.csv
Working with resting_3.csv
Worki

## Have a graph for same file across 7 samples (Separate)

In [11]:
def format_date(x, pos=None):
    thisind = np.clip(int(x + 0.5), 0, N - 1)
    return temp[thisind]

error_plots = []

for j, file in enumerate(unique_files):
    
    print("Working with {}".format(file))
    names = []
    
    # create names
    for i in range(7):
        name = root_dir + 'sample{}/'.format(i+1) + file
        names.append(name)
    
    # create dataframe
    data = []
    cols = ['date', 'unit', 'value']
    dates = []
    for i, name in enumerate(names):
        try:
            # read in csv
            df = pd.read_csv(name, header=None, names=cols)
            # remove metadata junk
            idx = df.index[df['date'] == 'Date/Time'][0]
            df = df.iloc[(idx+1):, :]
            df = df.reset_index(drop=True)
            # fix column formats
            df.value = df.value.astype(float)
            # grab dates
            date = list(df.date.values)
            date = list(map(lambda i: i.split(' ')[0], date))
            dates.append(date)
            # store
            data.append(df)
        except:
            print("\tFile did not exist in sample{}.".format(i+1))
            error_plots.append(file)
            date = ['01/01/2999'] * lengths[i]
            dates.append(date)
            data.append(pd.DataFrame())
            
    
    # create directory
    os_path = plot_dir + os.path.splitext(file)[0] + '/'
    if not os.path.exists(os_path):
        os.makedirs(os_path)
            
    # plot
    colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 
              'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan']
    
    for i in range(len(data)):
        if data[i].empty:
            print('plot error {}'.format(i+1))
            continue
        fig, ax = plt.subplots()
        # x-axis
        temp = list(map(fix_dates, dates[i]))
        N = len(temp)
        ind = np.arange(N)
        # y-axis
        y_vals = data[i].value
        # meta
        lbl = 'sample{}'.format(i+1)
        ax.plot(ind, y_vals, label=lbl, alpha=0.9, linewidth=0.9, color=colors[i])
        ax.xaxis.set_major_formatter(ticker.FuncFormatter(format_date))
        plt.legend(loc='upper right')
        ax.set_xlabel('Time')
        ax.set_ylabel('Temperature [C]')
        ax.set_title(file)
        plt.grid(True)
        fig.autofmt_xdate()
        plt.savefig(os_path + 'sample_{}.eps'.format(i+1), format='eps', dpi=300)

Working with Fshl_iw_1.csv




Working with Fshl_iw_2.csv
	File did not exist in sample4.
	File did not exist in sample5.
	File did not exist in sample6.
plot error 4
plot error 5
plot error 6
Working with Fshl_ow_1.csv
Working with Fshl_ow_2.csv
Working with Fshm_iw_1.csv
Working with Fshm_iw_2.csv
Working with Fshm_ow_1.csv
Working with Fshm_ow_2.csv
Working with Fshs_iw_1.csv
	File did not exist in sample2.
plot error 2
Working with Fshs_iw_2.csv
	File did not exist in sample2.
	File did not exist in sample7.
plot error 2
plot error 7
Working with Fshs_ow_1.csv
Working with Fshs_ow_2.csv
Working with Fsul_iw_1.csv
	File did not exist in sample6.
plot error 6
Working with Fsul_iw_2.csv
	File did not exist in sample6.
	File did not exist in sample7.
plot error 6
plot error 7
Working with Fsul_ow_1.csv
Working with Fsum_iw_1.csv
Working with Fsum_iw_2.csv
	File did not exist in sample2.
	File did not exist in sample6.
plot error 2
plot error 6
Working with Fsum_ow_1.csv
Working with Fsum_ow_2.csv
Working with Fsus_i

In [None]:
error_plots = list(set(error_plots))
error_plots

## Average of v1 and v2

In [8]:
# path params
data_dir = root_dir + 'sample1/'
pattern = '*.csv'

# crawl directory and grab filenames
names = []
for path, subdirs, files in os.walk(data_dir):
    for filename in files:
        if fnmatch.fnmatch(filename, pattern):
            names.append(os.path.join(path, filename))

# remove original files
names = [x for x in names if 'original' not in x and 'resting' not in x]
num_files = len(names)
print("\nThere are {} files.".format(num_files))


There are 35 files.


In [9]:
names[0:10]

['/Users/Farah/Desktop/Matteo/data/sample1/Fshl_iw_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fshl_iw_2.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fshl_ow_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fshl_ow_2.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fshm_iw_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fshm_iw_2.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fshm_ow_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fshm_ow_2.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fshs_iw_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fshs_iw_2.csv']

In [10]:
unique_files = [name[48:] for name in names]
unique_files[0:10]

['Fshl_iw_1.csv',
 'Fshl_iw_2.csv',
 'Fshl_ow_1.csv',
 'Fshl_ow_2.csv',
 'Fshm_iw_1.csv',
 'Fshm_iw_2.csv',
 'Fshm_ow_1.csv',
 'Fshm_ow_2.csv',
 'Fshs_iw_1.csv',
 'Fshs_iw_2.csv']

In [55]:
i = 8
name1 = unique_files[i]
name2 = unique_files[i+1]

names = []
# create names
for i in range(7):
    n1 = root_dir + 'sample{}/'.format(i+1) + name1
    n2 = root_dir + 'sample{}/'.format(i+1) + name2
    names.extend([n1, n2])
names

['/Users/Farah/Desktop/Matteo/data/sample1/Fshs_iw_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fshs_iw_2.csv',
 '/Users/Farah/Desktop/Matteo/data/sample2/Fshs_iw_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample2/Fshs_iw_2.csv',
 '/Users/Farah/Desktop/Matteo/data/sample3/Fshs_iw_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample3/Fshs_iw_2.csv',
 '/Users/Farah/Desktop/Matteo/data/sample4/Fshs_iw_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample4/Fshs_iw_2.csv',
 '/Users/Farah/Desktop/Matteo/data/sample5/Fshs_iw_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample5/Fshs_iw_2.csv',
 '/Users/Farah/Desktop/Matteo/data/sample6/Fshs_iw_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample6/Fshs_iw_2.csv',
 '/Users/Farah/Desktop/Matteo/data/sample7/Fshs_iw_1.csv',
 '/Users/Farah/Desktop/Matteo/data/data/sample7/Fshs_iw_2.csv']

In [61]:
def average_both(n1, n2):
    cols = ['date', 'unit', 'value']
    
    # create dataframes
    df1 = pd.read_csv(n1, header=None, names=cols)
    df2 = pd.read_csv(n2, header=None, names=cols)
    df1 = df1.copy()
    df2 = df2.copy()
    
    # preprocess
    idx = df1.index[df1['date'] == 'Date/Time'][0]
    df1 = df1.iloc[(idx+1):, :]
    df1 = df1.reset_index(drop=True)
    idx = df2.index[df2['date'] == 'Date/Time'][0]
    df2 = df2.iloc[(idx+1):, :]
    df2 = df2.reset_index(drop=True)
    df1.value = df1.value.astype(float)
    df2.value = df1.value.astype(float)
    
    # concat
    df1.value = pd.concat([df1.value, df2.value], axis=1).mean(axis=1)
    
    return df1

def take_one(n):
    cols = ['date', 'unit', 'value']
    
    # create dataframe
    df = pd.read_csv(n, header=None, names=cols)
    
    # preprocess
    idx = df.index[df['date'] == 'Date/Time'][0]
    df = df.iloc[(idx+1):, :]
    df = df.reset_index(drop=True)
    df.value = df.value.astype(float)

    return df

def nothing(i):
    zeros = pd.DataFrame(np.zeros((lengths[i], 3)))
    date = ['01/01/2999'] * lengths[i]
    return zeros, date

In [81]:
unique_files[0][:7]

'Fshl_iw'

In [87]:
unique_files

['Fshl_iw_1.csv',
 'Fshl_iw_2.csv',
 'Fshl_ow_1.csv',
 'Fshl_ow_2.csv',
 'Fshm_iw_1.csv',
 'Fshm_iw_2.csv',
 'Fshm_ow_1.csv',
 'Fshm_ow_2.csv',
 'Fshs_iw_1.csv',
 'Fshs_iw_2.csv',
 'Fshs_ow_1.csv',
 'Fshs_ow_2.csv',
 'Fsul_iw_1.csv',
 'Fsul_iw_2.csv',
 'Fsul_ow_1.csv',
 'Fsum_iw_1.csv',
 'Fsum_iw_2.csv',
 'Fsum_ow_1.csv',
 'Fsum_ow_2.csv',
 'Fsus_iw_1.csv',
 'Fsus_iw_2.csv',
 'Fsus_ow_1.csv',
 'Fsus_ow_2.csv',
 'Hshl_iw_1.csv',
 'Hshl_iw_2.csv',
 'Hshl_ow_1.csv',
 'Hshl_ow_2.csv',
 'Hshm_iw_1.csv',
 'Hshm_iw_2.csv',
 'Hshm_ow_1.csv',
 'Hshm_ow_2.csv',
 'Hshs_iw_1.csv',
 'Hshs_iw_2.csv',
 'Hshs_ow_1.csv',
 'Hshs_ow_2.csv']

In [89]:
for i in range(0, len(unique_files), 2):
    
    name1 = unique_files[i]
    name2 = unique_files[i+1]
    
    print('Working with {}'.format(name1[:7]))

    names = []
    for k in range(7):
        n1 = root_dir + 'sample{}/'.format(k+1) + name1
        n2 = root_dir + 'sample{}/'.format(k+1) + name2
        names.extend([n1, n2])

    data = []
    dates = []
    file = names[0][48:55]

    for j in range(0, len(names), 2):

        first = names[j]
        second = names[j+1]
        sample_num = int(first[46])

        print('\tWorking with sample {}'.format(first[46]))

        # check if it exists
        if os.path.exists(first):
            # check second
            if os.path.exists(second):
                print('\t\tAveraging...')
                # average both
                d = average_both(first, second)
                # grab dates
                date = list(d.date.values)
                date = list(map(lambda p: p.split(' ')[0], date))
                # store
                dates.append(date)
                data.append(d)
            else:
                # take the first
                print('\t\tTaking first...')
                d = take_one(first)
                # grab dates
                date = list(d.date.values)
                date = list(map(lambda p: p.split(' ')[0], date))
                # store
                dates.append(date)
                data.append(d)
        else:
            # check if the second exists
            if os.path.exists(second):
                # take the second
                print('\t\tTaking second...')
                d = take_one(second)
                # grab dates
                date = list(d.date.values)
                date = list(map(lambda p: p.split(' ')[0], date))
                # store
                dates.append(date)
                data.append(d)
            else:
                # take nothing and return an empty
                print('\t\t[*] Empty for both!...')
                d, date = nothing(sample_num)
                dates.append(date)
                data.append(d)
    
    # concatenate into 1 big dataframe
    print('\n')
    avg = pd.concat(data)

    # flatten dates
    dates = [item for sublist in dates for item in sublist]

    # plot
    plt.clf()
    fig, ax = plt.subplots()
    # x-axis
    temp = list(map(fix_dates, dates))
    N = len(temp)
    ind = np.arange(N)
    # y-axis
    y_vals = avg.value
    # meta
    ax.plot(ind, y_vals, label=file, alpha=0.9, linewidth=0.9, color='tab:red')
    ax.xaxis.set_major_formatter(ticker.FuncFormatter(format_date))
    plt.legend(loc='upper right')
    ax.set_xlabel('Time')
    ax.set_ylabel('Temperature [C]')
    ax.set_title(file)
    plt.grid(True)
    fig.autofmt_xdate()
    plt.savefig(plot_dir + 'averages/' + '{}.eps'.format(file), format='eps', dpi=300)

Working with Fshl_iw
	Working with sample 1
		Averaging...
	Working with sample 2
		Averaging...
	Working with sample 3
		Averaging...
	Working with sample 4
		Taking first...
	Working with sample 5
		Taking first...
	Working with sample 6
		Taking first...
	Working with sample 7
		Averaging...






Working with Fshl_ow
	Working with sample 1
		Averaging...
	Working with sample 2
		Averaging...
	Working with sample 3
		Averaging...
	Working with sample 4
		Averaging...
	Working with sample 5
		Averaging...
	Working with sample 6
		Averaging...
	Working with sample 7
		Averaging...


Working with Fshm_iw
	Working with sample 1
		Averaging...
	Working with sample 2
		Averaging...
	Working with sample 3
		Averaging...
	Working with sample 4
		Averaging...
	Working with sample 5
		Averaging...
	Working with sample 6
		Averaging...
	Working with sample 7
		Averaging...


Working with Fshm_ow
	Working with sample 1
		Averaging...
	Working with sample 2
		Averaging...
	Working with sample 3
		Averaging...
	Working with sample 4
		Averaging...
	Working with sample 5
		Averaging...
	Working with sample 6
		Averaging...
	Working with sample 7
		Averaging...


Working with Fshs_iw
	Working with sample 1
		Averaging...
	Working with sample 2
		[*] Empty for both!...
	Working with sample 3
		A

  union = _union_indexes(indexes)


Working with Fshs_ow
	Working with sample 1
		Averaging...
	Working with sample 2
		Averaging...
	Working with sample 3
		Averaging...
	Working with sample 4
		Averaging...
	Working with sample 5
		Averaging...
	Working with sample 6
		Averaging...
	Working with sample 7
		Averaging...


Working with Fsul_iw
	Working with sample 1
		Averaging...
	Working with sample 2
		Averaging...
	Working with sample 3
		Averaging...
	Working with sample 4
		Averaging...
	Working with sample 5
		Averaging...
	Working with sample 6
		[*] Empty for both!...
	Working with sample 7
		Taking first...


Working with Fsul_ow
	Working with sample 1
		Averaging...
	Working with sample 2
		Averaging...
	Working with sample 3
		Averaging...
	Working with sample 4
		Averaging...
	Working with sample 5
		Averaging...
	Working with sample 6
		Averaging...
	Working with sample 7
		Averaging...


Working with Fsum_iw
	Working with sample 1
		Averaging...
	Working with sample 2
		Taking second...
	Working with sampl

  result = result.union(other)


Working with Fsus_ow
	Working with sample 1
		Averaging...
	Working with sample 2
		Averaging...
	Working with sample 3
		Averaging...
	Working with sample 4
		Taking second...
	Working with sample 5
		Taking second...
	Working with sample 6
		Taking second...
	Working with sample 7
		Taking second...


Working with Hshl_iw
	Working with sample 1
		Averaging...
	Working with sample 2
		Averaging...
	Working with sample 3
		Taking second...
	Working with sample 4
		Averaging...
	Working with sample 5
		Averaging...
	Working with sample 6
		Averaging...
	Working with sample 7
		Averaging...


Working with Hshl_ow
	Working with sample 1
		Averaging...
	Working with sample 2
		Averaging...
	Working with sample 3
		Averaging...
	Working with sample 4
		Averaging...
	Working with sample 5
		Averaging...
	Working with sample 6
		Averaging...
	Working with sample 7
		Averaging...


Working with Hshm_iw
	Working with sample 1
		Averaging...
	Working with sample 2
		Averaging...
	Working with sa

IndexError: list index out of range

In [76]:
def format_date(x, pos=None):
    thisind = np.clip(int(x + 0.5), 0, N - 1)
    return temp[thisind]

# concatenate into 1 big dataframe
avg = pd.concat(data)

# flatten dates
dates = [item for sublist in dates for item in sublist]

# plot
fig, ax = plt.subplots()
# x-axis
temp = list(map(fix_dates, dates))
N = len(temp)
ind = np.arange(N)
# y-axis
y_vals = avg.value
# meta
ax.plot(ind, y_vals, label=file, alpha=0.9, linewidth=0.9, color='tab:red')
ax.xaxis.set_major_formatter(ticker.FuncFormatter(format_date))
plt.legend(loc='upper right')
ax.set_xlabel('Time')
ax.set_ylabel('Temperature [C]')
ax.set_title(file)
plt.grid(True)
fig.autofmt_xdate()
plt.savefig(plot_dir + 'averages/' + '{}.eps'.format(file), format='eps', dpi=300)

  union = _union_indexes(indexes)


In [None]:
skip = False
unique = unique_files[5]

print("Working with: {}".format(unique))

across_folders = [x for x in names if unique in x][1:]

data = []
dates = []
prev = 0
for i in range(len(across_folders)):

    if skip:
        skip = False
        continue

    cur = int(across_folders[i][48:57][-1])
    print("prev: {}".format(prev))
    print("cur: {}".format(cur))
    print("\tWorking with {}".format(across_folders[i][48:57]))
    
    # if last file
    if i == len(across_folders) - 1:
        print("\tLAST FILE: {}".format(across_folders[i][48:56][-1]))
        df = pd.read_csv(across_folders[i], header=None, names=cols)
        idx = df.index[df['date'] == 'Date/Time'][0]
        df = df.iloc[(idx+1):, :]
        df = df.reset_index(drop=True)
        df.value = df.value.astype(float)
        data.append(df)
    else:
        # if sample doesn't exist in both
        if cur != prev + 1:
            print("\t\tMissing in both: {}".format(prev+1))
            diff = cur - (prev + 1)
            for j in range(diff):
                data.append(pd.DataFrame())
                # date = ['01/01/2999'] * lengths[i]
                # dates.append(date)

        # if 2 files ==> average
        elif across_folders[i][48:55] == across_folders[i+1][48:55]:
            print("\t\tAverage!")
            skip = True
            cols = ['date', 'unit', 'value']
            df1 = pd.read_csv(across_folders[i], header=None, names=cols)
            df2 = pd.read_csv(across_folders[i+1], header=None, names=cols)
            df1 = df1.copy()
            df2 = df2.copy()
            idx = df1.index[df1['date'] == 'Date/Time'][0]
            df1 = df1.iloc[(idx+1):, :]
            df1 = df1.reset_index(drop=True)
            idx = df2.index[df2['date'] == 'Date/Time'][0]
            df2 = df2.iloc[(idx+1):, :]
            df2 = df2.reset_index(drop=True)
            df1.value = df1.value.astype(float)
            df2.value = df1.value.astype(float)
            df1.value = pd.concat([df1.value, df2.value], axis=1).mean(axis=1)
            data.append(df1)
            prev = 0

        # else 1 file ==> like before
        else:
            print("\t\tTaking from folder: {}".format(across_folders[i][48:57][-1]))
            # read in csv
            df = pd.read_csv(across_folders[i], header=None, names=cols)
            # remove metadata junk
            idx = df.index[df['date'] == 'Date/Time'][0]
            df = df.iloc[(idx+1):, :]
            df = df.reset_index(drop=True)
            # fix column formats
            df.value = df.value.astype(float)
            # store
            data.append(df)
            # save name
            prev = 0

In [122]:
names = []
for i in range(7):
    data_dir = root_dir + 'sample{}/'.format(i+1)
    pattern = '*.csv'

    # crawl directory and grab filenames
    for path, subdirs, files in os.walk(data_dir):
        for filename in files:
            if fnmatch.fnmatch(filename, pattern):
                names.append(os.path.join(path, filename))
                
# remove original files
names = [x for x in names if 'original' not in x and 'resting' not in x]
num_files = len(names)
print("\nThere are {} files.".format(num_files))


There are 220 files.


In [123]:
# get unique files
unique_files = [name[40:47] for name in names]
unique_files = list(set(unique_files))
len(unique_files)

7

In [124]:
names

['/Users/Farah/Desktop/Matteo/data/sample1/Fshl_iw_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fshl_iw_2.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fshl_ow_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fshl_ow_2.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fshm_iw_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fshm_iw_2.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fshm_ow_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fshm_ow_2.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fshs_iw_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fshs_iw_2.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fshs_ow_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fshs_ow_2.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fsul_iw_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fsul_iw_2.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fsul_ow_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fsum_iw_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample1/Fsum_iw_2.csv

In [117]:
across_folders = [x for x in names if unique_files[5] in x][1:]

In [118]:
across_folders

['/Users/Farah/Desktop/Matteo/data/sample6/Fshl_iw_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample6/Fshl_ow_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample6/Fshl_ow_2.csv',
 '/Users/Farah/Desktop/Matteo/data/sample6/Fshm_iw_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample6/Fshm_iw_2.csv',
 '/Users/Farah/Desktop/Matteo/data/sample6/Fshm_ow_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample6/Fshm_ow_2.csv',
 '/Users/Farah/Desktop/Matteo/data/sample6/Fshs_iw_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample6/Fshs_iw_2.csv',
 '/Users/Farah/Desktop/Matteo/data/sample6/Fshs_ow_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample6/Fshs_ow_2.csv',
 '/Users/Farah/Desktop/Matteo/data/sample6/Fsul_ow_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample6/Fsul_ow_2.csv',
 '/Users/Farah/Desktop/Matteo/data/sample6/Fsum_iw_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample6/Fsum_ow_1.csv',
 '/Users/Farah/Desktop/Matteo/data/sample6/Fsum_ow_2.csv',
 '/Users/Farah/Desktop/Matteo/data/sample6/Hshl_iw_1.csv

In [113]:
across_folders[0][48:55]

'Fshl_iw'

In [121]:
skip = False
unique = unique_files[5]

print("Working with: {}".format(unique))

across_folders = [x for x in names if unique in x][1:]

data = []
dates = []
prev = 0
for i in range(len(across_folders)):

    if skip:
        skip = False
        continue

    cur = int(across_folders[i][48:57][-1])
    print("prev: {}".format(prev))
    print("cur: {}".format(cur))
    print("\tWorking with {}".format(across_folders[i][48:57]))
    
    # if last file
    if i == len(across_folders) - 1:
        print("\tLAST FILE: {}".format(across_folders[i][48:56][-1]))
        df = pd.read_csv(across_folders[i], header=None, names=cols)
        idx = df.index[df['date'] == 'Date/Time'][0]
        df = df.iloc[(idx+1):, :]
        df = df.reset_index(drop=True)
        df.value = df.value.astype(float)
        data.append(df)
    else:
        # if sample doesn't exist in both
        if cur != prev + 1:
            print("\t\tMissing in both: {}".format(prev+1))
            diff = cur - (prev + 1)
            for j in range(diff):
                data.append(pd.DataFrame())
                # date = ['01/01/2999'] * lengths[i]
                # dates.append(date)

        # if 2 files ==> average
        elif across_folders[i][48:55] == across_folders[i+1][48:55]:
            print("\t\tAverage!")
            skip = True
            cols = ['date', 'unit', 'value']
            df1 = pd.read_csv(across_folders[i], header=None, names=cols)
            df2 = pd.read_csv(across_folders[i+1], header=None, names=cols)
            df1 = df1.copy()
            df2 = df2.copy()
            idx = df1.index[df1['date'] == 'Date/Time'][0]
            df1 = df1.iloc[(idx+1):, :]
            df1 = df1.reset_index(drop=True)
            idx = df2.index[df2['date'] == 'Date/Time'][0]
            df2 = df2.iloc[(idx+1):, :]
            df2 = df2.reset_index(drop=True)
            df1.value = df1.value.astype(float)
            df2.value = df1.value.astype(float)
            df1.value = pd.concat([df1.value, df2.value], axis=1).mean(axis=1)
            data.append(df1)
            prev = 0

        # else 1 file ==> like before
        else:
            print("\t\tTaking from folder: {}".format(across_folders[i][48:57][-1]))
            # read in csv
            df = pd.read_csv(across_folders[i], header=None, names=cols)
            # remove metadata junk
            idx = df.index[df['date'] == 'Date/Time'][0]
            df = df.iloc[(idx+1):, :]
            df = df.reset_index(drop=True)
            # fix column formats
            df.value = df.value.astype(float)
            # store
            data.append(df)
            # save name
            prev = 0

Working with: sample6
prev: 0
cur: 1
	Working with Fshl_iw_1
		Taking from folder: 1
prev: 0
cur: 1
	Working with Fshl_ow_1
		Average!
prev: 0
cur: 1
	Working with Fshm_iw_1
		Average!
prev: 0
cur: 1
	Working with Fshm_ow_1
		Average!
prev: 0
cur: 1
	Working with Fshs_iw_1
		Average!
prev: 0
cur: 1
	Working with Fshs_ow_1
		Average!
prev: 0
cur: 1
	Working with Fsul_ow_1
		Average!
prev: 0
cur: 1
	Working with Fsum_iw_1
		Taking from folder: 1
prev: 0
cur: 1
	Working with Fsum_ow_1
		Average!
prev: 0
cur: 1
	Working with Hshl_iw_1
		Average!
prev: 0
cur: 1
	Working with Hshl_ow_1
		Average!
prev: 0
cur: 1
	Working with Hshm_iw_1
		Average!
prev: 0
cur: 1
	Working with Hshm_ow_1
		Average!


In [29]:
skip = False

for j, unique in enumerate(unique_files):
    
    if j >= 1:
        break
        
    print("Working with {}: {}".format(j, unique))
    
    across_folders = [x for x in names if unique in x]
    
    data = []
    prev = 0
    for i in range(len(across_folders)):
    
        if skip:
            skip = False
            continue
        
        cur = int(across_folders[i][32:39][-1])
        print("\tWorking with {}".format(across_folders[i][32:39]))
        
        # if sample doesn't exist in both
        if cur != prev + 1:
            print("\tMissing in both: {}".format(prev+1))
            diff = cur - (prev + 1)
            for j in range(diff):
                data.append(pd.DataFrame())
        
        # if last file
        if i == len(across_folders)-1:
            print("\tLAST Taking from folder: {}".format(across_folders[i][40:49][-1]))
            # read in csv
            df = pd.read_csv(across_folders[i], header=None, names=cols)
            # remove metadata junk
            idx = df.index[df['date'] == 'Date/Time'][0]
            df = df.iloc[(idx+1):, :]
            df = df.reset_index(drop=True)
            # fix column formats
            df.value = df.value.astype(float)
            # store
            data.append(df)
        
        # if 2 files ==> average
        elif across_folders[i][38] == across_folders[i+1][38]:
            print("\tAverage!")
            skip = True
            
            # create df
            cols = ['date', 'unit', 'value']

            # read in csv
            df1 = pd.read_csv(across_folders[i], header=None, names=cols)
            df2 = pd.read_csv(across_folders[i+1], header=None, names=cols)

            df1 = df1.copy()
            df2 = df2.copy()

            # remove metadata junk
            idx = df1.index[df1['date'] == 'Date/Time'][0]
            df1 = df1.iloc[(idx+1):, :]
            df1 = df1.reset_index(drop=True)

            idx = df2.index[df2['date'] == 'Date/Time'][0]
            df2 = df2.iloc[(idx+1):, :]
            df2 = df2.reset_index(drop=True)

            # fix column formats
            df1.value = df1.value.astype(float)
            df2.value = df1.value.astype(float)

            df1.value = pd.concat([df1.value, df2.value], axis=1).mean(axis=1)
            data.append(df1)
        
        # else 1 file ==> like before
        else:
            print("\tTaking from folder: {}".format(across_folders[i][40:49][-1]))
            # read in csv
            df = pd.read_csv(across_folders[i], header=None, names=cols)
            # remove metadata junk
            idx = df.index[df['date'] == 'Date/Time'][0]
            df = df.iloc[(idx+1):, :]
            df = df.reset_index(drop=True)
            # fix column formats
            df.value = df.value.astype(float)
            # store
            data.append(df)
        
        # save name
        prev = cur
        
    # fix emptys   
    shit = []
    cumul = 0
    skipz = 0
    for i in range(len(data)):
        if skipz > 0:
            skipz -= 1
            continue
        if data[i].empty:
            print("{} EMPTY!!".format(i+1))
            for k in range(i, len(data)):
                if data[k].empty:
                    continue
                else:
                    break
            print("K: {}".format(k))
            print("cumul: {}".format(cumul))
            # poop = np.arange((i)*len(data[i+1]), (i+1)*len(data[i+1]))
            poop = np.arange(cumul, (k+1)*len(data[k]))
            y_vals = np.zeros(len(poop))
            y_vals[y_vals == 0] = np.NAN
            skipz = k - i
        else:
            y_vals = data[i].value
            cumul += len(y_vals)
        # poop = np.column_stack([x_vals, y_vals])
        shit.append(y_vals)
    # print(shit[5][0])
    y_s = np.concatenate(shit)
    print(len(y_s))
    x_s = np.arange(len(y_s))
    
    # create directory
    os_path = plot_dir + os.path.splitext(unique)[0] + '/'
    print(os_path)
    if not os.path.exists(os_path):
        os.makedirs(os_path)
    
    # plot
    fig, ax = plt.subplots()
    lbl = os.path.splitext(unique)[0] + '_avg'
    plt.plot(x_s, y_s, label=lbl, alpha=0.9, linewidth=0.9)
    plt.legend(loc='upper right')
    plt.xlabel('Time')
    plt.ylabel('Temperature [C]')
    plt.title(lbl)
    plt.savefig(os_path + '{}.eps'.format(lbl), format='eps', dpi=300)

Working with 0: sample2
	Working with sample2
	Missing in both: 1
	Average!
	Working with sample2
	Missing in both: 3
	Average!
	Working with sample2
	Missing in both: 3
	Average!
	Working with sample2
	Missing in both: 3
	Average!
	Working with sample2
	Missing in both: 3
	Average!
	Working with sample2
	Missing in both: 3
	Average!
	Working with sample2
	Missing in both: 3
	Average!
	Working with sample2
	Missing in both: 3
	Average!
	Working with sample2
	Missing in both: 3
	Average!
	Working with sample2
	Missing in both: 3
	Average!
	Working with sample2
	Missing in both: 3
	Average!
	Working with sample2
	Missing in both: 3
	Average!
	Working with sample2
	Missing in both: 3
	Average!
	Working with sample2
	Missing in both: 3
	Average!
	Working with sample2
	Missing in both: 3
	Average!
	Working with sample2
	Missing in both: 3
	Average!
1 EMPTY!!
K: 1
cumul: 0
34816
/Users/Farah/Desktop/Matteo/plots/sample2/




## Combinations

In [None]:
combs = ['Fshl_iw', 'Fshm_iw', 'Fshs_iw']

In [None]:
names = []
for i in range(7):
    data_dir = root_dir + 'sample{}/'.format(i+1)
    pattern = '*.csv'

    # crawl directory and grab filenames
    for path, subdirs, files in os.walk(data_dir):
        for filename in files:
            if fnmatch.fnmatch(filename, pattern):
                names.append(os.path.join(path, filename))
                
# remove original files
names = [x for x in names if 'original' not in x and 'resting' not in x]
num_files = len(names)
print("\nThere are {} files.".format(num_files))

In [None]:
subset = []
for cb in combs:
    sub = [x for x in names if cb in x]
    subset.append(sub)
subset = [item for sublist in subset for item in sublist]

In [None]:
across_folders = [x for x in subset if combs[0] in x]
across_folders

In [None]:
len(subset)

In [None]:
skip = False
plots = []
for j, unique in enumerate(combs):
    print("Working with {}: {}".format(j, unique))
        
    across_folders = [x for x in subset if unique in x]
    
    data = []
    prev = 0
    for i in range(len(across_folders)):

        if skip:
            skip = False
            continue

        cur = int(across_folders[i][32:39][-1])
        print("\tWorking with {}".format(across_folders[i][32:39]))

        if cur != prev + 1:
            diff = cur - (prev + 1)
            for j in range(diff):
                data.append(pd.DataFrame())

        # if last file
        if i == len(across_folders)-1:
            # read in csv
            df = pd.read_csv(names[i], header=None, names=cols)
            # remove metadata junk
            idx = df.index[df['date'] == 'Date/Time'][0]
            df = df.iloc[(idx+1):, :]
            df = df.reset_index(drop=True)
            # fix column formats
            df.value = df.value.astype(float)
            # store
            data.append(df)

        # if 2 files ==> average
        elif across_folders[i][38] == across_folders[i+1][38]:
            print("\tAverage!")
            skip = True

            # create df
            cols = ['date', 'unit', 'value']

            # read in csv
            df1 = pd.read_csv(names[i], header=None, names=cols)
            df2 = pd.read_csv(names[i+1], header=None, names=cols)

            df1 = df1.copy()
            df2 = df2.copy()

            # remove metadata junk
            idx = df1.index[df1['date'] == 'Date/Time'][0]
            df1 = df1.iloc[(idx+1):, :]
            df1 = df1.reset_index(drop=True)

            idx = df2.index[df2['date'] == 'Date/Time'][0]
            df2 = df2.iloc[(idx+1):, :]
            df2 = df2.reset_index(drop=True)

            # fix column formats
            df1.value = df1.value.astype(float)
            df2.value = df1.value.astype(float)

            df1.value = pd.concat([df1.value, df2.value], axis=1).mean(axis=1)
            data.append(df1)

        # else 1 file ==> like before
        else:
            # read in csv
            df = pd.read_csv(names[i], header=None, names=cols)
            # remove metadata junk
            idx = df.index[df['date'] == 'Date/Time'][0]
            df = df.iloc[(idx+1):, :]
            df = df.reset_index(drop=True)
            # fix column formats
            df.value = df.value.astype(float)
            # store
            data.append(df)

        # save name
        prev = cur

    # fix emptys   
    shit = []
    for i in range(len(data)):
        if data[i].empty:
            x_vals = np.arange((i)*len(data[i+1]), (i+1)*len(data[i+1]))
            y_vals = np.zeros(x_vals.shape)
            y_vals[y_vals == 0] = np.NAN
        else:
            x_vals = np.arange(i*len(data[i]), (i+1)*len(data[i]))
            y_vals = data[i].value
        poop = np.column_stack([x_vals, y_vals])
        shit.append(poop)
    shit = np.concatenate(shit)
    plots.append(shit)

In [None]:
len(plots)

In [None]:
plots[0].shape

In [None]:
plots[1].shape

In [None]:
plots[2].shape

In [None]:
fig, ax = plt.subplots()
for i in range(len(plots)):
    plt.plot(plots[i][:, 0], plots[i][:, 1], label=combs[i], alpha=0.3, linewidth=0.9)
plt.legend(loc='upper right')
plt.xlabel('Time')
plt.ylabel('Temperature [C]')
plt.savefig('/Users/Farah/Desktop/poop.eps', format='eps', dpi=300)