In [None]:
%matplotlib notebook 
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import scipy
from scipy import stats

In [None]:
# Load the 2018 expeditions CSV; the first line of the file supplies the header.
# NOTE(review): this is an absolute, machine-specific path -- adjust it before running elsewhere.
data2018 = pd.read_csv(
    filepath_or_buffer='C:\\Users\\Francis\\Google Drive\\Data\\___PACE\\STEM-Summer2018\\2018expeditions.csv',
    header=0,
)

In [None]:
# Preview the first five rows of the raw table.
data2018.head(n=5)

In [None]:
# Display the column names of the table as loaded (before it is transposed below).
data2018.columns

In [None]:
# Preview the last five rows of the raw table.
data2018.tail(n=5)

In [None]:
# Transpose the data: rows become columns and columns become rows.
# This rebinds data2018, so running the cell a second time flips the table back.
data2018 = data2018.transpose()

In [None]:
# Check the result of the transpose.
data2018.head(n=5)

In [None]:
# First row of the transposed table (every column of positional row 0).
data2018.iloc[0, :]

In [None]:
# Use the values in the first row as the column names instead of 1, 2, ...
data2018.columns = data2018.iloc[0, :]

In [None]:
# Check that the column names were replaced.
data2018.head(n=5)

In [None]:
# The first row of the transposed data holds the variable names, not data, so remove it.
# drop() works by label: every row whose index label equals the first row's label is removed.
# This rebinds data2018, so re-running the cell drops another row.
data2018 = data2018.drop(labels=data2018.index[0], axis=0)

In [None]:
# Take a look and make sure the header row is gone and the data starts on the first row.
data2018.head(n=5)

In [None]:
# (number of rows, number of columns) of the cleaned, transposed table.
data2018.shape

In [None]:
# Print every variable (column) name, one per line.
for column_name in data2018.columns:
    print(column_name)

In [None]:
# You can explore specific subsets of the data.
# "Oil sheen present? (1=Yes,0=No)" is the second column, so it can be extracted by position.
# Python indexing is zero based, hence column position 1.
# Print that column's value for the first three rows, one per line.
print(*(data2018.iloc[row, 1] for row in range(3)), sep='\n')

In [None]:
# Display the measurements for all substrates for the second expedition:
# select the "Measurements (mm)" column(s) first, then take positional row 1.
data2018["Measurements (mm)"].iloc[1]

In [None]:
# Several columns can be selected at once by passing a list of column names.
data2018.loc[:, ["Measurements (mm)", "Total number of all live oysters"]]

In [None]:
# Extract the values for each substrate shell for one expedition.
# Expeditions are the rows, so .iloc picks the expedition by row position (0 = first).
# The result is a numpy.ndarray; each non-missing element is a str holding the
# comma-separated measurements for one substrate shell.
vals = data2018["Measurements (mm)"].iloc[0].values

In [None]:
# What is the Python type of vals? (It is the container type, not the element dtype.)
type(vals)

In [None]:
# What is the type of the first element of vals?
# vals is an array (not a list); this checks whether its first element is a str.
type(vals[0])

In [None]:
# Display the raw array: it holds a mix of str entries and nan (missing) entries.
vals # we see a mix of str and nan

In [None]:
# Splitting one measurement string on commas; show the type of the result.
type(vals[0].split(","))

In [None]:
# We can't do analysis yet because every entry of vals is still text.
# Splitting the first string breaks out the individual values, but each piece is still a str.
# We also have to deal with nan entries:
# calling split() on a nan raises an error (nan is a float, not a str).
vals[0].split(",")

In [None]:
# Demonstration: vals[1] is nan, and nan is a float, so it has no .split() method.
# The AttributeError is caught and printed so the point is still made without
# stopping a "Restart Kernel -> Run All" at this cell.
try:
    vals[1].split(",")
except AttributeError as err:
    print(type(err).__name__ + ':', err)

In [None]:
# pandas Series and DataFrames provide dropna() to remove nan entries.
# Wrapping vals in a Series lets us drop the missing substrate shells.
pd.Series(vals).dropna()

In [None]:
# to_numeric() converts the text pieces of the first measurement string into numbers.
pieces = pd.Series(vals[0].split(","))
pd.to_numeric(pieces)

In [None]:
# Walk through the substrate shells of this expedition and report each one's mean.
# Entries that are not str (i.e. nan) are skipped.
# Each string is read straight from vals and converted once, instead of rebuilding
# pd.Series(vals).dropna() and re-converting for every print.
for shell_no, raw in enumerate(vals, start=1):
    if isinstance(raw, str):  # only str entries can be split into numbers
        sizes = pd.to_numeric(raw.split(","))
        print(sizes)
        # also print the mean for this substrate shell
        print('Substrate Shell #' + str(shell_no) + ": \t The mean is ",
              '%.3f' % sizes.mean(), sep='\t')

In [None]:
# Save the water-color column name in a short variable rather than retyping it.
# The name contains embedded \r\n line breaks, so it must be reproduced exactly.
wc = 'Water color\r\n(1=Light Blue,2=Dark Blue,3=Light Green,\r\n4=Dark Green,5=Light Brown,6=Dark Brown)'

In [None]:
# Print the column name to see how its embedded line breaks render.
print(wc)

In [None]:
# Stack all measurements into one array per water-color code (v1 .. v6).
#
# For every expedition (row), each non-missing "Measurements (mm)" entry is a
# comma-separated string; it is split, converted to float, and added to the array
# for that expedition's water color.
#
# Changes from the earlier version of this cell:
#   * every "Measurements (mm)" entry in the row is visited, not a fixed 10;
#   * the row and its water color are looked up once per expedition;
#   * pieces are collected in lists and concatenated once, not np.append-ed repeatedly;
#   * the stray `next` statement (a no-op expression, not `continue`) is gone.
#
# NOTE(review): as before, any water-color value other than '1'..'5' -- including a
# missing value -- falls into the v6 bucket. Confirm that is intended.
color_codes = ('1', '2', '3', '4', '5', '6')
buckets = {code: [] for code in color_codes}

for row_pos in range(len(data2018)):
    expedition = data2018.iloc[row_pos]
    color = expedition[wc]
    target = buckets[color if color in buckets else '6']
    for cell in expedition["Measurements (mm)"].values:
        if pd.notnull(cell):
            target.append(np.array(cell.split(",")).astype(float))

# An empty bucket becomes an empty float array, matching np.array([]).
v1, v2, v3, v4, v5, v6 = (
    np.concatenate(buckets[code]) if buckets[code] else np.array([])
    for code in color_codes
)

In [None]:
# Combine the per-color arrays into one data frame.
# The arrays are concatenated in color order (v1 first, v6 last).
# 'WaterColor' starts out as all zeros and is filled in by a later cell.
stacked = np.concatenate([v1, v2, v3, v4, v5, v6])
measures = pd.DataFrame({
    'Measures': stacked,
    'WaterColor': np.zeros(len(stacked)),
})

In [None]:
# ind[k] is the number of stacked measurements for water color k+1.
ind = np.array([len(v1), len(v2), len(v3), len(v4),len(v5),len(v6)])

# Running totals of ind: cumind[k] is the row position just past the last
# measurement of color k+1 in the concatenated 'Measures' column.
cumind = np.cumsum(ind)

# Show both arrays. A bare `ind` in the middle of the cell was never displayed,
# because only the last expression of a cell is echoed.
ind, cumind

In [None]:
# Label every row of `measures` with its water-color code (1..6).
# The rows were concatenated in color order (v1, then v2, ...), and ind[k] is the number
# of rows for color k+1, so repeating each code ind[k] times gives the labels in one
# vectorized step instead of one .loc write per row.
# The cast to float keeps the dtype the column was created with (np.zeros).
measures['WaterColor'] = np.repeat(np.arange(1, len(ind) + 1), ind).astype(float)

In [None]:
# Preview the stacked measurements with their water-color labels.
measures.head(n=5)

In [None]:
# Some measurements are recorded as 99999 to flag a missing value.
# Keep only the rows whose 'Measures' value is not that sentinel.
# .copy() makes the result an independent DataFrame, so adding columns to it later
# does not raise pandas' SettingWithCopyWarning.
measures = measures[measures['Measures'] != 99999].copy()

In [None]:
# Mean of each numeric column, grouped by water color.
measures.groupby(by='WaterColor').mean()

In [None]:
# Number of measurements recorded for each water color.
measures.groupby(by='WaterColor').size()

In [None]:
# Histogram of all measurements.
# A Series plot draws on the current axes when a figure is already open, so give it
# its own figure/axes to avoid drawing over an earlier plot.
fig, ax = plt.subplots()
measures['Measures'].plot.hist(ax=ax)
ax.set(title='Distribution of measurements', xlabel='Measures');

In [None]:
# Bar chart of the per-water-color means.
measures.groupby('WaterColor').mean().plot(kind='bar')

In [None]:
# Bar chart of the number of measurements per water color.
# groupby(...).size() is a Series, and a Series plot reuses the current axes when a
# figure is open, so draw on a fresh figure instead of on top of the previous chart.
fig, ax = plt.subplots()
measures.groupby('WaterColor').size().plot.bar(ax=ax)
ax.set(title='Number of measurements per water color', ylabel='Count');

In [None]:
# In this example a second variable is made up to plot against the measurements:
# random normal fluctuations (mean 0, standard deviation 4) are added to each measurement.
# The noise vector is sized from the data rather than a hard-coded 627, so the cell
# still works when the number of rows changes.
measures['Measures + Noise'] = measures['Measures'] + np.random.normal(0, 4, len(measures))

In [None]:
# Scatter plot of the noisy copy (y) against the original measurements (x).
measures.plot(kind='scatter', x='Measures', y='Measures + Noise')

In [None]:
# Line plot of the measurements against the row index.
# Use a dedicated figure/axes so the Series plot does not land on an earlier figure.
fig, ax = plt.subplots()
measures['Measures'].plot(ax=ax)
ax.set(title='Measurements by row', xlabel='Row index', ylabel='Measures');

In [None]:
# Two independent normal samples: an example of (near) zero correlation.
# rn has shape (2, 100): row 0 is x, row 1 is y. The extra pair of brackets that made
# it (2, 1, 100) is removed so it matches the other correlation examples.
rn = np.array([np.random.normal(0,1,100), np.random.normal(0.5,0.8,100)])
# Draw on a fresh figure so the points are not added to a previously open plot.
fig, ax = plt.subplots()
ax.scatter(rn[0], rn[1])
ax.set_title('Zero Correlation')
plt.show()

In [None]:
# y is x plus a small amount of noise: an example of positive correlation.
w = np.random.normal(0,1,100)
rn = np.array([w,w+2*np.random.normal(0,0.1,100)])
# Draw on a fresh figure so the points are not added to a previously open plot.
fig, ax = plt.subplots()
ax.scatter(rn[0], rn[1])
ax.set_title('Positive Correlation')
plt.show()

In [None]:
# y is -x plus a noisy offset: an example of negative correlation.
w = np.random.normal(0,1,100)
rn = np.array([w, -w+3*np.random.normal(0.5,0.1,100)])
# Draw on a fresh figure so the points are not added to a previously open plot.
fig, ax = plt.subplots()
ax.scatter(rn[0], rn[1])
ax.set_title('Negative Correlation')
plt.show()

In [None]:
# Box plot of the measurements.
# Use a dedicated figure/axes so the Series plot does not land on an earlier figure.
fig, ax = plt.subplots()
measures['Measures'].plot.box(ax=ax)
ax.set_title('Measurements');

In [None]:
# Preview the frame, now including the 'Measures + Noise' column.
measures.head(n=5)

In [None]:
# 2x2 correlation matrix of the measurements and their noisy copy.
np.corrcoef(measures['Measures'], y=measures['Measures + Noise'])

In [None]:
# Pearson correlation between the measurements and their noisy copy.
# A p-value near zero indicates the correlation is statistically significant.
rho, pstat = stats.pearsonr(measures['Measures'], measures['Measures + Noise'])
print(' '.join(['rho = %.4f' % rho, 'p-val = %.4f' % pstat]))

In [None]:
# Two-sample t-test of the hypothesis that the two sample means are equal
# (statistically speaking): measurements versus their noisy copy.
tstat, pval = stats.ttest_ind(
    measures['Measures'],
    measures['Measures + Noise'],
)
print(' '.join(['t-stat = %.3f ' % tstat, 'p-value = %.3f' % pval]))

In [None]:
# Test whether the mean measurement is the same for water colors 1 and 5;
# a low p-value suggests the means differ.
# The groups come from `measures`, not the raw v1/v5 arrays: v1 and v5 still contain
# the 99999 missing-value sentinels that were filtered out of `measures`, and those
# would distort the means being compared.
color1 = measures.loc[measures['WaterColor'] == 1, 'Measures']
color5 = measures.loc[measures['WaterColor'] == 5, 'Measures']
tstat, pval = scipy.stats.ttest_ind(color1, color5)
print('t-stat = %.3f ' % tstat, 'p-value = %.3f' % pval)

In [None]:
# To test whether the means differ across multiple groups, use Analysis of Variance (ANOVA).
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Build one group per water color from `measures`, not from the raw v1..v6 arrays:
# the v* arrays still contain the 99999 missing-value sentinels that were filtered out of
# `measures`. groupby only yields colors that actually have observations, so the ANOVA
# and the Tukey comparison below use the same groups.
# NOTE(review): the earlier version passed v1, v3, v4, v5, v6 (no v2) -- confirm that
# color 2 simply has no observations rather than being excluded on purpose.
color_groups = [grp.values for _, grp in measures.groupby('WaterColor')['Measures']]
fstat, pval = stats.f_oneway(*color_groups)
print('ANOVA Results', 'fstat = %.3f' % fstat, 'p-value = %.3f' % pval, '\n', sep='\t')

# The null hypothesis for ANOVA is that all means are equal.
# If the test suggests rejecting the null, run a Tukey HSD comparison
# to see which pairs of means are different.
print(pairwise_tukeyhsd(measures['Measures'], measures['WaterColor']))

In [None]:
# Fixing the pH data issue for out-of-bounds data -- pH ranges from 0 to 14.
# Note: the original values arrive as a list of str coming out of the BOP data.
phs = pd.Series(data=['17.6', '8.2', '7.5'])
# Convert the text values to floating-point numbers.
phsf = phs.astype(np.float64)

In [None]:
# Replace out-of-range pH readings with nan so they are ignored by later statistics.
# Valid pH lies between 0 and 14, so values below 0 are masked as well as values above 14.
phsf[(phsf < 0.0) | (phsf > 14.0)] = np.nan
phsf

In [None]:
# Mean pH; nan entries are skipped.
phsf.mean(skipna=True)