# Perform statistical analyses

# Document

<table align="left">
    <tr>
        <th style="text-align:left">Title</th>
        <td style="text-align:left">Perform statistical analyses</td>
    </tr>
    <tr>
        <th style="text-align:left">Last modified</th>
        <td style="text-align:left">2019-01-09</td>
    </tr>
    <tr>
        <th style="text-align:left">Author</th>
        <td style="text-align:left">Gilles Pilon &lt;gillespilon13@gmail.com&gt;</td>
    </tr>
    <tr>
        <th style="text-align:left">Status</th>
        <td style="text-align:left">Active</td>
    </tr>
    <tr>
        <th style="text-align:left">Type</th>
        <td style="text-align:left">Jupyter notebook</td>
    </tr>
    <tr>
        <th style="text-align:left">Created</th>
        <td style="text-align:left">2018-12-21</td>
    </tr>
    <tr>
        <th style="text-align:left">File name</th>
        <td style="text-align:left">N/A</td>
    </tr>
    <tr>
        <th style="text-align:left">Other files required</th>
        <td style="text-align:left">N/A</td>
    </tr>
</table>

# Ideas

- Built-in statistics.
- Parametric statistics.
- Non-parametric statistics.
- Simple linear regression.

In [None]:
import pandas as pd

In [None]:
# Load the raw (not yet munged) csv file into a dataframe.
# The 'Time' column becomes the index; parse_dates=True asks pandas to
# parse that index as dates.
FILE_TO_READ = 'thirteen_weeks.csv'
df = pd.read_csv(
    FILE_TO_READ,
    index_col='Time',
    parse_dates=True,
)

In [None]:
# List the dtype pandas assigned to each column when reading the file.
df.dtypes

In [None]:
# Summary statistics for a single numeric column.
# The column is selected by its exact header text, units included.
df['Water Load (lb/MSF)'].describe()

In [None]:
# Summary statistics for a single object column.
# For object data describe() reports count, unique, top and freq
# instead of the numeric statistics.
df['Trim Board Density (lb/cft)'].describe()

In [None]:
# Summary statistics for all columns in the dataframe.
# With the default arguments only numeric fields are returned.
df.describe()

In [None]:
# Describe all columns regardless of data type.
# Statistics that do not apply to a column's type are reported as NaN.
df.describe(include='all')

In [None]:
# Exclude all strings in any object column.
# The `np.object` alias was deprecated in NumPy 1.20 and removed in
# NumPy 1.24, where accessing it raises AttributeError. It was only ever an
# alias for the builtin `object`, so passing `object` selects the same dtype.
import numpy as np  # no longer needed by this cell; kept in case later cells use `np`
df.describe(exclude=[object])

In [None]:
# Load the munged csv file, replacing the raw dataframe held in `df`.
# As before, 'Time' becomes the index and pandas is asked to parse it
# as dates.
FILE_TO_READ = 'thirteen_weeks_munged.csv'
df = pd.read_csv(
    FILE_TO_READ,
    index_col='Time',
    parse_dates=True,
)

In [None]:
# This results in the same as the exclude object code above.
# Raise the display limit to 500 columns first so that the summary is not
# truncated when shown.
pd.set_option('display.max_columns', 500)
df.describe()

In [None]:
# Find specific parametric statistics.
# Average (arithmetic mean) of each column.
df.mean()

In [None]:
# Standard deviation of each column.
# pandas computes the sample standard deviation (ddof=1) by default.
df.std()

In [None]:
# Range (maximum minus minimum) of each column.
df.max() - df.min()

In [None]:
# Find specific nonparametric statistics.
# Median of each column.
df.median()

In [None]:
# The 0.5 quantile of each column is its median, an alternative to
# calling df.median().
df.quantile(.5) # median

In [None]:
# Find the interquartile range of each column:
# the 0.75 quantile (third quartile) minus the 0.25 quantile (first quartile).
df.quantile(0.75) - df.quantile(0.25)

In [None]:
# Compare the interquartile range to the standard deviation.
# Both are measures of spread; print() writes the per-column IQR first and
# the per-column standard deviation after it.
print(df.quantile(0.75) - df.quantile(0.25), df.std())

In [None]:
# datasense is a third-party package, imported here at its first use.
# NOTE(review): parametric_summary presumably returns the parametric
# statistics of the series it is given -- confirm against the datasense docs.
import datasense as ds
ds.parametric_summary(df['Water Load (lb/MSF)'])

In [None]:
# Print the datasense parametric summary of every column in turn:
# the column name, a line break, then that column's summary.
for column_name, column_values in df.items():
    summary = ds.parametric_summary(column_values)
    print(column_name, '\n', summary)

In [None]:
# Print the datasense nonparametric summary of every column in turn:
# the column name, a line break, then that column's summary.
for column_name, column_values in df.items():
    summary = ds.nonparametric_summary(column_values)
    print(column_name, '\n', summary)