# {{cookiecutter.project_name}}

{{cookiecutter.description}}

This notebook contains basic statistical analysis and visualization of the data.

## Data Sources
- summary : Processed file from notebook 1-Data_Prep

## Changes
- {% now 'utc', '%Y-%m-%d' %} : Started project

In [None]:
from imports import *

In [None]:
# Render matplotlib figures inline, in the output area of the cell that draws them.
%matplotlib inline

### File Locations

In [None]:
# Input/output locations, all resolved against the current working directory.
# `today` stamps both file names, so the processed input is expected to carry
# today's date in its name.
today = datetime.today()

INPUT_DIR = Path.cwd().joinpath('data', '01-input')
PROCESSED_DIR = Path.cwd().joinpath('data', '02-processed')
REPORT_DIR = Path.cwd().joinpath('reports')

INPUT_FILE = PROCESSED_DIR.joinpath(f'cleaned_{today:%Y-%m-%d}.feather')
REPORT_FILE = REPORT_DIR.joinpath(f'Excel_Report_{today:%Y-%m-%d}.xlsx')

In [None]:
# Load the processed dataset from the Feather file at INPUT_FILE.
df = pd.read_feather(path=INPUT_FILE)

### Libraries initialization

Delete or comment out the initialization cells for libraries you do not use.

In [None]:
# Pandas init
# Raise the display limits so long/wide tabular outputs are shown more fully.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# Matplotlib init
# Default figure size for all plots.
plt.rc('figure', figsize=[15, 7])

# TODO: Consider: Extract into settings/constants at the beginning of the file (or separate file).

In [None]:
# Cufflinks + plotly init
import cufflinks as cf
import plotly.offline as py

# Put cufflinks into offline mode, then set the same preference (plus default
# figure dimensions) through its config setter.
cf.go_offline()
cf.set_config_file(
    offline=True,
    world_readable=False,
    offline_show_link=False,
    offline_connected=False,
    # theme='pearl' # no effect?
    dimensions=(1024, 768),
)

# Fixed-size layout; handed explicitly to the plots that should use it.
cf_layout = cf.Layout(
    autosize=False,
    width=1024,
    height=600,
    margin={'l': 175, 'r': 50, 'b': 100, 't': 100, 'pad': 4},
    #paper_bgcolor='#7f7f7f',
    #plot_bgcolor='#c7c7c7'
)

In [None]:
# Seaborn init

import seaborn as sns

In [None]:
# TODO: consider Bokeh

### Perform Data Analysis

#### Stats

In [None]:
# Data preview: first ten rows of the loaded frame.
df.head(n=10)

In [None]:
# Data stats
# NOTE(review): `describe` is not defined in this notebook; presumably it is a
# helper exposed by `from imports import *` — confirm what it computes/returns.
describe(df)

In [None]:
# Count of distinct values in one column ('...' is presumably a placeholder to replace).
df['...'].nunique()

In [None]:
# The distinct values of one column ('...' is presumably a placeholder to replace).
df['...'].unique()

#### Visualizations

##### Histograms

In [None]:
# Static histogram of one column with 50 bins ('...' is presumably a placeholder).
# The trailing semicolon keeps the return-value repr out of the cell output,
# matching the other plotting cells in this notebook.
df['...'].hist(bins=50);

In [None]:
# Interactive histogram via cufflinks' `iplot`; fill in the '...' placeholders
# (column name, title, x-axis title) before running.
df['...'].iplot(
    kind="histogram",
    theme="white",
    title="...",
    xTitle='...',
    yTitle='Count',
)

##### Correlation heatmap

In [None]:
# Pairwise correlation over numeric and boolean columns only.
# pandas >= 2.0 raises on non-numeric columns instead of silently dropping
# them, so the frame is narrowed explicitly; this form also runs on older pandas.
# Timedelta columns are excluded by name because NumPy places timedelta64
# under its integer hierarchy.
corr = df.select_dtypes(include=['number', 'bool'], exclude=['timedelta']).corr()

In [None]:
# Interactive heatmap of the correlation matrix, drawn with the shared fixed-size layout.
corr.iplot(
    kind='heatmap',
    colorscale="Spectral",
    title="Feature Correlation Matrix",
    layout=cf_layout,
)

##### Pairplot, catplot

In [None]:
# Subset of columns used by the plots below; replace the '...' entries with real column names.
df_selected = df.loc[:, [
    # selected columns
    '...',
    '...',
    '...',
    '...',
]]

In [None]:
# Pairwise relationships between the selected columns, colored by `hue`.
# The trailing semicolon keeps the returned grid's repr out of the cell output,
# matching the other seaborn cells in this notebook.
sns.pairplot(
    df_selected,
    hue='...', # class (/target) variable
    height=2,
    aspect=1.5
);

##### Scatterplot with highlighted classes

In [None]:
# Plot `var2` against `var1`, colored by `class1`.
# NOTE(review): seaborn's catplot treats the x variable as categorical; if
# `var1` is numeric, a relational plot may be what is intended — confirm.
sns.catplot(
    data=df_selected,
    hue='class1',
    x="var1",
    y="var2",
    height=10,
);

##### Boxplot

Display the variation of the selected variable `var1` within the groups defined by the selected class `class1`, with groups ordered by their mean.

In [None]:
# Mean of `var1` within each `class1` group, sorted ascending; the resulting
# index supplies the category order for the boxplot.
# Fixes: the frame is `df_selected` (`dfs_selected` was never defined), and the
# mean is taken directly on the groupby rather than via a wide NaN-padded frame,
# which could also fail on duplicate index labels.
# The name `enfpoint_speed_average` is kept because the plotting cell reads it.
grouped = df_selected.groupby("class1")
enfpoint_speed_average = grouped["var1"].mean().sort_values(ascending=True)

In [None]:
# Boxplot of `var1` per `class1` group, with groups ordered by the index of
# `enfpoint_speed_average` (the sorted per-group means).
# Fix: plot from `df_selected`; `dfs_selected` was never defined in this notebook.
plt.figure(figsize=(20, 10))
sns.boxplot(
    data=df_selected,
    x="class1",
    y="var1",
    order=enfpoint_speed_average.index
);

### Save Excel file into reports directory

Save an Excel file with intermediate results into the reports directory.

In [None]:
# Open an Excel writer on the report path, using the xlsxwriter engine.
writer = pd.ExcelWriter(path=REPORT_FILE, engine='xlsxwriter')

In [None]:
# Write the full frame to a sheet named 'Report' through the open writer.
df.to_excel(excel_writer=writer, sheet_name='Report')

In [None]:
# Finalize the workbook and release the file handle.
# `ExcelWriter.save()` was removed in pandas 2.0; `close()` is available on
# both older and current pandas.
writer.close()