# {{cookiecutter.project_name}}

{{cookiecutter.description}}

This notebook contains basic statistical analysis and visualization of the data.

## Data Sources
- summary : Processed file from notebook 2-Feature_engineering

## Changes
- {% now 'utc', '%Y-%m-%d' %} : Started project

In [None]:
from imports import *

In [None]:
%matplotlib inline

### File Locations

In [None]:
# Resolve all input/output locations relative to the current working directory.
today = datetime.today()

INPUT_DIR = Path.cwd().joinpath('data', '01-input')
PROCESSED_DIR = Path.cwd().joinpath('data', '02-processed')
REPORT_DIR = Path.cwd().joinpath('reports')

# Both file names carry today's date stamp (YYYY-MM-DD).
INPUT_FILE = PROCESSED_DIR.joinpath(f'enriched_{today:%Y-%m-%d}.feather')
REPORT_FILE = REPORT_DIR.joinpath(f'Excel_Report_{today:%Y-%m-%d}.xlsx')

In [None]:
%%time
# Workaround against a %%time bug that changes variable scope between consecutive cells. See:
# https://stackoverflow.com/questions/55341134/variable-scope-is-changed-in-consecutive-cells-using-time-in-jupyter-notebook
global df

# Load the data frame from INPUT_FILE (feather format).
df = pd.read_feather(INPUT_FILE)

### Libraries initialization

Delete or comment out unused libraries.

In [None]:
# Pandas init
# Tabular outputs may show up to 500 rows / 500 columns with a display width of 1000 characters.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# Matplotlib init
# Default figure size (width, height) for all plots.
plt.rc('figure', figsize=[15, 7])

# TODO: Consider: Extract into settings/constants at the beginning of the file (or separate file).

In [None]:
# Cufflinks + plotly init
import cufflinks as cf
import plotly.offline as py

# Switch cufflinks to offline mode and set its configuration.
cf.go_offline()
cf.set_config_file(
    offline=True,
    world_readable=False,
    offline_show_link=False,
    offline_connected=False,
    # theme='pearl' # no effect?
    dimensions=(1024, 768),
)

# Layout shared by the interactive plots below: fixed size with a wide left margin.
cf_layout = cf.Layout(
    autosize=False,
    width=1024,
    height=600,
    margin={'l': 175, 'r': 50, 'b': 100, 't': 100, 'pad': 4},
    #paper_bgcolor='#7f7f7f',
    #plot_bgcolor='#c7c7c7'
)

In [None]:
# Seaborn init

import seaborn as sns
# Applies to every matplotlib figure drawn after this cell.
sns.set() # switch to seaborn defaults

In [None]:
# TODO: consider Bokeh

### Perform Data Analysis

Define sample size, target variable and its possible values.<br/>
Sample is used for approximate analysis in time-consuming operations.

In [None]:
sample_size = 100000
# DataFrame.sample raises a ValueError when asked for more rows than the frame holds
# (sampling is without replacement), so the request is capped at len(df).
# The fixed random_state makes the sample, and every plot derived from it,
# reproducible across "Restart & Run All".
df_sample = df.sample(n=min(sample_size, len(df)), random_state=42)

target_variable = 'filedDispute'
# Careful with following line if the target_variable is continuous:
# unique() then returns (nearly) one value per sampled row.
target_values = df_sample[target_variable].unique()

#### Stats

In [None]:
# Preview the first ten rows of the data set
df.head(n=10)

In [None]:
# Data stats
# `describe` is a helper defined outside this notebook (presumably provided by `imports` - TODO confirm).
describe(df)

In [None]:
# Missing values per column; only columns with at least one gap are listed, worst first

null_counts = df.isna().sum()
null_counts.loc[null_counts.gt(0)].sort_values(ascending=False)

In [None]:
# Number of distinct values in the column ('xxx' looks like a template placeholder - replace with a real column name).
df['xxx'].nunique()

In [None]:
# The distinct values themselves ('xxx' looks like a template placeholder - replace with a real column name).
df['xxx'].unique()

In [None]:
# Frequency of each distinct value ('xxx' looks like a template placeholder - replace with a real column name).
df['xxx'].value_counts()

#### Visualizations

##### Univariate

Histograms

In [None]:
%%time
# `histograms` is a helper defined outside this notebook; presumably it draws a histogram per column - TODO confirm.
histograms(df)

In [None]:
# Histogram of a single column with 50 bins ('xxx' looks like a template placeholder column name).
df['xxx'].hist(bins=50)

In [None]:
# Histogram for a bool column
# The values are cast to int64 first and then drawn with two bins.
df['bool_column'].astype('int64').hist(bins=2)

In [None]:
# Interactive histogram, drawn from the sample
df_sample['xxx'].iplot(
    kind='histogram',
    theme='white',
    title='xxx',
    xTitle='xxx',
    yTitle='Count',
)

Density plots

In [None]:
# Simple density plot
# `density_plot` is a helper defined outside this notebook; 'xxx' looks like a placeholder column name.
density_plot(df, 'xxx', bins=24)

In [None]:
# Density plot with highlighted classes
# Passes target_variable as the third argument; presumably the helper splits the plot by its values - TODO confirm.
density_plot(df, 'xxx', target_variable, bins=24)

In [None]:
# Same plot with XKCD styling
# plt.xkcd() is used as a context manager, so only the call inside the block is drawn under it.
with plt.xkcd():
    density_plot(df, 'xxx', target_variable, bins=24)

##### Correlation heatmap

In [None]:
# Correlate numeric and boolean columns only. pandas >= 2.0 no longer drops
# non-numeric columns silently in DataFrame.corr() and raises on them instead;
# older versions ignored such columns, so the result should be the same there.
corr = df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Interactive heatmap of the full correlation matrix, using the shared cufflinks layout.
corr.iplot(
    kind='heatmap',
    colorscale='Spectral',
    title='Feature Correlation Matrix',
    layout=cf_layout,
)

Alternative heatmap drawn by seaborn.

In [None]:
%%time
# A single large axes for the annotated heatmap.
fig, ax = plt.subplots(figsize=(25, 17))

hm = sns.heatmap(
    corr,
    ax=ax,            # draw into the axes created above rather than the currently-active one
    cmap='coolwarm',  # colour map
    #square=True,     # would set the axes aspect to "equal" so each cell is square
    annot=True,       # write the value into every cell ...
    fmt='.2f',        # ... formatted with two decimals
    #annot_kws={"size": 14},
    linewidths=.05,
)

# Leave room at the top for the figure title.
fig.subplots_adjust(top=0.93)
fig.suptitle('Correlation Heatmap', fontsize=12, fontweight='bold')

In [None]:
# Keep only the target_variable column of the correlation matrix
corr_target_df = corr[[target_variable]]

# Flat heatmap of that single column; large left/bottom margins leave room for the labels.
corr_target_df.iplot(
    kind='heatmap',
    colorscale='Spectral',
    title='Feature Correlation Matrix',
    layout=cf.Layout(
        autosize=False,
        width=1024,
        height=300,
        margin={'l': 250, 'r': 50, 'b': 250, 't': 10, 'pad': 4},
    ),
)

In [None]:
# Display correlations with target_variable numerically, in ascending order.

corr_target_df.sort_values(by=target_variable)

In [None]:
# Keep the features whose absolute correlation with target_variable exceeds the threshold
corr_threshold = 0.02

corr_target = corr[target_variable]
corr_over_thr = corr_target.loc[corr_target.abs().gt(corr_threshold)]
corr_over_thr

##### Pairplot, catplot

Use manually selected columns...

In [None]:
# Manual column selection; the 'xxx' entries look like template placeholders to be replaced with real column names.
# NOTE(review): the pairplot below uses target_variable as hue, so it presumably has to be part of this list - confirm.
selected_columns = [
    'xxx', 
    'xxx', 
    'xxx', 
    'xxx'
]

... or those with strong enough correlation:

In [None]:
# Column names taken from the index of the above-threshold correlations.
selected_columns = corr_over_thr.index.tolist()

In [None]:
# Narrow the full data frame down to the selected columns.
df_selected = df[selected_columns]

In [None]:
%%time
# Pairwise scatter plots of the selected columns, coloured by target_variable, with KDE curves on the diagonal.
# NOTE(review): newer seaborn releases deprecate the KDE `shade` option in favour of `fill` - confirm against the installed version.
sns.pairplot(
    data=df_selected,
    hue=target_variable,
    diag_kind='kde',
    diag_kws=dict(shade=True),
    plot_kws=dict(alpha=0.4, edgecolor='k', linewidth=0.5),
    height=2,
    aspect=1.5,
)

Zoom in on the diagonal:

In [None]:
# One wide density plot per selected column, passing target_variable to the helper as the class column.
for var in selected_columns:
    # The target itself is not plotted against itself.
    if var != target_variable:
        # num=1 addresses figure number 1 on every iteration.
        plt.figure(num=1, figsize=(18, 3))
        density_plot(df_selected, var, target_variable, bins=50)
    # Called on every iteration, including the one that skips the target column.
    plt.show()

##### Scatterplot with highlighted classes

In [None]:
# Columns for the x and y axes of the scatter plots below ('var1'/'var2' look like template placeholders).
var1 = 'var1'
var2 = 'var2'

In [None]:
# Scatter plot of var1 vs. var2 on the sample; target_variable is handed to the helper as the class column.
scatter_colored(df_sample, var1, var2, target_variable)

In [None]:
# Seaborn variant of the scatter plot: colour and marker size both follow target_variable.
sns.relplot(
    data=df_sample,
    x=var1,
    y=var2,
    hue=target_variable,
    size=target_variable,
    sizes={False: 15, True: 100},  # marker size per target value
    alpha=0.3,
    height=10,
    aspect=1,
    #col='group_by'
)

##### Boxplot

Display the variation of the selected variable `var2` within groups defined by the class variable `var1`, sorted by mean value.

In [None]:
# var1 is the grouping column (x axis), var2 the column whose distribution is shown (y axis).
# Both values look like template placeholders to be replaced with real column names.
var1 = 'class_var1'
var2 = 'var2'

In [None]:
# Split the sample by the class column.
grouped = df_sample.groupby(var1)
# One column of var2 values per group; the per-column means, sorted ascending, define the plotting order.
# NOTE(review): `grouped[var2].mean()` looks like a lighter way to get the same means - confirm before switching.
var2_averages = (
    pd.DataFrame({group_key: group_rows[var2] for group_key, group_rows in grouped})
    .mean()
    .sort_values(ascending=True)
)

In [None]:
plt.figure(figsize=(20, 10))
# Plot the sample that `grouped` / `var2_averages` were computed from.
# The previously referenced `dfs_selected` is not defined in the visible notebook and raised a NameError.
sns.boxplot(
    data=df_sample,
    x=var1,
    y=var2,
    order=var2_averages.index  # groups ordered by their mean of var2
);

Same information displayed by violin plot. Provides more detailed insight into distribution of the second variable.

In [None]:
# Violin plot of var2 per var1 group, groups ordered by their mean of var2.
plt.figure(figsize=(20, 10))
sns.violinplot(x=var1, y=var2, data=df_sample, order=var2_averages.index);

Boxplot with boxes split by target variable. 

In [None]:
# Box plot of var2 per var1 group with one box per target value, groups ordered by their mean of var2.
plt.figure(figsize=(20, 10))
sns.boxplot(
    x=var1,
    y=var2,
    hue=target_variable,
    data=df_sample,
    order=var2_averages.index,
);

Violin plot with violins split by target variable. Gives separate distributions for different values of the target variable. Presumably works only for a two-valued target variable.

In [None]:
plt.figure(figsize=(20, 10))
# NOTE(review): the 'initialAmount < 50' filter is hard-coded - confirm the column exists in the data set at hand.
sns.violinplot(
    x=var1,
    y=var2,
    hue=target_variable,
    split=True,  # draw the target values as two halves of the same violin
    data=df_sample.query('initialAmount < 50'),
    order=var2_averages.index,
);

### Save Excel file into reports directory

Save an Excel file with intermediate results into the report directory

In [None]:
# Create the report directory if it is missing; otherwise the workbook cannot be written to REPORT_FILE.
REPORT_FILE.parent.mkdir(parents=True, exist_ok=True)
# Open an Excel writer backed by the xlsxwriter engine.
writer = pd.ExcelWriter(REPORT_FILE, engine='xlsxwriter')

In [None]:
# Write the full data frame to the sheet named 'Report' of the open workbook.
df.to_excel(writer, sheet_name='Report')

In [None]:
# Write the workbook to disk and release the file handle.
# ExcelWriter.save() was deprecated in pandas 1.5 and removed in 2.0; close() works on old and new versions alike.
writer.close()

## THE END

---
<br/><br/><br/><br/><br/><br/><br/><br/><br/><br/><br/><br/><br/><br/><br/>

In [None]:
# Deliberate stop: halts "Run All" here so the scratch cells below are not executed.
raise Exception("THE END")

Place unused visualizations and experiments below this line.

---