# Definíciók

## Importok

In [None]:
## to manipulate data
import pandas as pd
import numpy as np
import re

## for data analysis
#from collections import Counter
#import scipy.stats as st
#import statsmodels.api as sm
#from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import StandardScaler, MinMaxScaler
#from sklearn.decomposition import PCA
#from sklearn import linear_model
#from sklearn.cluster import DBSCAN, AgglomerativeClustering
#from sklearn import neighbors

## for visualisation
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from bokeh.io import output_file, output_notebook, show
from bokeh.models import BasicTicker, ColorBar, LinearColorMapper, ColumnDataSource, PrintfTickFormatter
from bokeh.plotting import figure
from bokeh.transform import transform

# IPython magic: render matplotlib figures inline in the notebook cell output.
%matplotlib inline
# Raise pandas' display limits so frames of up to 200 rows / 200 columns
# are printed in full instead of being truncated.
pd.options.display.max_rows = 200
pd.options.display.max_columns = 200

In [1]:
#from IPython.display import HTML

In [None]:
import geopandas as gpd
import os
import requests
from io import BytesIO
import zipfile

In [None]:
import unicodedata

def strip_accents(text):
    """
    Return ``text`` with diacritics removed and non-ASCII characters dropped.

    The string is decomposed (NFD) so that every accented letter becomes a
    base letter followed by combining marks; encoding to ASCII with
    ``'ignore'`` then discards the marks together with any character that
    has no ASCII representation.

    :param text: The string to clean.
    :type text: String.

    :returns: The ASCII-only version of the input.
    :rtype: String.

    Source: https://stackoverflow.com/a/31607735
    """
    try:
        # Python 2 compatibility: promote byte strings to unicode first.
        text = unicode(text, 'utf-8')
    except (TypeError, NameError):
        # On Python 3 the name `unicode` does not exist and str is already
        # unicode, so the input is used unchanged.
        pass
    decomposed = unicodedata.normalize('NFD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return str(ascii_bytes.decode("utf-8"))

## Elemzés

In [None]:
def colour_cm(df, matrixtitle="Korrelációs mátrix", pwidth=1, pheight=1):
    """Show a correlation matrix as an interactive Bokeh heatmap in the notebook.

    Each cell is drawn as a unit rectangle coloured by its value on a fixed
    -1..1 scale; hovering a cell shows its row label, column label and value.

    :param df: Correlation matrix (e.g. the result of ``DataFrame.corr()``)
        whose index and column labels are used as the categorical axes.
    :param matrixtitle: Title displayed above the plot.
    :param pwidth: Scale factor for the plot width (60 px per column at 1).
    :param pheight: Scale factor for the plot height (50 px per row at 1).
    :returns: None. The figure is displayed with ``show``.
    """
    # Size the canvas from the matrix shape: the x axis holds the columns and
    # the y axis the rows (both counts are equal for a square matrix).
    n_rows, n_cols = df.shape

    # Bokeh uses the bottom-left corner as origin, so the row order is
    # reversed to get the usual orientation (1 along the major diagonal).
    # Bokeh does not display NaN values, so they are replaced by 0.
    df = df.loc[::-1, :].fillna(0)

    # Reshape to long format: one record per (row label, column label, value).
    df.index.name = 'AllColumnsRow'
    df.columns.name = 'AllColumnsCol'
    df = df.stack().rename("value").reset_index()

    # Send the plot output to the notebook.
    output_notebook()

    # Build the palette from matplotlib's "jet" colormap (one hex colour per
    # colormap entry) and map values linearly from -1..1 onto it.
    colormap = mpl.cm.jet
    bokehpalette = [mpl.colors.rgb2hex(m) for m in colormap(np.arange(colormap.N))]
    mapper = LinearColorMapper(palette=bokehpalette, low=-1, high=1)

    # Hover tooltip fields (labels are user-facing and intentionally Hungarian).
    TOOLTIPS = [
        ("sor", "@AllColumnsRow"),
        ("oszlop", "@AllColumnsCol"),
        ("r", "@value"),
    ]

    TOOLS = "pan,wheel_zoom,box_zoom,save,reset"

    # `width`/`height` are used instead of `plot_width`/`plot_height`: the
    # latter were deprecated in Bokeh 2.4 and removed in Bokeh 3.0.
    p = figure(
        tools=TOOLS,
        width=int(pwidth * 60 * n_cols),
        height=int(pheight * 50 * n_rows),
        title=matrixtitle,
        x_range=list(df.AllColumnsCol.drop_duplicates()),
        y_range=list(df.AllColumnsRow.drop_duplicates()),
        toolbar_location="right",
        x_axis_location="below",
        tooltips=TOOLTIPS,
    )

    # Tilt the x labels so long column names do not overlap.
    p.xaxis.major_label_orientation = np.pi/3
    p.xaxis.major_label_text_font_size = '10pt'
    p.yaxis.major_label_text_font_size = '10pt'

    # One unit-sized rectangle per matrix cell, filled according to its value.
    p.rect(
        x="AllColumnsCol",
        y="AllColumnsRow",
        width=1,
        height=1,
        source=ColumnDataSource(df),
        line_color=None,
        fill_color=transform('value', mapper))

    # Colour scale legend on the right-hand side.
    color_bar = ColorBar(
        color_mapper=mapper,
        location=(0, 0),
        ticker=BasicTicker(desired_num_ticks=10))

    p.add_layout(color_bar, 'right')

    show(p)

In [None]:
def corr_sorted(corr):
    """Return the unique pairwise correlations sorted from highest to lowest.

    A correlation matrix is symmetric, so only the upper triangle without
    the diagonal (``k=1``) is kept; every variable pair appears exactly once.

    :param corr: Square correlation DataFrame (e.g. from ``DataFrame.corr()``).
    :returns: Series indexed by ``(row label, column label)`` holding the
        correlation of each pair, sorted in descending order.
    """
    # Builtin `bool` replaces the `np.bool` alias, which was removed in
    # NumPy 1.24 and raises AttributeError there.
    upper_triangle = np.triu(np.ones(corr.shape, dtype=bool), k=1)
    pairs = corr.where(upper_triangle).stack()
    # Drop the masked-out cells explicitly so the result does not depend on
    # whether `stack` discards NaN entries by itself.
    return pairs.dropna().sort_values(ascending=False)
                 
def corr_sorted_val(corr, threshold=0.25, col="r"):
    """Keep only the variable pairs whose correlation is stronger than a threshold.

    :param corr: Correlation DataFrame, passed on to ``corr_sorted``.
    :param threshold: Pairs are kept when the absolute value of their
        correlation is strictly greater than this number.
    :param col: Name of the single value column in the returned frame.
    :returns: DataFrame with one row per retained pair, in the order
        produced by ``corr_sorted``.
    """
    ranked = corr_sorted(corr)
    is_strong = ranked.abs() > threshold
    # Mask the weak pairs to NaN, then discard them.
    strong_only = ranked.where(is_strong).dropna()
    return pd.DataFrame(strong_only, columns=[col])


In [None]:
def two_sample_ztest(s1, s2):
    """Run a two-sided two-sample z-test on the difference of two means.

    The statistic is ``(mean1 - mean2) / sqrt(std1**2 / n1 + std2**2 / n2)``
    and the p-value is the two-tailed probability under the standard normal
    distribution. A warning is printed when either sample has fewer than
    30 observations.

    :param s1: First sample; must provide ``mean()``, ``std()`` and ``len()``
        (e.g. a pandas Series).
    :param s2: Second sample, same requirements as ``s1``.
    :returns: Tuple ``(z, p)`` with the z statistic and the two-sided p-value.

    NOTE(review): sample sizes come from ``len()``, which also counts missing
    values, whereas pandas ``mean()``/``std()`` skip them by default -- drop
    NaN entries before calling if the samples may contain any.
    """
    # Imported locally so the function works without a module-level `st`
    # alias for scipy.stats being defined.
    from scipy import stats as st

    n1, n2 = len(s1), len(s2)
    if n1 < 30 or n2 < 30:
        print('Caution, data smaller than 30! Check normality or use t-test.')

    standard_error = np.sqrt(s1.std() ** 2 / n1 + s2.std() ** 2 / n2)
    z = (s1.mean() - s2.mean()) / standard_error

    # Two-sided p-value by symmetry of the normal distribution. The survival
    # function avoids the cancellation in `1 - cdf(z)`, which collapses to
    # exactly 0.0 for large positive z.
    p = 2 * st.norm.sf(abs(z))
    return (z, p)

