In [1]:
# Number of classes (histogram bins) according to Sturges' formula
import math

import numpy as np
import pandas as pd


def sturges_formula(series: pd.Series) -> int:
    """
    Number of histogram classes suggested by Sturges' formula.

    Evaluates ``round(1 + log2(n))`` where ``n`` is ``len(series)``; only the
    length of the input is used, never its values.

    :param series: observations that are going to be binned.
    :return: suggested number of classes.
    :raises ValueError: if ``series`` is empty (``log2(0)`` is undefined).
    """
    n = len(series)
    if n == 0:
        # math.log2(0) would raise an opaque "math domain error"; be explicit.
        raise ValueError("sturges_formula requires a non-empty series")
    return round(1 + math.log2(n))

In [None]:
def all_freq(x):
    """
    Build a frequency table for the values of ``x``.

    Missing values are counted as their own category (``dropna=False``) and the
    rows are ordered by value. Relative figures are percentages rounded to one
    decimal place; the cumulative percentage is accumulated before rounding.

    :param x: object exposing ``value_counts`` (e.g. a pandas Series).
    :return: DataFrame with the columns 'abs freq', 'rel freq:', 'abs cum freq'
             and 'rel cum freq'.
    """
    counts = x.value_counts(dropna=False).sort_index()
    shares = x.value_counts(dropna=False, normalize=True).sort_index()
    table = pd.DataFrame({
        'abs freq': counts,
        'rel freq:': (shares * 100).round(1),
        'abs cum freq': counts.cumsum(),
        'rel cum freq': (shares.cumsum() * 100).round(1),
    })
    return table

In [None]:
def stats(x):
    """
    Summarise ``x`` by its mode, mean and median.

    The frame has one row per modal value; the scalar mean and median are
    repeated on every row.

    :param x: object exposing ``mode``, ``mean`` and ``median`` (e.g. a pandas Series).
    :return: DataFrame with the columns 'mode', 'mean' and 'median'.
    """
    summary = pd.DataFrame()
    # The mode column goes in first so that it defines the row index.
    summary['mode'] = x.mode()
    for label, value in (('mean', x.mean()), ('median', x.median())):
        summary[label] = value
    return summary

In [None]:
def _iqr_fences(data, factor):
    """Return the ``(low, high)`` fences ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``."""
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    spread = factor * (q3 - q1)
    return q1 - spread, q3 + spread


def get_outliers(data):
    """
    Select the values lying outside the 1.5 * IQR fences.

    Missing values are not reported: the comparisons below are False for NaN,
    whereas negating ``between`` would have flagged every NaN as an outlier.

    :param data: numeric pandas Series.
    :return: the entries of ``data`` below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``.
    """
    low, high = _iqr_fences(data, 1.5)
    return data[(data < low) | (data > high)]

def remove_outliers(data):
    """
    Keep only the values lying within the 1.5 * IQR fences (bounds inclusive).

    Missing values are dropped as well, because ``between`` is False for NaN.

    :param data: numeric pandas Series.
    :return: the entries of ``data`` inside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``.
    """
    low, high = _iqr_fences(data, 1.5)
    return data[data.between(low, high)]

def get_extreme_outliers(data):
    """
    Select the values lying outside the 3 * IQR fences.

    Missing values are not reported (see ``get_outliers``).

    :param data: numeric pandas Series.
    :return: the entries of ``data`` below ``Q1 - 3 * IQR`` or above ``Q3 + 3 * IQR``.
    """
    low, high = _iqr_fences(data, 3)
    return data[(data < low) | (data > high)]

In [None]:
def calc_vector_avg(array: np.array) -> float:
    """
    Weighted mean of a value/frequency table.

    :param array: indexable whose item 0 holds the values and item 1 the matching
                  frequencies; the two are multiplied element-wise.
    :return: ``sum(value * frequency) / sum(frequency)`` rounded to 4 decimals.
    """
    values, frequencies = array[0], array[1]
    weighted_sum = (values * frequencies).sum()
    average = weighted_sum / frequencies.sum()
    return average.round(4)

In [None]:
def np_stats(array: np.ndarray):
    """
    Mean, mode and median of a value/frequency table.

    :param array: indexable whose item 0 holds the values and item 1 the matching
                  frequencies. The values do not have to be sorted.
    :return: dict with the keys
             'mean'   - weighted mean rounded to 4 decimals,
             'mode'   - list of the value(s) with the highest frequency,
             'median' - the middle observation when the total count is odd,
                        otherwise a list holding the two middle observations.
    :raises ValueError: if the table describes no observations at all.
    """
    values = np.asarray(array[0])
    counts = np.asarray(array[1])

    # Expand the table into the sorted list of individual observations.
    # Fractional frequencies are rounded up and negative ones contribute nothing,
    # which is what a "repeat while i < f" loop does.
    repeats = np.maximum(np.ceil(counts), 0).astype(int)
    order = np.argsort(values, kind='stable')
    observations = np.repeat(values[order], repeats[order])
    n = observations.size
    if n == 0:
        raise ValueError("np_stats requires at least one observation")

    mean = ((values * counts).sum() / counts.sum()).round(4)
    mode = values[counts == counts.max()]

    # Odd or even is a property of the total count n, not of n / 2.
    middle = n // 2
    if n % 2 == 1:
        median = observations[middle]
    else:
        median = [observations[middle - 1], observations[middle]]

    return {'mean': mean, 'mode': mode.tolist(), 'median': median}

In [None]:
def outlier_boundaries(series: pd.Series, extreme: bool = False) -> list:
    """
    Lower and upper outlier fences based on the interquartile range.

    :param series: numeric pandas Series.
    :param extreme: use 3 * IQR (extreme outliers) instead of the default 1.5 * IQR.
    :return: ``[Q1 - k * IQR, Q3 + k * IQR]`` with ``k`` being 3 or 1.5.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    factor = 3 if extreme else 1.5
    margin = (q3 - q1) * factor
    return [q1 - margin, q3 + margin]

In [None]:
def extend_array(array: np.array):
    """
    Expand a value/frequency table into a flat list of observations.

    :param array: indexable whose item 0 holds the values and item 1 the matching
                  frequencies.
    :return: list in which every value is repeated once for each integer step
             counted up from 0 while the step is still below its frequency.
    """
    def copies(value, frequency):
        # Count up from zero so that non-integer frequencies are accepted too.
        emitted = 0
        while emitted < frequency:
            yield value
            emitted += 1

    return [
        item
        for value, frequency in zip(array[0], array[1])
        for item in copies(value, frequency)
    ]

In [None]:
def get_z_score(df: pd.DataFrame):
    """
    Standardise ``df`` by subtracting its mean and dividing by its standard deviation.

    Both statistics come from ``df.mean()`` and ``df.std()`` respectively.

    :param df: data to standardise.
    :return: ``(df - df.mean()) / df.std()``.
    """
    centered = df - df.mean()
    spread = df.std()
    return centered / spread

In [None]:
def mean_absolute_deviation(x):
    """
    Mean of the absolute deviations of ``x`` from its own mean.

    pandas used to offer this as ``.mad()``, but that method was deprecated in
    pandas 1.5 and removed in 2.0.

    :param x: numeric data exposing ``mean`` (e.g. a pandas Series).
    :return: ``mean(|x - mean(x)|)``.
    """
    center = x.mean()
    deviations = abs(x - center)
    return deviations.mean()

In [None]:
def plot_regressionline(reg_result, min, max, linecol="red", errorcol="#FFFF0080"):
    """
    Draw a fitted regression curve with a +/- one standard error band.

    :param reg_result: object exposing ``se`` (standard error) and ``predict``
                       (callable mapping x values to fitted y values).
    :param min: start of the plotted x range.
    :param max: end of the plotted x range (not included in the grid).
    :param linecol: colour of the regression curve.
    :param errorcol: colour of the error band.
    """
    band = reg_result.se
    model = reg_result.predict
    step = (max - min) / 100
    grid = np.arange(min, max, step)
    fitted = model(grid)
    # The band is drawn first, the curve afterwards.
    plt.fill_between(grid, fitted - band, fitted + band, color=errorcol)
    plt.plot(grid, fitted, color=linecol)

In [None]:
def general_regression(x, y, degree=1, exp=False):
    """
    Fit a polynomial (or exponential) regression and report its quality.

    With ``exp=True`` the polynomial is fitted to ``log(y)`` and predictions are
    mapped back with ``exp``; errors are always measured on the original y scale.

    :param x: explanatory values (sequence, array or Series).
    :param y: response values, same length as ``x``.
    :param degree: degree of the fitted polynomial.
    :param exp: fit an exponential model instead of a plain polynomial.
    :return: pandas Series (object dtype) indexed by
             'se'      - root mean squared error of the fit,
             'R2'      - coefficient of determination, ``1 - MSE / Var(y)``,
             'predict' - callable mapping x values to fitted y values,
             'a', 'b', ... - polynomial coefficients in increasing order of power
             ('a' is the constant term).
    """
    x_values = np.asarray(x, dtype=float)
    y_values = np.asarray(y, dtype=float)

    if exp:
        forward, inverse = np.exp, np.log
    else:
        forward = inverse = lambda values: values

    model = np.polyfit(x_values, inverse(y_values), degree)
    line = np.poly1d(model)

    def predict(new_x):
        return forward(line(new_x))

    residuals = predict(x_values) - y_values
    mse = float(np.mean(residuals ** 2))
    se = float(np.sqrt(mse))

    # The MSE divides by n, so the reference variance must too (ddof=0).
    # Using the sample variance here would give R2 = 1/n for a mean-only model.
    variance = float(np.var(y_values))
    r2 = 1 - mse / variance if variance > 0 else float('nan')

    # np.polyfit returns the highest power first; report the constant term as 'a'.
    coefficients = [float(c) for c in model[::-1]]
    labels = [chr(ord('a') + i) for i in range(len(coefficients))]

    return pd.Series(
        [se, r2, predict] + coefficients,
        index=['se', 'R2', 'predict'] + labels,
        dtype=object,
    )

## REGRESSIONS

In [None]:
def linear_regression(x: pd.Series, y: pd.Series) -> list:
    """
    Fit a straight line through ``(x, y)`` and sample it for plotting.

    :param x: explanatory values.
    :param y: response values.
    :return: ``[xx, yy, se]`` where ``xx`` is a grid from ``x.min()`` up to (but not
             including) ``x.max()`` in steps of one hundredth of the range, ``yy`` the
             fitted line on that grid and ``se`` the root mean squared error of the
             fit on the original points.
    """
    fitted_line = np.poly1d(np.polyfit(x, y, 1))

    lowest, highest = x.min(), x.max()
    grid = np.arange(lowest, highest, (highest - lowest) / 100)
    grid_fit = fitted_line(grid)

    residuals = fitted_line(x) - y
    rmse = math.sqrt((residuals ** 2).mean())
    return [grid, grid_fit, rmse]

In [None]:
def draw_linear_regression_scatter_plot(x: pd.Series, y: pd.Series, x_label: str, y_label: str, title: str) -> None:
    """
    Show a scatter plot of ``(x, y)`` with its linear regression line.

    The line is surrounded by a band of two standard errors on either side.

    :param x: explanatory values.
    :param y: response values.
    :param x_label: label of the x axis.
    :param y_label: label of the y axis.
    :param title: title of the figure.
    """
    grid, fitted, std_err = linear_regression(x, y)
    band = 2 * std_err

    # Artists are created in the order points, line, band.
    plt.scatter(x, y, alpha=0.2)
    plt.plot(grid, fitted, color='red')
    plt.fill_between(grid, fitted - band, fitted + band, color='#FFFF0020')

    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()

In [None]:
def draw_boxplot(df: pd.DataFrame) -> None:
    """
    Draw a boxplot of the z-score normalised columns of a DataFrame.

    Normalising puts all columns on a common scale so they can be compared in one
    plot. DataFrame columns passed into this function need to be numerical dtypes.

    :param df: DataFrame or part of one to be displayed in a boxplot.
    :return: whatever ``plt.show()`` returns; the figure is displayed as a side effect.
    """
    # Imported locally and by name: this notebook defines its own ``stats``
    # function, so a module-level ``stats.zscore`` would hit that function instead
    # of scipy.stats.
    from scipy.stats import zscore

    normalized: pd.DataFrame = pd.DataFrame()
    plt.figure()
    for column in df.columns.tolist():
        normalized[column] = zscore(df[column])
    normalized.boxplot(normalized.columns.tolist())
    return plt.show()