# General EDA 

In [None]:
# Imports
%matplotlib inline
from ast import literal_eval
from collections import OrderedDict
from scipy.stats import zscore
from typing import List
import csv
import datetime
import math
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objs as go
import numpy as np
import pandas as pd
import pymongo
import robustats
import seaborn as sns
import wquantiles as wq
import pprint
from scipy.stats import chi2, mstats, mannwhitneyu, kruskal, chi2_contingency
from statsmodels.stats.multitest import multipletests
from cliffsDelta import cliffsDelta
pp = pprint.PrettyPrinter(indent=4)

# Jupyter display configuration: never truncate columns, rows or cell text.
# The numeric `max_columns` limits that used to be set here were immediately
# overridden by the `None` setting, so only the effective values are kept.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# `None` means "no limit"; the legacy value -1 is rejected by current pandas.
pd.set_option('display.max_colwidth', None)

In [None]:
# Set palette

# Five colours taken from the "Greys_r" palette, registered as the seaborn default.
seq_col_brew = sns.color_palette("Greys_r", 5)
sns.set_palette(seq_col_brew)
seq_col_brew
# In alphabetical order: [1] for br, [2] for co, [3] for de, [4] for us

In [None]:
# Load the full dataset from a pipe-delimited, UTF-8 encoded CSV file;
# the `_id` column is read through `str`.
all_apps = pd.read_csv(
   'data_LOCF_fixed.csv', delimiter="|", encoding='utf-8', engine= 'python',converters={'_id':str})

In [None]:
# Preview the first three rows
all_apps.head(3)

In [None]:
# Original types
all_apps.dtypes

In [None]:
# Transform variables to date
vars_dates = ['retrieved_date_start','retrieved_date_end', 'last_update', 'last_update_fixed']
for var in vars_dates:
    all_apps[var] = pd.to_datetime(all_apps[var])

In [None]:
# Transformed types
all_apps.dtypes

# General description of numeric and categorical variables

In [None]:
# Describe length of rows and columns
all_apps.shape

In [None]:
# Describe numerical variables for all the dataset
all_apps.describe()

In [None]:
# Describe categorical variables
all_apps.describe(include=['object','datetime', 'bool'])

In [None]:
# NaN or missing values for the whole dataset
# `missing_data` is a boolean frame (True where the value is missing).
missing_data = all_apps.isnull()
# Number of missing values per column, largest first.
total = missing_data.sum().sort_values(ascending=False)
# Fraction of missing values per column (missing count / number of rows).
percent = (missing_data.sum()/missing_data.count()).sort_values(ascending=False)
summary_missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Missing Percent'])
# Express the fraction as a percentage.
summary_missing_data['Missing Percent'] = summary_missing_data['Missing Percent'].apply(lambda x: x * 100)
summary_missing_data

## Different types of values per categorical variables

In [None]:
def get_number_values_categorical_values(df, column_name:str, high:List[str],low:List[str]):
    """Print the cardinality of a column and classify it as low or high.

    A column with fewer than 100 distinct values whose name does not contain
    "retrieved" has its distinct values printed and its name appended to
    ``low``; every other column name is appended to ``high``.

    Args:
        df: DataFrame that holds the column.
        column_name: Name of the column to inspect.
        high: List that receives the names of high-cardinality columns
            (mutated in place).
        low: List that receives the names of low-cardinality columns
            (mutated in place).
    """
    print("-/-"*5)
    print(f"Column: {column_name}")

    column = df[column_name]
    try:
        number_values = column.nunique()
    except TypeError:
        # Unhashable cell values (e.g. a genre stored as a list) cannot be
        # counted directly, so their string representation is compared instead.
        column = column.astype(str)
        number_values = column.nunique()

    print(f"Number of different values {number_values}")

    if number_values < 100 and "retrieved" not in column_name:
        print("\n")
        print("Values:")
        print(f"{column.unique()}")
        print("\n")

        low.append(column_name)
    else:
        high.append(column_name)
    print("-/-"*5)

# Names of the columns classified as high / low cardinality; both lists are
# filled in place by get_number_values_categorical_values.
high_cardinality = []
low_cardinality = []
# Only object, datetime and bool columns are inspected.
columns_categorical = all_apps.select_dtypes(include=['object','datetime', 'bool']).columns

for column in columns_categorical:
    get_number_values_categorical_values(all_apps,column,high_cardinality,low_cardinality)

# Number of high- and low-cardinality columns.
print(len(high_cardinality), len(low_cardinality))

# Subset of selected variables

In [None]:
# Lookup tables for week numbers 1..30: each maps a week number to the
# 1-based index of the fixed-size bucket it falls into. The table name gives
# the bucket width (two_weeks -> buckets of 2 weeks, and so on).
two_weeks = {week: (week - 1) // 2 + 1 for week in range(1, 31)}

three_weeks = {week: (week - 1) // 3 + 1 for week in range(1, 31)}

five_weeks = {week: (week - 1) // 5 + 1 for week in range(1, 31)}

six_weeks = {week: (week - 1) // 6 + 1 for week in range(1, 31)}

ten_weeks = {week: (week - 1) // 10 + 1 for week in range(1, 31)}


def define_weeks(num_week, time):
    """Return the bucket index of ``num_week`` for the grouping ``time``.

    ``time`` is one of '2_week', '3_week', '5_week', '6_week', '10_week' or
    '15_week'. The table-backed groupings raise ``KeyError`` for a week number
    that is not in their table; '15_week' yields 1 for weeks below 16 and 2
    otherwise. Any other ``time`` value yields ``None``.
    """
    table_by_label = (
        ('2_week', two_weeks),
        ('3_week', three_weeks),
        ('5_week', five_weeks),
        ('6_week', six_weeks),
        ('10_week', ten_weeks),
    )
    for label, table in table_by_label:
        if label == time:
            return table[num_week]

    if '15_week' == time:
        if num_week < 16:
            return 1
        return 2

    return None

In [None]:
# Length of name, summary and description
# Length (number of characters) of name, summary and description
all_apps['len_name'] = all_apps['name'].str.len()
all_apps['len_summary'] = all_apps['summary'].str.len()
all_apps['len_description'] = all_apps['description'].str.len()

# Transform num installs from string to number.
# An explicit copy is taken so the helper column is not written to a slice of
# `all_apps` (which triggers pandas' SettingWithCopyWarning).
fixed_num = all_apps[['fixed_num_installs']].copy()
# regex=False: ',' and '+' are removed as literal characters ('+' on its own
# is not a valid regular expression).
fixed_num['fixed_num_installs_num'] = pd.to_numeric(
    fixed_num['fixed_num_installs']
    .str.replace(',', '', regex=False)
    .str.replace('+', '', regex=False)
)

all_apps['fixed_num_installs_num'] = fixed_num['fixed_num_installs_num']

# One bucket column per grouping granularity, derived from `num_week`.
times = ['2_week', '3_week', '5_week', '6_week', '10_week', '15_week']
for t in times:
    all_apps[t] = all_apps['num_week'].apply(define_weeks, args=(t,))

# Independent copy of the selected variables: later cells rename its columns
# and add weight columns, which must not touch (or warn about) `all_apps`.
only_vars = all_apps[['id', 'len_name', 'len_summary', 'len_description', 
                      'last_update_fixed', 'content_rating', 'rating', 'rating_1','rating_2', 
                      'rating_3', 'rating_4', 'rating_5', 'fixed_num_installs', 
                      'fixed_num_installs_num', 'has_whats_new', 'last_update_days_fixed', 
                      'macro_android_version', 'num_week', 'price', 'price_usd', 'unified_genre', 
                      'clean_category', 'top', 'country', '2_week', '3_week', '5_week', '6_week', '10_week',
                      '15_week', 'General','Flag','Delete','imputed', 'Delete_bo','Delete_br','Delete_de',
                      'Delete_usa']].copy()

only_vars.head(5)

## Subset description of numeric and categorical variables

In [None]:
# Describe length of rows and columns
only_vars.shape

In [None]:
# Describe numerical variables
only_vars.describe()

In [None]:
# Describe categorical variables
only_vars.describe(include=['object','datetime', 'bool'])

In [None]:
# NaN or missing values for the subset
# (re-binds the names used for the full-dataset summary above)
missing_data = only_vars.isnull()
# Number of missing values per column, largest first.
total = missing_data.sum().sort_values(ascending=False)
# Fraction of missing values per column (missing count / number of rows).
percent = (missing_data.sum()/missing_data.count()).sort_values(ascending=False)
summary_missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Missing Percent'])
# Express the fraction as a percentage.
summary_missing_data['Missing Percent'] = summary_missing_data['Missing Percent'].apply(lambda x: x * 100)
summary_missing_data

### Functions

#### To add weight column

In [None]:
def generate_weight_dict(df: pd.DataFrame) -> dict:
    """Count how many rows of ``df`` carry each value of the ``id`` column.

    Returns a plain dict mapping every id to its number of occurrences,
    keyed in order of first appearance.
    """
    occurrences = {}
    for app_id in df['id'].to_list():
        occurrences[app_id] = occurrences.get(app_id, 0) + 1
    return occurrences

def generate_weight_column(df: pd.DataFrame, parameter: str) -> pd.DataFrame:
    """Add per-row weight and occurrence-count columns based on ``id``.

    Two columns are written to ``df`` itself (the frame is modified in place
    and also returned):

    * ``weight_<parameter>``: 1 / (number of rows sharing the row's id)
    * ``amount_apps_<parameter>``: number of rows sharing the row's id
    """
    occurrences = generate_weight_dict(df)

    def inverse_count(app_id):
        return 1/occurrences[app_id]

    def count(app_id):
        return occurrences[app_id]

    df[f'weight_{parameter}'] = df['id'].apply(inverse_count)
    df[f'amount_apps_{parameter}'] = df['id'].apply(count)

    return df

#### To plot categorical variables

In [None]:
def generate_dict(values_list: list) -> dict:
    """Return a frequency table of ``values_list``.

    The result maps each distinct value to the number of times it occurs,
    keyed in order of first appearance.
    """
    counts = {}
    for item in values_list:
        counts[item] = counts.get(item, 0) + 1
    return counts

def generate_graph(values_dict: dict, feature: str):
    """Draw a horizontal bar chart of the counts in ``values_dict``.

    Bars are ordered from the largest count (top) to the smallest (bottom).

    Args:
        values_dict: Mapping of category -> count.
        feature: Human-readable feature name, used (title-cased) for the
            x-axis label and the chart title.

    Note:
        ``plt.rcdefaults()`` is called first, which resets matplotlib's global
        rc parameters to their defaults.
    """
    plt.rcdefaults()
    fig, ax = plt.subplots()

    ranked = sorted(values_dict.items(), key=lambda x: x[1], reverse=True)

    # Materialise plain lists: matplotlib expects sequences, not dict views.
    labels = [category for category, _ in ranked]
    counts = [count for _, count in ranked]
    y_pos = np.arange(len(labels))

    ax.barh(y_pos, counts, align='center', color=(0.2, 0.2, 0.2, 0.5))
    ax.set_yticks(y_pos)
    ax.set_yticklabels(labels)
    # The first (largest) category is drawn at the top.
    ax.invert_yaxis()
    ax.set_xlabel(feature.title())
    ax.set_title(f'{feature.title()} Distribution')
    
def generate_pie_chart(variable_dict: dict, feature: str):
    """Show a donut-style pie chart of the counts in ``variable_dict``.

    Args:
        variable_dict: Mapping of category -> count; keys become the slice
            labels, values the slice sizes.
        feature: Human-readable feature name, used (title-cased) in the title.
    """
    figure, axis = plt.subplots()
    _, _, percent_labels = axis.pie(
        variable_dict.values(),
        labels=variable_dict.keys(),
        autopct='%1.1f%%',
        shadow=True,
        startangle=90,
        colors=['#bebebe', '#606060'],
    )
    for percent_label in percent_labels:
        percent_label.set_color('black')

    # A white disc over the centre turns the pie into a donut.
    axis.add_artist(plt.Circle((0, 0), 0.70, fc='white'))

    plt.tight_layout()
    # Equal aspect ratio ensures that the pie is drawn as a circle.
    axis.axis('equal')
    axis.set_title(f'{feature.title()} Distribution')
    plt.show()
    
def generate_horizontal_bar_chart(list_dicts: List[dict], list_labels: list, variable: str):
    """Show an interactive horizontal bar chart with a trace selector.

    One bar trace is added per (count dict, label) pair, with categories
    sorted by descending count. A dropdown offers one entry per label that
    shows only that trace, plus an 'All' entry that shows every trace.

    Args:
        list_dicts: Category -> count mappings, one per group.
        list_labels: Group labels, aligned with ``list_dicts``.
        variable: Chart title.
    """
    n_labels = len(list_labels)
    fig = go.Figure()
    buttons = []

    for position, (counts, label) in enumerate(zip(list_dicts, list_labels)):
        ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
        label_text = str(label)

        fig.add_trace(go.Bar(
            x=[count for _, count in ranked],
            y=[category for category, _ in ranked],
            name=label_text,
            orientation='h',
        ))

        # Visibility mask with only this trace switched on.
        only_this = [other == position for other in range(n_labels)]
        buttons.append({
            'args': [{"visible": only_this}],
            'label': label_text,
            'method': 'update',
        })

    buttons.append({
        'args': [{"visible": [True]*n_labels}],
        'label': 'All',
        'method': 'update',
    })

    # Add dropdown
    dropdown = {
        'buttons': list(buttons),
        'direction': "down",
        'pad': {"r": 10, "t": 10},
        'showactive': True,
        'x': 0.1,
        'xanchor': "left",
        'y': 1.1,
        'yanchor': "top",
    }
    fig.update_layout(title=variable, updatemenus=[dropdown])

    fig.show()
    
def generate_horizontal_bar_chart_dropdown_2(list_dicts: List[dict], list_labels: list, variable: str):
    """Show an interactive horizontal bar chart with a trace selector.

    Kept as a separate entry point for existing callers. The previous body
    was a line-for-line duplicate of ``generate_horizontal_bar_chart`` (plus
    an unused ``buttons_2`` list), so it now delegates to that function.

    Args:
        list_dicts: Category -> count mappings, one per group.
        list_labels: Group labels, aligned with ``list_dicts``.
        variable: Chart title.
    """
    generate_horizontal_bar_chart(list_dicts, list_labels, variable)
    
    
def generate_pie_chart_dropdown(list_dicts: List[dict], list_labels: list, variable: str):
    """Show (and export) interactive pie charts with a trace selector.

    One pie trace is added per (count dict, label) pair and a dropdown entry
    per label shows only that trace. The figure is also written to
    ``<variable>-rq2-locf.html`` without opening a browser.

    Args:
        list_dicts: Category -> count mappings, one per group.
        list_labels: Group labels, aligned with ``list_dicts``.
        variable: Chart title, also used as prefix of the HTML file name.
    """
    fig = go.Figure()
    buttons = []

    for idx, (dict_num, label) in enumerate(zip(list_dicts, list_labels)):
        dict_current = OrderedDict(sorted(dict_num.items(), key=lambda x: x[1], reverse=True))
        visible = [False]*len(list_labels)
        visible[idx]=True
        # Labels are often numeric group keys; convert them explicitly, as the
        # bar-chart helpers do, so trace names and button labels are strings.
        label_text = str(label)

        fig.add_trace(
            go.Pie(
                values=list(dict_current.values()),
                labels=list(dict_current.keys()),
                name=label_text
            )
        )

        buttons.append(dict(
            args=[{"visible": visible}],
            label=label_text,
            method='update'
        ))

    # Add dropdown
    fig.update_layout(
        title=variable,
        updatemenus=[
            dict(
                buttons=list(buttons),
                direction="down",
                pad={"r": 10, "t": 10},
                showactive=True,
                x=0.1,
                xanchor="left",
                y=1.1,
                yanchor="top"
            ),
        ]
    )

    plotly.offline.plot(fig, filename= f'{variable}-rq2-locf.html', auto_open=False)
    fig.show()

#### To get numerical variables stats

In [None]:
def get_stats(variable: str, data: dict, df_big: pd.DataFrame, weight_name: str, group: str):
    """Append summary statistics of ``variable`` to the accumulator ``data``.

    Rows where either ``variable`` or the weight column is missing are
    dropped before any statistic is computed.

    Args:
        variable: Name of the numeric column to summarise.
        data: Dict of lists with the keys 'variable', 'group',
            'weighted_mean', 'mean', 'min', 'q1', 'median', 'q3', 'max' and
            'std'; one value is appended to each list (mutated in place).
        df_big: DataFrame holding ``variable`` and ``weight_name``.
        weight_name: Name of the column with the per-row weights.
        group: Label stored in ``data['group']`` for this summary.

    The weighted mean is ``None`` when the weights do not sum to a positive
    value.
    """
    df_variable = df_big[[variable, weight_name]].dropna()
    values = df_variable[variable]
    weights = df_variable[weight_name]

    weight_total = weights.sum()
    w_mean = (values*weights).sum()/weight_total if weight_total > 0 else None

    data['variable'].append(variable)
    data['group'].append(group)
    data['weighted_mean'].append(w_mean)
    data['mean'].append(values.mean())
    data['min'].append(values.min())
    data['q1'].append(values.quantile(q=0.25))
    data['median'].append(values.median())
    data['q3'].append(values.quantile(q=0.75))
    data['max'].append(values.max())
    data['std'].append(values.std())

#### To plot numerical variables

In [None]:
def plot_violin(df_big: pd.DataFrame, variable: str, x_axis: str, score: float, hue_var=''):
    """Draw a violin plot of ``variable`` per ``x_axis`` category.

    Rows with a missing ``variable`` are dropped, then only rows whose
    absolute z-score of ``variable`` is below ``score`` are plotted.

    Args:
        df_big: Source DataFrame.
        variable: Numeric column plotted on the y axis.
        x_axis: Column used for the x-axis categories.
        score: Exclusive upper bound on the absolute z-score.
        hue_var: Optional column used for the hue; '' disables it.
    """
    columns = [variable, x_axis]
    if hue_var != '':
        # The hue column has to be part of the plotted frame; de-duplicate in
        # case it is the same column as `x_axis` or `variable`.
        columns = list(dict.fromkeys(columns + [hue_var]))

    df = df_big[columns]
    df = df.dropna(subset=[variable])
    z_scores = np.abs(zscore(df[variable]))
    filtered_entries = np.where(z_scores < score)
    df = df.iloc[filtered_entries]
    plt.figure()
    if hue_var != '':
        sns.violinplot(x=x_axis, y=variable, hue=hue_var, data=df)
    else:
        sns.violinplot(x=x_axis, y=variable, data=df)
        
def plot_boxplot(df_big: pd.DataFrame, variable: str, x_axis: str, score: float, hue_var=''):
    """Draw a vertical boxplot of ``variable`` per ``x_axis`` category.

    Rows with a missing ``variable`` are dropped, then only rows whose
    absolute z-score of ``variable`` is below ``score`` are plotted.
    ``hue_var`` ('' to disable) optionally splits every category by hue.
    """
    use_hue = hue_var != ''
    selected = [variable, x_axis, hue_var] if use_hue else [variable, x_axis]

    df = df_big[selected].dropna(subset=[variable])
    # Positions of the rows that are not outliers.
    kept_positions = np.where(np.abs(zscore(df[variable])) < score)
    df = df.iloc[kept_positions]
    plt.figure()

    plot_kwargs = dict(x=x_axis, y=variable, data=df, orient="v")
    if use_hue:
        plot_kwargs['hue'] = hue_var
    sns.boxplot(**plot_kwargs)
        
def plot_boxplot_horizontal(df_big: pd.DataFrame, variable: str, y_axis: str, score: float, hue_var=''):
    """Draw a horizontal boxplot of ``variable`` per ``y_axis`` category.

    Rows with a missing ``variable`` are dropped, then only rows whose
    absolute z-score of ``variable`` is below ``score`` are plotted.

    Args:
        df_big: Source DataFrame.
        variable: Numeric column plotted on the x axis.
        y_axis: Column used for the y-axis categories.
        score: Exclusive upper bound on the absolute z-score.
        hue_var: Optional column used for the hue; '' disables it.
    """
    # The category column must be selected as well (the value column used to
    # be selected twice instead); de-duplicate in case columns coincide.
    columns = [variable, y_axis]
    if hue_var != '':
        columns.append(hue_var)
    columns = list(dict.fromkeys(columns))

    df = df_big[columns]
    df = df.dropna(subset=[variable])
    z_scores = np.abs(zscore(df[variable]))
    filtered_entries = np.where(z_scores < score)
    df = df.iloc[filtered_entries]
    plt.figure()

    if hue_var != '':
        sns.boxplot(x=variable, y=y_axis, hue=hue_var, data=df, orient="h")
    else:
        sns.boxplot(x=variable, y=y_axis, data=df, orient="h")
        
def plot_boxplot_single(df_big: pd.DataFrame, variable: str, score: float):
    """Draw a single boxplot of ``variable``.

    Rows with a missing ``variable`` are dropped, then only rows whose
    absolute z-score of ``variable`` is below ``score`` are plotted.
    """
    df = df_big[[variable]].dropna(subset=[variable])
    # Positions of the rows that are not outliers.
    kept_positions = np.where(np.abs(zscore(df[variable])) < score)
    df = df.iloc[kept_positions]

    plt.figure()
    sns.boxplot(y=variable, data=df)

#### To plot numerical variables in gray palette

In [None]:
def plot_boxplot_gray(df_big: pd.DataFrame, variable: str, x_axis: str, score: float, hue_var=''):
    """Draw a vertical boxplot of ``variable`` per ``x_axis`` in grey tones.

    Rows with a missing ``variable`` are dropped, then only rows whose
    absolute z-score of ``variable`` is below ``score`` are plotted on a
    16x10 figure with the "whitegrid" style and the grey palette
    ``seq_col_brew[1:]``. ``hue_var`` ('' to disable) optionally splits every
    category by hue.
    """
    use_hue = hue_var != ''
    selected = [variable, x_axis, hue_var] if use_hue else [variable, x_axis]

    df = df_big[selected].dropna(subset=[variable])
    # Positions of the rows that are not outliers.
    kept_positions = np.where(np.abs(zscore(df[variable])) < score)
    df = df.iloc[kept_positions]

    plt.figure(figsize=(16, 10))
    sns.set_style("whitegrid")

    plot_kwargs = dict(x=x_axis, y=variable, data=df, orient="v", palette=seq_col_brew[1:])
    if use_hue:
        plot_kwargs['hue'] = hue_var
    sns.boxplot(**plot_kwargs)
        
from matplotlib.patches import PathPatch

def adjust_box_widths(g, fac):
    """
    Adjust the widths of the boxes of a seaborn-generated horizontal boxplot.

    Every ``PathPatch`` found in the axes of figure ``g`` is rescaled around
    its centre by the factor ``fac`` along the second vertex coordinate
    (column 1 of the path vertices, i.e. the y direction, which is the
    thickness of a horizontal box). Lines of the same axes whose y data span
    exactly the old extent of a box (the median line) are resized to match.

    Args:
        g: Figure whose axes are processed.
        fac: Scale factor applied to the box thickness (1 keeps it unchanged).
    """
    # iterating through Axes instances
    for ax in g.axes:

        # iterating through axes artists:
        for c in ax.get_children():

            # only PathPatches (the boxes) are of interest
            if not isinstance(c, PathPatch):
                continue

            # current extent of the box; the last vertex is left out.
            # `verts_sub` is a view, so the path itself is edited in place.
            verts = c.get_path().vertices
            verts_sub = verts[:-1]
            ymin = np.min(verts_sub[:, 1])
            ymax = np.max(verts_sub[:, 1])
            ymid = 0.5*(ymin+ymax)
            yhalf = 0.5*(ymax - ymin)

            # new extent of the box
            ymin_new = ymid-fac*yhalf
            ymax_new = ymid+fac*yhalf
            verts_sub[verts_sub[:, 1] == ymin, 1] = ymin_new
            verts_sub[verts_sub[:, 1] == ymax, 1] = ymax_new

            # new extent of the median line. The box extent was measured
            # along y, so the lines are matched on their y data as well;
            # array_equal also copes with lines of a different length
            # (e.g. outlier markers) without raising.
            for line in ax.lines:
                if np.array_equal(line.get_ydata(), [ymin, ymax]):
                    line.set_ydata([ymin_new, ymax_new])
        
        
def plot_boxplot_horizontal_gray(df_big: pd.DataFrame, variable: str, x_axis: str, score: float, hue_var=''):
    """Draw a horizontal boxplot of ``variable`` per ``x_axis`` in grey tones.

    Rows with a missing ``variable`` are dropped, then only rows whose
    absolute z-score of ``variable`` is below ``score`` are plotted on a
    16x10 figure with the "whitegrid" style and the grey palette
    ``seq_col_brew[1:]``.

    Args:
        df_big: Source DataFrame.
        variable: Numeric column plotted on the x axis.
        x_axis: Column used for the categories (drawn on the y axis).
        score: Exclusive upper bound on the absolute z-score.
        hue_var: Optional column used for the hue; '' disables it. With a
            hue, the boxes are additionally narrowed via ``adjust_box_widths``.

    Note:
        The font-size settings are written to the global ``plt.rcParams`` and
        therefore stay in effect for later plots.
    """
    # Apply the font sizes BEFORE the figure is created, so that they already
    # affect this plot and not only the ones drawn afterwards.
    size=15
    params = {'legend.fontsize': 'large',
          'figure.figsize': (20,8),
          'axes.labelsize': size,
          'axes.titlesize': size,
          'xtick.labelsize': size*0.95,
          'ytick.labelsize': size*0.95,
          'axes.titlepad': 25}
    plt.rcParams.update(params)

    if hue_var != '':
        df = df_big[[variable, x_axis, hue_var]]
    else:
        df = df_big[[variable, x_axis]]
    df = df.dropna(subset=[variable])
    z_scores = np.abs(zscore(df[variable]))
    filtered_entries = np.where(z_scores < score)
    df = df.iloc[filtered_entries]
    # The explicit figsize takes precedence over rcParams['figure.figsize'].
    fig = plt.figure(figsize=(16, 10))
    sns.set_style("whitegrid")

    if hue_var != '':
        sns.boxplot(x=variable, y=x_axis, hue=hue_var, data=df, orient="h", palette=seq_col_brew[1:], width=0.4)
        adjust_box_widths(fig, 0.8)
    else:
        sns.boxplot(x=variable, y=x_axis, data=df, orient="h", palette=seq_col_brew[1:], width=0.2)
        
def plot_boxplot_single_gray(df_big: pd.DataFrame, variable: str, score: float):
    """Draw a single horizontal boxplot of ``variable`` with white boxes.

    Rows with a missing ``variable`` are dropped, then only rows whose
    absolute z-score of ``variable`` is below ``score`` are plotted on a
    16x10 figure with the "whitegrid" style.
    """
    df = df_big[[variable]].dropna(subset=[variable])
    # Positions of the rows that are not outliers.
    kept_positions = np.where(np.abs(zscore(df[variable])) < score)
    df = df.iloc[kept_positions]

    plt.figure(figsize=(16, 10))
    sns.set_style("whitegrid")

    axes = sns.boxplot(x=variable, data=df, palette=seq_col_brew[1:], width=0.5)
    # Repaint every box artist registered on the axes in white.
    for box_artist in axes.artists:
        box_artist.set_facecolor("white")

#### To generate possible pairs

In [None]:
def generate_pairs(source: list) -> list:
    """Return every unordered pair of elements of ``source``.

    Pairs are two-element lists ``[source[i], source[j]]`` with ``i < j``,
    listed in index order.
    """
    size = len(source)
    return [
        [source[first], source[second]]
        for first in range(size)
        for second in range(first + 1, size)
    ]

#### To generate pvalues and cliffs delta

In [None]:
def generate_pval_dataframe(list_keys: list, df_big: pd.DataFrame, value: str, num_variable: str) -> pd.DataFrame:
    """Compare ``num_variable`` between every pair of groups of ``value``.

    For each pair of keys a two-sided Mann-Whitney U test and Cliff's delta
    are computed; the p-values are then corrected with Holm's method
    (alpha = 0.05).

    Args:
        list_keys: Group keys (values of the ``value`` column) to compare.
        df_big: DataFrame holding ``value`` and ``num_variable``.
        value: Name of the grouping column.
        num_variable: Name of the numeric column that is compared.

    Returns:
        One row per pair with the columns 'c1', 'c2', 'pvalue', 'd', 'size',
        'reject' and 'pvalue_corrected'.
    """
    rows = []

    for c1, c2 in generate_pairs(list_keys):
        x = df_big.loc[df_big[value] == c1, num_variable].to_list()
        y = df_big.loc[df_big[value] == c2, num_variable].to_list()

        _, pval = mannwhitneyu(x, y, alternative='two-sided')
        d, size = cliffsDelta(x, y)

        rows.append({'c1': c1, 'c2': c2, 'pvalue': pval, 'd': d, 'size': size})

    # Build the frame once: DataFrame.append is no longer available in
    # current pandas and re-allocated the whole frame on every call.
    df = pd.DataFrame(rows, columns=['c1', 'c2', 'pvalue', 'd', 'size'])

    if not rows:
        # Fewer than two keys: nothing to correct, return an empty table.
        df['reject'] = pd.Series(dtype=bool)
        df['pvalue_corrected'] = pd.Series(dtype=float)
        return df

    reject, pval_corrected, _, _ = multipletests(df['pvalue'].to_list(), alpha=0.05, method='holm', is_sorted=False, returnsorted=False)
    df['reject'] = reject
    df['pvalue_corrected'] = pval_corrected

    return df

def generate_pval_dataframe_two(list_keys: list, df_big: pd.DataFrame, value_1: str, value_2:str, num_variable: str) -> pd.DataFrame:
    """Compare ``num_variable`` between every pair of two-column groups.

    Every key is a pair ``(key_1, key_2)`` selecting the rows where
    ``value_1 == key_1`` and ``value_2 == key_2``. For each pair of keys a
    two-sided Mann-Whitney U test and Cliff's delta are computed; the
    p-values are then corrected with Holm's method (alpha = 0.05).

    Args:
        list_keys: Group keys, each indexable as ``key[0]`` / ``key[1]``.
        df_big: DataFrame holding ``value_1``, ``value_2`` and ``num_variable``.
        value_1: Name of the first grouping column.
        value_2: Name of the second grouping column.
        num_variable: Name of the numeric column that is compared.

    Returns:
        One row per pair with the columns 'c1', 'c2', 'pvalue', 'd', 'size',
        'reject' and 'pvalue_corrected'.
    """
    rows = []

    for c1, c2 in generate_pairs(list_keys):
        in_first = (df_big[value_1] == c1[0]) & (df_big[value_2] == c1[1])
        in_second = (df_big[value_1] == c2[0]) & (df_big[value_2] == c2[1])

        x = df_big.loc[in_first, num_variable].to_list()
        y = df_big.loc[in_second, num_variable].to_list()

        _, pval = mannwhitneyu(x, y, alternative='two-sided')
        d, size = cliffsDelta(x, y)

        rows.append({'c1': c1, 'c2': c2, 'pvalue': pval, 'd': d, 'size': size})

    # Build the frame once: DataFrame.append is no longer available in
    # current pandas and re-allocated the whole frame on every call.
    df = pd.DataFrame(rows, columns=['c1', 'c2', 'pvalue', 'd', 'size'])

    if not rows:
        # Fewer than two keys: nothing to correct, return an empty table.
        df['reject'] = pd.Series(dtype=bool)
        df['pvalue_corrected'] = pd.Series(dtype=float)
        return df

    reject, pval_corrected, _, _ = multipletests(df['pvalue'].to_list(), alpha=0.05, method='holm', is_sorted=False, returnsorted=False)
    df['reject'] = reject
    df['pvalue_corrected'] = pval_corrected

    return df

# Analysis

In [None]:
# Rename columns
# The new names are assigned positionally, so this list must stay in the same
# order as the column selection that created `only_vars`.
only_vars.columns = ['id','len_name','len_summary', 'len_description', 'last_update_date', 
                     'content_rating', 'rating', 'rating_1', 'rating_2', 'rating_3', 'rating_4', 'rating_5',
                     'num_installs', 'num_installs_num', 'has_whats_new', 'last_update_days', 'android_version',
                     'num_week', 'price', 'price_usd', 'genre', 'category', 'top', 'country','2_week', '3_week', 
                     '5_week', '6_week', '10_week', '15_week', 'General','Flag','Delete','imputed', 'Delete_bo',
                     'Delete_br','Delete_de','Delete_usa']

only_vars.head(3)

## General

In [None]:
# Save a copy of only_vars (restored before each grouped analysis)
only_vars_original = only_vars.copy()

In [None]:
# Add weight column based on app id
# (adds `weight_general` and `amount_apps_general`)
only_vars = generate_weight_column(only_vars, 'general')
only_vars.head(3)

## Categorical variables

### Content Rating

In [None]:
# Frequency table and bar chart of the content rating
content_rating = only_vars['content_rating']
content_rating_dict = generate_dict(content_rating.to_list())
pp.pprint(content_rating_dict)
generate_graph(content_rating_dict, 'content rating')

### Android Version

In [None]:
# Frequency table and bar chart of the Android version
macro_android_version = only_vars['android_version']
macro_android_version_dict = generate_dict(macro_android_version.to_list())
pp.pprint(macro_android_version_dict)
generate_graph(macro_android_version_dict, 'android version')

### Genre

In [None]:
# Frequency table and bar chart of the genre
genre = only_vars['genre']
genre_dict = generate_dict(genre.to_list())
pp.pprint(genre_dict)
generate_graph(genre_dict, 'genre')

### Has What's New

In [None]:
# Frequency table and pie chart of the `has_whats_new` flag
has_whats_new = only_vars['has_whats_new']
has_whats_new_dict = generate_dict(has_whats_new.to_list())
pp.pprint(has_whats_new_dict)
generate_pie_chart(has_whats_new_dict, "has whats new")

### Num Installs

In [None]:
# Frequency table and bar chart of the install bracket (string form)
fixed_num_installs = only_vars['num_installs']
fixed_num_installs_dict = generate_dict(fixed_num_installs.to_list())
pp.pprint(fixed_num_installs_dict)
generate_graph(fixed_num_installs_dict, 'num installs')

## Numerical variables

### Numerical variables stats

In [None]:
# Numeric columns summarised and plotted in every analysis below.
numeric_variables = ['len_name', 'len_summary', 'len_description', 
                     'rating', 'rating_1','rating_2', 'rating_3', 'rating_4', 'rating_5', 
                     'last_update_days', 'price_usd','num_installs_num']

In [None]:
# Accumulator filled by get_stats: one list entry per summarised variable.
data_general = {
    'variable': [],
    'group':[],
    'weighted_mean': [],
    'mean': [],
    'min': [],
    'q1': [],
    'median': [],
    'q3': [],
    'max': [],
    'std': []
}

for nv in numeric_variables:
    get_stats(nv, data_general, only_vars, 'weight_general', 'general')

In [None]:
# Show floats with two decimals from here on (global pandas option).
pd.options.display.float_format = "{:.2f}".format
numeric_general = pd.DataFrame(data_general)
numeric_general

In [None]:
# One boxplot per numeric variable, keeping rows with |z-score| < 2.
for nv in numeric_variables:
    plot_boxplot_single_gray(only_vars, nv, 2)

## Grouped by week

### Categorical Variables

In [None]:
# Make a copy of original only vars
only_vars = only_vars_original.copy()

In [None]:
groups_week = only_vars.groupby(['num_week'])

data_week = {
    'variable': [],
    'group': [],
    'weighted_mean': [],
    'mean': [],
    'min': [],
    'q1': [],
    'median': [],
    'q3': [],
    'max': [],
    'std': []
}

content_rating_dicts_week = []
android_version_dicts_week = []
has_whats_new_dicts_week = []
genre_dicts_week = []
week_labels = []
    
# Iterate over week groups to plot variables and get stats
for key in groups_week.groups.keys():
    df = groups_week.get_group(key)
    # Generate a weight column for each group
    df = generate_weight_column(df, key)
    
    week_labels.append(key)
    ct_dict = generate_dict(df['content_rating'].to_list())
    content_rating_dicts_week.append(ct_dict)
    
    av_dict = generate_dict(df['android_version'].to_list())
    android_version_dicts_week.append(av_dict)
    
    hw_dict = generate_dict(df['has_whats_new'].to_list())
    has_whats_new_dicts_week.append(hw_dict)
    
    g_dict = generate_dict(df['genre'].to_list())
    genre_dicts_week.append(g_dict)
    
    for nv in numeric_variables:
        get_stats(nv, data_week, df, f'weight_{key}', key)

In [None]:
generate_horizontal_bar_chart(content_rating_dicts_week, week_labels, 'Content Rating')

In [None]:
content_rating_week = pd.DataFrame()
for label, d in zip(week_labels, content_rating_dicts_week):
    d['week'] = label
    content_rating_week = content_rating_week.append(d, ignore_index=True)
    
content_rating_week

In [None]:
generate_horizontal_bar_chart(android_version_dicts_week, week_labels, 'Android Versions')

In [None]:
android_version_week = pd.DataFrame()
for label, d in zip(week_labels, android_version_dicts_week):
    d['week'] = label
    android_version_week = android_version_week.append(d, ignore_index=True)
    
android_version_week

In [None]:
generate_horizontal_bar_chart(genre_dicts_week, week_labels, 'Genre')

In [None]:
genre_week = pd.DataFrame()
for label, d in zip(week_labels, genre_dicts_week):
    d['week'] = label
    genre_week = genre_week.append(d, ignore_index=True)
    
genre_week

In [None]:
generate_pie_chart_dropdown(has_whats_new_dicts_week, week_labels, 'Has Whats New')

In [None]:
has_whats_new_week = pd.DataFrame()
for label, d in zip(week_labels, has_whats_new_dicts_week):
    d['week'] = label
    total = d[True] + d[False]
    d['percentage_true'] = (d[True]/total)*100
    d['percentage_false'] = (d[False]/total)*100
    
    has_whats_new_week = has_whats_new_week.append(d, ignore_index=True)
    
has_whats_new_week

### Numerical variables

In [None]:
# Per-week summary statistics, ordered by variable and then by week.
numeric_data_week = pd.DataFrame(data_week)
numeric_data_week =numeric_data_week.sort_values(by=['variable', 'group'])

numeric_data_week

# The cells below repeat the same two steps for every numeric variable:
#   1. boxplot per week (rows with |z-score| < 2) and pairwise comparison of
#      the weeks (Mann-Whitney U, Cliff's delta, Holm-corrected p-values);
#   2. keep the comparison table and count the effect-size labels.
# NOTE: from `rating` onwards the result names keep a `len_` prefix although
# the variable is not a length (e.g. `len_rating_week_df` holds `rating`).

In [None]:
# len_name: plot and pairwise week comparison
var = 'len_name'
plot_boxplot_horizontal_gray(only_vars, var, 'num_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_week.groups.keys()), only_vars, 'num_week', var)
week_pvalues

In [None]:
# len_name: keep the table and count effect sizes
len_name_week_df = week_pvalues
len_name_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(len_name_week_dict)

In [None]:
# len_summary: plot and pairwise week comparison
var = 'len_summary'
plot_boxplot_horizontal_gray(only_vars, var, 'num_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_week.groups.keys()), only_vars, 'num_week', var)
week_pvalues

In [None]:
# len_summary: keep the table and count effect sizes
len_summary_week_df = week_pvalues
len_summary_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(len_summary_week_dict)

In [None]:
# len_description: plot and pairwise week comparison
var = 'len_description'
plot_boxplot_horizontal_gray(only_vars, var, 'num_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_week.groups.keys()), only_vars, 'num_week', var)
week_pvalues

In [None]:
# len_description: keep the table and count effect sizes
len_description_week_df = week_pvalues
len_description_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(len_description_week_dict)

In [None]:
# rating: plot and pairwise week comparison
var = 'rating'
plot_boxplot_horizontal_gray(only_vars, var, 'num_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_week.groups.keys()), only_vars, 'num_week', var)
week_pvalues

In [None]:
# rating: keep the table and count effect sizes
len_rating_week_df = week_pvalues
len_rating_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(len_rating_week_dict)

In [None]:
# rating_1: plot and pairwise week comparison
var = 'rating_1'
plot_boxplot_horizontal_gray(only_vars, var, 'num_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_week.groups.keys()), only_vars, 'num_week', var)
week_pvalues

In [None]:
# rating_1: keep the table and count effect sizes
len_rating_1_week_df = week_pvalues
len_rating_1_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(len_rating_1_week_dict)

In [None]:
# rating_2: plot and pairwise week comparison
var = 'rating_2'
plot_boxplot_horizontal_gray(only_vars, var, 'num_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_week.groups.keys()), only_vars, 'num_week', var)
week_pvalues

In [None]:
# rating_2: keep the table and count effect sizes
len_rating_2_week_df = week_pvalues
len_rating_2_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(len_rating_2_week_dict)

In [None]:
# rating_3: plot and pairwise week comparison
var = 'rating_3'
plot_boxplot_horizontal_gray(only_vars, var, 'num_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_week.groups.keys()), only_vars, 'num_week', var)
week_pvalues

In [None]:
# rating_3: keep the table and count effect sizes
len_rating_3_week_df = week_pvalues
len_rating_3_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(len_rating_3_week_dict)

In [None]:
# rating_4: plot and pairwise week comparison
var = 'rating_4'
plot_boxplot_horizontal_gray(only_vars, var, 'num_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_week.groups.keys()), only_vars, 'num_week', var)
week_pvalues

In [None]:
# rating_4: keep the table and count effect sizes
len_rating_4_week_df = week_pvalues
len_rating_4_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(len_rating_4_week_dict)

In [None]:
# rating_5: plot and pairwise week comparison
var = 'rating_5'
plot_boxplot_horizontal_gray(only_vars, var, 'num_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_week.groups.keys()), only_vars, 'num_week', var)
week_pvalues

In [None]:
# rating_5: keep the table and count effect sizes
len_rating_5_week_df = week_pvalues
len_rating_5_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(len_rating_5_week_dict)

In [None]:
# last_update_days: plot and pairwise week comparison
var = 'last_update_days'
plot_boxplot_horizontal_gray(only_vars, var, 'num_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_week.groups.keys()), only_vars, 'num_week', var)
week_pvalues

In [None]:
# last_update_days: keep the table and count effect sizes
len_last_update_days_week_df = week_pvalues
len_last_update_days_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(len_last_update_days_week_dict)

In [None]:
# price_usd: plot and pairwise week comparison
var = 'price_usd'
plot_boxplot_horizontal_gray(only_vars, var, 'num_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_week.groups.keys()), only_vars, 'num_week', var)
week_pvalues

In [None]:
# price_usd: keep the table and count effect sizes
len_price_usd_week_df = week_pvalues
len_price_usd_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(len_price_usd_week_dict)

In [None]:
# num_installs_num: plot and pairwise week comparison
var = 'num_installs_num'
plot_boxplot_horizontal_gray(only_vars, var, 'num_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_week.groups.keys()), only_vars, 'num_week', var)
week_pvalues

In [None]:
# num_installs_num: keep the table and count effect sizes
len_num_installs_num_week_df = week_pvalues
len_num_installs_num_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(len_num_installs_num_week_dict)

## Grouped by two weeks

### Categorical variables

In [None]:
# Reset only_vars to a fresh (deep) copy of only_vars_original for this section.
only_vars = only_vars_original.copy(deep=True)

In [None]:
# Group by the `2_week` column. A scalar key replaces the original length-1
# list (['2_week']): newer pandas releases treat a length-1 list grouper as
# tuple-keyed, which would turn `key` below (and f'weight_{key}') into a
# 1-tuple. A scalar key behaves the same on every pandas version.
groups_2_week = only_vars.groupby('2_week')

# Accumulator handed to get_stats for every (numeric variable, group) pair;
# each field starts as its own empty list.
data_2_week = {
    field: []
    for field in ('variable', 'group', 'weighted_mean', 'mean', 'min',
                  'q1', 'median', 'q3', 'max', 'std')
}

# Per-group generate_dict results, in the same order as labels_2_week.
content_rating_dicts_2_week = []
android_version_dicts_2_week = []
has_whats_new_dicts_2_week = []
genre_dicts_2_week = []
labels_2_week = []

# Walk the groups: collect generate_dict output for each categorical column
# and call get_stats for each numeric variable (nothing is plotted here).
for key in groups_2_week.groups.keys():
    df = groups_2_week.get_group(key)
    # Generate a weight column for each group
    df = generate_weight_column(df, key)

    labels_2_week.append(key)
    ct_dict = generate_dict(df['content_rating'].to_list())
    content_rating_dicts_2_week.append(ct_dict)

    av_dict = generate_dict(df['android_version'].to_list())
    android_version_dicts_2_week.append(av_dict)

    hw_dict = generate_dict(df['has_whats_new'].to_list())
    has_whats_new_dicts_2_week.append(hw_dict)

    g_dict = generate_dict(df['genre'].to_list())
    genre_dicts_2_week.append(g_dict)

    for nv in numeric_variables:
        # f'weight_{key}' is presumably the column added by
        # generate_weight_column -- confirm against its definition.
        get_stats(nv, data_2_week, df, f'weight_{key}', key)

In [None]:
generate_horizontal_bar_chart(content_rating_dicts_2_week, labels_2_week, 'Content Rating')

In [None]:
content_rating_2_week = pd.DataFrame()
for label, d in zip(labels_2_week, content_rating_dicts_2_week):
    d['2_week'] = label
    content_rating_2_week = content_rating_2_week.append(d, ignore_index=True)
    
content_rating_2_week

In [None]:
generate_horizontal_bar_chart(android_version_dicts_2_week, labels_2_week, 'Android Version')

In [None]:
android_version_2_week = pd.DataFrame()
for label, d in zip(labels_2_week, android_version_dicts_2_week):
    d['2_week'] = label
    android_version_2_week = android_version_2_week.append(d, ignore_index=True)
    
android_version_2_week

In [None]:
generate_horizontal_bar_chart(genre_dicts_2_week, labels_2_week, 'Genre')

In [None]:
genre_2_week = pd.DataFrame()
for label, d in zip(labels_2_week, genre_dicts_2_week):
    d['2_week'] = label
    genre_2_week = genre_2_week.append(d, ignore_index=True)
    
genre_2_week

In [None]:
generate_pie_chart_dropdown(has_whats_new_dicts_2_week, labels_2_week, 'Has Whats New')

In [None]:
has_whats_new_2_week = pd.DataFrame()
for label, d in zip(labels_2_week, has_whats_new_dicts_2_week):
    d['week'] = label
    total = d[True] + d[False]
    d['percentage_true'] = (d[True]/total)*100
    d['percentage_false'] = (d[False]/total)*100
    
    has_whats_new_2_week = has_whats_new_2_week.append(d, ignore_index=True)
    
has_whats_new_2_week

### Numerical Variables

In [None]:
# Tabulate the stats collected in data_2_week, ordered by variable then group.
numeric_data_2_week = pd.DataFrame(data_2_week).sort_values(by=['variable', 'group'])

numeric_data_2_week

In [None]:
var = 'len_name'
plot_boxplot_horizontal_gray(only_vars, var, '2_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_2_week.groups.keys()), only_vars, '2_week', var)
week_pvalues

In [None]:
len_name_2_week_df = week_pvalues
len_name_2_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(len_name_2_week_dict)

In [None]:
var = 'len_summary'
plot_boxplot_horizontal_gray(only_vars, var, '2_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_2_week.groups.keys()), only_vars, '2_week', var)
week_pvalues

In [None]:
len_summary_2_week_df = week_pvalues
len_summary_2_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(len_summary_2_week_dict)

In [None]:
var = 'len_description'
plot_boxplot_horizontal_gray(only_vars, var, '2_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_2_week.groups.keys()), only_vars, '2_week', var)
week_pvalues

In [None]:
len_description_2_week_df = week_pvalues
len_description_2_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(len_description_2_week_dict)

In [None]:
var = 'rating'
plot_boxplot_horizontal_gray(only_vars, var, '2_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_2_week.groups.keys()), only_vars, '2_week', var)
week_pvalues

In [None]:
rating_2_week_df = week_pvalues
rating_2_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(rating_2_week_dict)

In [None]:
var = 'rating_1'
plot_boxplot_horizontal_gray(only_vars, var, '2_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_2_week.groups.keys()), only_vars, '2_week', var)
week_pvalues

In [None]:
rating_1_2_week_df = week_pvalues
rating_1_2_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(rating_1_2_week_dict)

In [None]:
var = 'rating_2'
plot_boxplot_horizontal_gray(only_vars, var, '2_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_2_week.groups.keys()), only_vars, '2_week', var)
week_pvalues

In [None]:
rating_2_2_week_df = week_pvalues
rating_2_2_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(rating_2_2_week_dict)

In [None]:
var = 'rating_3'
plot_boxplot_horizontal_gray(only_vars, var, '2_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_2_week.groups.keys()), only_vars, '2_week', var)
week_pvalues

In [None]:
rating_3_2_week_df = week_pvalues
rating_3_2_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(rating_3_2_week_dict)

In [None]:
var = 'rating_4'
plot_boxplot_horizontal_gray(only_vars, var, '2_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_2_week.groups.keys()), only_vars, '2_week', var)
week_pvalues

In [None]:
rating_4_2_week_df = week_pvalues
rating_4_2_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(rating_4_2_week_dict)

In [None]:
var = 'rating_5'
plot_boxplot_horizontal_gray(only_vars, var, '2_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_2_week.groups.keys()), only_vars, '2_week', var)
week_pvalues

In [None]:
rating_5_2_week_df = week_pvalues
rating_5_2_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(rating_5_2_week_dict)

In [None]:
var = 'last_update_days'
plot_boxplot_horizontal_gray(only_vars, var, '2_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_2_week.groups.keys()), only_vars, '2_week', var)
week_pvalues

In [None]:
last_update_days_2_week_df = week_pvalues
last_update_days_2_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(last_update_days_2_week_dict)

In [None]:
var = 'price_usd'
plot_boxplot_horizontal_gray(only_vars, var, '2_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_2_week.groups.keys()), only_vars, '2_week', var)
week_pvalues

In [None]:
price_usd_2_week_df = week_pvalues
price_usd_2_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(price_usd_2_week_dict)

In [None]:
var = 'num_installs_num'
plot_boxplot_horizontal_gray(only_vars, var, '2_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_2_week.groups.keys()), only_vars, '2_week', var)
week_pvalues

In [None]:
num_installs_num_2_week_df = week_pvalues
num_installs_num_2_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(num_installs_num_2_week_dict)

## Grouped by three weeks

### Categorical variables

In [None]:
# Reset only_vars to a fresh (deep) copy of only_vars_original for this section.
only_vars = only_vars_original.copy(deep=True)

In [None]:
# Group by the `3_week` column. A scalar key replaces the original length-1
# list (['3_week']): newer pandas releases treat a length-1 list grouper as
# tuple-keyed, which would turn `key` below (and f'weight_{key}') into a
# 1-tuple. A scalar key behaves the same on every pandas version.
groups_3_week = only_vars.groupby('3_week')

# Accumulator handed to get_stats for every (numeric variable, group) pair;
# each field starts as its own empty list.
data_3_week = {
    field: []
    for field in ('variable', 'group', 'weighted_mean', 'mean', 'min',
                  'q1', 'median', 'q3', 'max', 'std')
}

# Per-group generate_dict results, in the same order as labels_3_week.
content_rating_dicts_3_week = []
android_version_dicts_3_week = []
has_whats_new_dicts_3_week = []
genre_dicts_3_week = []
labels_3_week = []

# Walk the groups: collect generate_dict output for each categorical column
# and call get_stats for each numeric variable (nothing is plotted here).
for key in groups_3_week.groups.keys():
    df = groups_3_week.get_group(key)
    # Generate a weight column for each group
    df = generate_weight_column(df, key)

    labels_3_week.append(key)
    ct_dict = generate_dict(df['content_rating'].to_list())
    content_rating_dicts_3_week.append(ct_dict)

    av_dict = generate_dict(df['android_version'].to_list())
    android_version_dicts_3_week.append(av_dict)

    hw_dict = generate_dict(df['has_whats_new'].to_list())
    has_whats_new_dicts_3_week.append(hw_dict)

    g_dict = generate_dict(df['genre'].to_list())
    genre_dicts_3_week.append(g_dict)

    for nv in numeric_variables:
        # f'weight_{key}' is presumably the column added by
        # generate_weight_column -- confirm against its definition.
        get_stats(nv, data_3_week, df, f'weight_{key}', key)

In [None]:
generate_horizontal_bar_chart(content_rating_dicts_3_week, labels_3_week, 'Content Rating')

In [None]:
content_rating_3_week = pd.DataFrame()
for label, d in zip(labels_3_week, content_rating_dicts_3_week):
    d['3_week'] = label
    content_rating_3_week = content_rating_3_week.append(d, ignore_index=True)
    
content_rating_3_week

In [None]:
generate_horizontal_bar_chart(android_version_dicts_3_week, labels_3_week, 'Android Version')

In [None]:
android_version_3_week = pd.DataFrame()
for label, d in zip(labels_3_week, android_version_dicts_3_week):
    d['3_week'] = label
    android_version_3_week = android_version_3_week.append(d, ignore_index=True)
    
android_version_3_week

In [None]:
generate_horizontal_bar_chart(genre_dicts_3_week, labels_3_week, 'Genre')

In [None]:
genre_3_week = pd.DataFrame()
for label, d in zip(labels_3_week, genre_dicts_3_week):
    d['3_week'] = label
    genre_3_week = genre_3_week.append(d, ignore_index=True)
    
genre_3_week

In [None]:
generate_pie_chart_dropdown(has_whats_new_dicts_3_week, labels_3_week, 'Has Whats New')

In [None]:
has_whats_new_3_week = pd.DataFrame()
for label, d in zip(labels_3_week, has_whats_new_dicts_3_week):
    d['week'] = label
    total = d[True] + d[False]
    d['percentage_true'] = (d[True]/total)*100
    d['percentage_false'] = (d[False]/total)*100
    
    has_whats_new_3_week = has_whats_new_3_week.append(d, ignore_index=True)
    
has_whats_new_3_week

### Numerical Variables

In [None]:
# Tabulate the stats collected in data_3_week, ordered by variable then group.
numeric_data_3_week = pd.DataFrame(data_3_week).sort_values(by=['variable', 'group'])

numeric_data_3_week

In [None]:
var = 'len_name'
plot_boxplot_horizontal_gray(only_vars, var, '3_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_3_week.groups.keys()), only_vars, '3_week', var)
week_pvalues

In [None]:
len_name_3_week_df = week_pvalues
len_name_3_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(len_name_3_week_dict)

In [None]:
var = 'len_summary'
plot_boxplot_horizontal_gray(only_vars, var, '3_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_3_week.groups.keys()), only_vars, '3_week', var)
week_pvalues

In [None]:
len_summary_3_week_df = week_pvalues
len_summary_3_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(len_summary_3_week_dict)

In [None]:
var = 'len_description'
plot_boxplot_horizontal_gray(only_vars, var, '3_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_3_week.groups.keys()), only_vars, '3_week', var)
week_pvalues

In [None]:
len_description_3_week_df = week_pvalues
len_description_3_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(len_description_3_week_dict)

In [None]:
var = 'rating'
plot_boxplot_horizontal_gray(only_vars, var, '3_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_3_week.groups.keys()), only_vars, '3_week', var)
week_pvalues

In [None]:
rating_3_week_df = week_pvalues
rating_3_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(rating_3_week_dict)

In [None]:
var = 'rating_1'
plot_boxplot_horizontal_gray(only_vars, var, '3_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_3_week.groups.keys()), only_vars, '3_week', var)
week_pvalues

In [None]:
rating_1_3_week_df = week_pvalues
rating_1_3_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(rating_1_3_week_dict)

In [None]:
var = 'rating_2'
plot_boxplot_horizontal_gray(only_vars, var, '3_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_3_week.groups.keys()), only_vars, '3_week', var)
week_pvalues

In [None]:
rating_2_3_week_df = week_pvalues
rating_2_3_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(rating_2_3_week_dict)

In [None]:
var = 'rating_3'
plot_boxplot_horizontal_gray(only_vars, var, '3_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_3_week.groups.keys()), only_vars, '3_week', var)
week_pvalues

In [None]:
rating_3_3_week_df = week_pvalues
rating_3_3_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(rating_3_3_week_dict)

In [None]:
var = 'rating_4'
plot_boxplot_horizontal_gray(only_vars, var, '3_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_3_week.groups.keys()), only_vars, '3_week', var)
week_pvalues

In [None]:
rating_4_3_week_df = week_pvalues
rating_4_3_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(rating_4_3_week_dict)

In [None]:
var = 'rating_5'
plot_boxplot_horizontal_gray(only_vars, var, '3_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_3_week.groups.keys()), only_vars, '3_week', var)
week_pvalues

In [None]:
rating_5_3_week_df = week_pvalues
rating_5_3_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(rating_5_3_week_dict)

In [None]:
var = 'last_update_days'
plot_boxplot_horizontal_gray(only_vars, var, '3_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_3_week.groups.keys()), only_vars, '3_week', var)
week_pvalues

In [None]:
last_update_days_3_week_df = week_pvalues
last_update_days_3_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(last_update_days_3_week_dict)

In [None]:
var = 'price_usd'
plot_boxplot_horizontal_gray(only_vars, var, '3_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_3_week.groups.keys()), only_vars, '3_week', var)
week_pvalues

In [None]:
price_usd_3_week_df = week_pvalues
price_usd_3_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(price_usd_3_week_dict)

In [None]:
var = 'num_installs_num'
plot_boxplot_horizontal_gray(only_vars, var, '3_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_3_week.groups.keys()), only_vars, '3_week', var)
week_pvalues

In [None]:
num_installs_num_3_week_df = week_pvalues
num_installs_num_3_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(num_installs_num_3_week_dict)

## Grouped by five weeks

### Categorical variables

In [None]:
# Reset only_vars to a fresh (deep) copy of only_vars_original for this section.
only_vars = only_vars_original.copy(deep=True)

In [None]:
# Group by the `5_week` column. A scalar key replaces the original length-1
# list (['5_week']): newer pandas releases treat a length-1 list grouper as
# tuple-keyed, which would turn `key` below (and f'weight_{key}') into a
# 1-tuple. A scalar key behaves the same on every pandas version.
groups_5_week = only_vars.groupby('5_week')

# Accumulator handed to get_stats for every (numeric variable, group) pair;
# each field starts as its own empty list.
data_5_week = {
    field: []
    for field in ('variable', 'group', 'weighted_mean', 'mean', 'min',
                  'q1', 'median', 'q3', 'max', 'std')
}

# Per-group generate_dict results, in the same order as labels_5_week.
content_rating_dicts_5_week = []
android_version_dicts_5_week = []
has_whats_new_dicts_5_week = []
genre_dicts_5_week = []
labels_5_week = []

# Walk the groups: collect generate_dict output for each categorical column
# and call get_stats for each numeric variable (nothing is plotted here).
for key in groups_5_week.groups.keys():
    df = groups_5_week.get_group(key)
    # Generate a weight column for each group
    df = generate_weight_column(df, key)

    labels_5_week.append(key)
    ct_dict = generate_dict(df['content_rating'].to_list())
    content_rating_dicts_5_week.append(ct_dict)

    av_dict = generate_dict(df['android_version'].to_list())
    android_version_dicts_5_week.append(av_dict)

    hw_dict = generate_dict(df['has_whats_new'].to_list())
    has_whats_new_dicts_5_week.append(hw_dict)

    g_dict = generate_dict(df['genre'].to_list())
    genre_dicts_5_week.append(g_dict)

    for nv in numeric_variables:
        # f'weight_{key}' is presumably the column added by
        # generate_weight_column -- confirm against its definition.
        get_stats(nv, data_5_week, df, f'weight_{key}', key)

In [None]:
generate_horizontal_bar_chart(content_rating_dicts_5_week, labels_5_week, 'Content Rating')

In [None]:
content_rating_5_week = pd.DataFrame()
for label, d in zip(labels_5_week, content_rating_dicts_5_week):
    d['5_week'] = label
    content_rating_5_week = content_rating_5_week.append(d, ignore_index=True)
    
content_rating_5_week

In [None]:
generate_horizontal_bar_chart(android_version_dicts_5_week, labels_5_week, 'Android Version')

In [None]:
android_version_5_week = pd.DataFrame()
for label, d in zip(labels_5_week, android_version_dicts_5_week):
    d['5_week'] = label
    android_version_5_week = android_version_5_week.append(d, ignore_index=True)
    
android_version_5_week

In [None]:
generate_horizontal_bar_chart(genre_dicts_5_week, labels_5_week, 'Genre')

In [None]:
genre_5_week = pd.DataFrame()
for label, d in zip(labels_5_week, genre_dicts_5_week):
    d['5_week'] = label
    genre_5_week = genre_5_week.append(d, ignore_index=True)
    
genre_5_week

In [None]:
generate_pie_chart_dropdown(has_whats_new_dicts_5_week, labels_5_week, 'Has Whats New')

In [None]:
has_whats_new_5_week = pd.DataFrame()
for label, d in zip(labels_5_week, has_whats_new_dicts_5_week):
    d['week'] = label
    total = d[True] + d[False]
    d['percentage_true'] = (d[True]/total)*100
    d['percentage_false'] = (d[False]/total)*100
    
    has_whats_new_5_week = has_whats_new_5_week.append(d, ignore_index=True)
    
has_whats_new_5_week

### Numerical Variables

In [None]:
# Tabulate the stats collected in data_5_week, ordered by variable then group.
numeric_data_5_week = pd.DataFrame(data_5_week).sort_values(by=['variable', 'group'])

numeric_data_5_week

In [None]:
var = 'len_name'
plot_boxplot_horizontal_gray(only_vars, var, '5_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_5_week.groups.keys()), only_vars, '5_week', var)
week_pvalues

In [None]:
len_name_5_week_df = week_pvalues
len_name_5_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(len_name_5_week_dict)

In [None]:
var = 'len_summary'
plot_boxplot_horizontal_gray(only_vars, var, '5_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_5_week.groups.keys()), only_vars, '5_week', var)
week_pvalues

In [None]:
len_summary_5_week_df = week_pvalues
len_summary_5_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(len_summary_5_week_dict)

In [None]:
var = 'len_description'
plot_boxplot_horizontal_gray(only_vars, var, '5_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_5_week.groups.keys()), only_vars, '5_week', var)
week_pvalues

In [None]:
len_description_5_week_df = week_pvalues
len_description_5_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(len_description_5_week_dict)

In [None]:
var = 'rating'
plot_boxplot_horizontal_gray(only_vars, var, '5_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_5_week.groups.keys()), only_vars, '5_week', var)
week_pvalues

In [None]:
rating_5_week_df = week_pvalues
rating_5_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(rating_5_week_dict)

In [None]:
var = 'rating_1'
plot_boxplot_horizontal_gray(only_vars, var, '5_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_5_week.groups.keys()), only_vars, '5_week', var)
week_pvalues

In [None]:
rating_1_5_week_df = week_pvalues
rating_1_5_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(rating_1_5_week_dict)

In [None]:
var = 'rating_2'
plot_boxplot_horizontal_gray(only_vars, var, '5_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_5_week.groups.keys()), only_vars, '5_week', var)
week_pvalues

In [None]:
rating_2_5_week_df = week_pvalues
rating_2_5_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(rating_2_5_week_dict)

In [None]:
var = 'rating_3'
plot_boxplot_horizontal_gray(only_vars, var, '5_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_5_week.groups.keys()), only_vars, '5_week', var)
week_pvalues

In [None]:
rating_3_5_week_df = week_pvalues
rating_3_5_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(rating_3_5_week_dict)

In [None]:
var = 'rating_4'
plot_boxplot_horizontal_gray(only_vars, var, '5_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_5_week.groups.keys()), only_vars, '5_week', var)
week_pvalues

In [None]:
rating_4_5_week_df = week_pvalues
rating_4_5_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(rating_4_5_week_dict)

In [None]:
var = 'rating_5'
plot_boxplot_horizontal_gray(only_vars, var, '5_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_5_week.groups.keys()), only_vars, '5_week', var)
week_pvalues

In [None]:
rating_5_5_week_df = week_pvalues
rating_5_5_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(rating_5_5_week_dict)

In [None]:
var = 'last_update_days'
plot_boxplot_horizontal_gray(only_vars, var, '5_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_5_week.groups.keys()), only_vars, '5_week', var)
week_pvalues

In [None]:
last_update_days_5_week_df = week_pvalues
last_update_days_5_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(last_update_days_5_week_dict)

In [None]:
var = 'price_usd'
plot_boxplot_horizontal_gray(only_vars, var, '5_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_5_week.groups.keys()), only_vars, '5_week', var)
week_pvalues

In [None]:
price_usd_5_week_df = week_pvalues
price_usd_5_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(price_usd_5_week_dict)

In [None]:
var = 'num_installs_num'
plot_boxplot_horizontal_gray(only_vars, var, '5_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_5_week.groups.keys()), only_vars, '5_week', var)
week_pvalues

In [None]:
num_installs_num_5_week_df = week_pvalues
num_installs_num_5_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(num_installs_num_5_week_dict)

## Grouped by six weeks

### Categorical variables

In [None]:
# Reset only_vars to a fresh (deep) copy of only_vars_original for this section.
only_vars = only_vars_original.copy(deep=True)

In [None]:
# Group by the `6_week` column. A scalar key replaces the original length-1
# list (['6_week']): newer pandas releases treat a length-1 list grouper as
# tuple-keyed, which would turn `key` below (and f'weight_{key}') into a
# 1-tuple. A scalar key behaves the same on every pandas version.
groups_6_week = only_vars.groupby('6_week')

# Accumulator handed to get_stats for every (numeric variable, group) pair;
# each field starts as its own empty list.
data_6_week = {
    field: []
    for field in ('variable', 'group', 'weighted_mean', 'mean', 'min',
                  'q1', 'median', 'q3', 'max', 'std')
}

# Per-group generate_dict results, in the same order as labels_6_week.
content_rating_dicts_6_week = []
android_version_dicts_6_week = []
has_whats_new_dicts_6_week = []
genre_dicts_6_week = []
labels_6_week = []

# Walk the groups: collect generate_dict output for each categorical column
# and call get_stats for each numeric variable (nothing is plotted here).
for key in groups_6_week.groups.keys():
    df = groups_6_week.get_group(key)
    # Generate a weight column for each group
    df = generate_weight_column(df, key)

    labels_6_week.append(key)
    ct_dict = generate_dict(df['content_rating'].to_list())
    content_rating_dicts_6_week.append(ct_dict)

    av_dict = generate_dict(df['android_version'].to_list())
    android_version_dicts_6_week.append(av_dict)

    hw_dict = generate_dict(df['has_whats_new'].to_list())
    has_whats_new_dicts_6_week.append(hw_dict)

    g_dict = generate_dict(df['genre'].to_list())
    genre_dicts_6_week.append(g_dict)

    for nv in numeric_variables:
        # f'weight_{key}' is presumably the column added by
        # generate_weight_column -- confirm against its definition.
        get_stats(nv, data_6_week, df, f'weight_{key}', key)

In [None]:
generate_horizontal_bar_chart(content_rating_dicts_6_week, labels_6_week, 'Content Rating')

In [None]:
content_rating_6_week = pd.DataFrame()
for label, d in zip(labels_6_week, content_rating_dicts_6_week):
    d['6_week'] = label
    content_rating_6_week = content_rating_6_week.append(d, ignore_index=True)
    
content_rating_6_week

In [None]:
generate_horizontal_bar_chart(android_version_dicts_6_week, labels_6_week, 'Android Version')

In [None]:
android_version_6_week = pd.DataFrame()
for label, d in zip(labels_6_week, android_version_dicts_6_week):
    d['6_week'] = label
    android_version_6_week = android_version_6_week.append(d, ignore_index=True)
    
android_version_6_week

In [None]:
generate_horizontal_bar_chart(genre_dicts_6_week, labels_6_week, 'Genre')

In [None]:
genre_6_week = pd.DataFrame()
for label, d in zip(labels_6_week, genre_dicts_6_week):
    d['6_week'] = label
    genre_6_week = genre_6_week.append(d, ignore_index=True)
    
genre_6_week

In [None]:
generate_pie_chart_dropdown(has_whats_new_dicts_6_week, labels_6_week, 'Has Whats New')

In [None]:
has_whats_new_6_week = pd.DataFrame()
for label, d in zip(labels_6_week, has_whats_new_dicts_6_week):
    d['week'] = label
    total = d[True] + d[False]
    d['percentage_true'] = (d[True]/total)*100
    d['percentage_false'] = (d[False]/total)*100
    
    has_whats_new_6_week = has_whats_new_6_week.append(d, ignore_index=True)
    
has_whats_new_6_week

### Numerical Variables

In [None]:
# Tabulate the stats collected in data_6_week, ordered by variable then group.
numeric_data_6_week = pd.DataFrame(data_6_week).sort_values(by=['variable', 'group'])

numeric_data_6_week

In [None]:
var = 'len_name'
plot_boxplot_horizontal_gray(only_vars, var, '6_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_6_week.groups.keys()), only_vars, '6_week', var)
week_pvalues

In [None]:
len_name_6_week_df = week_pvalues
len_name_6_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(len_name_6_week_dict)

In [None]:
var = 'len_summary'
plot_boxplot_horizontal_gray(only_vars, var, '6_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_6_week.groups.keys()), only_vars, '6_week', var)
week_pvalues

In [None]:
len_summary_6_week_df = week_pvalues
len_summary_6_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(len_summary_6_week_dict)

In [None]:
var = 'len_description'
plot_boxplot_horizontal_gray(only_vars, var, '6_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_6_week.groups.keys()), only_vars, '6_week', var)
week_pvalues

In [None]:
len_description_6_week_df = week_pvalues
len_description_6_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(len_description_6_week_dict)

In [None]:
var = 'rating'
plot_boxplot_horizontal_gray(only_vars, var, '6_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_6_week.groups.keys()), only_vars, '6_week', var)
week_pvalues

In [None]:
rating_6_week_df = week_pvalues
rating_6_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(rating_6_week_dict)

In [None]:
var = 'rating_1'
plot_boxplot_horizontal_gray(only_vars, var, '6_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_6_week.groups.keys()), only_vars, '6_week', var)
week_pvalues

In [None]:
rating_1_6_week_df = week_pvalues
rating_1_6_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(rating_1_6_week_dict)

In [None]:
var = 'rating_2'
plot_boxplot_horizontal_gray(only_vars, var, '6_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_6_week.groups.keys()), only_vars, '6_week', var)
week_pvalues

In [None]:
rating_2_6_week_df = week_pvalues
rating_2_6_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(rating_2_6_week_dict)

In [None]:
var = 'rating_3'
plot_boxplot_horizontal_gray(only_vars, var, '6_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_6_week.groups.keys()), only_vars, '6_week', var)
week_pvalues

In [None]:
rating_3_6_week_df = week_pvalues
rating_3_6_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(rating_3_6_week_dict)

In [None]:
var = 'rating_4'
plot_boxplot_horizontal_gray(only_vars, var, '6_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_6_week.groups.keys()), only_vars, '6_week', var)
week_pvalues

In [None]:
rating_4_6_week_df = week_pvalues
rating_4_6_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(rating_4_6_week_dict)

In [None]:
var = 'rating_5'
plot_boxplot_horizontal_gray(only_vars, var, '6_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_6_week.groups.keys()), only_vars, '6_week', var)
week_pvalues

In [None]:
rating_5_6_week_df = week_pvalues
rating_5_6_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(rating_5_6_week_dict)

In [None]:
var = 'last_update_days'
plot_boxplot_horizontal_gray(only_vars, var, '6_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_6_week.groups.keys()), only_vars, '6_week', var)
week_pvalues

In [None]:
last_update_days_6_week_df = week_pvalues
last_update_days_6_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(last_update_days_6_week_dict)

In [None]:
var = 'price_usd'
plot_boxplot_horizontal_gray(only_vars, var, '6_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_6_week.groups.keys()), only_vars, '6_week', var)
week_pvalues

In [None]:
price_usd_6_week_df = week_pvalues
price_usd_6_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(price_usd_6_week_dict)

In [None]:
var = 'num_installs_num'
plot_boxplot_horizontal_gray(only_vars, var, '6_week', 2)
week_pvalues = generate_pval_dataframe(list(groups_6_week.groups.keys()), only_vars, '6_week', var)
week_pvalues

In [None]:
num_installs_num_6_week_df = week_pvalues
num_installs_num_6_week_dict = generate_dict(week_pvalues['size'].to_list())
pp.pprint(num_installs_num_6_week_dict)

## Grouped by ten weeks

### Categorical variables

In [None]:
# Reset only_vars to a fresh (deep) copy of only_vars_original for this section.
only_vars = only_vars_original.copy(deep=True)

In [None]:
# Group by the `10_week` column. A scalar key replaces the original length-1
# list (['10_week']): newer pandas releases treat a length-1 list grouper as
# tuple-keyed, which would turn `key` below (and f'weight_{key}') into a
# 1-tuple. A scalar key behaves the same on every pandas version.
groups_10_week = only_vars.groupby('10_week')

# Accumulator handed to get_stats for every (numeric variable, group) pair;
# each field starts as its own empty list.
data_10_week = {
    field: []
    for field in ('variable', 'group', 'weighted_mean', 'mean', 'min',
                  'q1', 'median', 'q3', 'max', 'std')
}

# Per-group generate_dict results, in the same order as labels_10_week.
content_rating_dicts_10_week = []
android_version_dicts_10_week = []
has_whats_new_dicts_10_week = []
genre_dicts_10_week = []
labels_10_week = []

# Walk the groups: collect generate_dict output for each categorical column
# and call get_stats for each numeric variable (nothing is plotted here).
for key in groups_10_week.groups.keys():
    df = groups_10_week.get_group(key)
    # Generate a weight column for each group
    df = generate_weight_column(df, key)

    labels_10_week.append(key)
    ct_dict = generate_dict(df['content_rating'].to_list())
    content_rating_dicts_10_week.append(ct_dict)

    av_dict = generate_dict(df['android_version'].to_list())
    android_version_dicts_10_week.append(av_dict)

    hw_dict = generate_dict(df['has_whats_new'].to_list())
    has_whats_new_dicts_10_week.append(hw_dict)

    g_dict = generate_dict(df['genre'].to_list())
    genre_dicts_10_week.append(g_dict)

    for nv in numeric_variables:
        # f'weight_{key}' is presumably the column added by
        # generate_weight_column -- confirm against its definition.
        get_stats(nv, data_10_week, df, f'weight_{key}', key)

In [None]:
generate_horizontal_bar_chart(content_rating_dicts_10_week, labels_10_week, 'Content Rating')

In [None]:
content_rating_10_week = pd.DataFrame()
for label, d in zip(labels_10_week, content_rating_dicts_10_week):
    d['10_week'] = label
    content_rating_10_week = content_rating_10_week.append(d, ignore_index=True)
    
content_rating_10_week

In [None]:
generate_horizontal_bar_chart(android_version_dicts_10_week, labels_10_week, 'Android Version')

In [None]:
android_version_10_week = pd.DataFrame()
for label, d in zip(labels_10_week, android_version_dicts_10_week):
    d['10_week'] = label
    android_version_10_week = android_version_10_week.append(d, ignore_index=True)
    
android_version_10_week

In [None]:
generate_horizontal_bar_chart(genre_dicts_10_week, labels_10_week, 'Genre')

In [None]:
genre_10_week = pd.DataFrame()
for label, d in zip(labels_10_week, genre_dicts_10_week):
    d['10_week'] = label
    genre_10_week = genre_10_week.append(d, ignore_index=True)
    
genre_10_week

In [None]:
generate_pie_chart_dropdown(has_whats_new_dicts_10_week, labels_10_week, 'Has Whats New')

In [None]:
has_whats_new_10_week = pd.DataFrame()
for label, d in zip(labels_10_week, has_whats_new_dicts_10_week):
    d['week'] = label
    total = d[True] + d[False]
    d['percentage_true'] = (d[True]/total)*100
    d['percentage_false'] = (d[False]/total)*100
    
    has_whats_new_10_week = has_whats_new_10_week.append(d, ignore_index=True)
    
has_whats_new_10_week

### Numerical Variables

In [None]:
# Tabulate the stats collected in data_10_week, ordered by variable then group.
numeric_data_10_week = pd.DataFrame(data_10_week).sort_values(by=['variable', 'group'])

numeric_data_10_week

In [None]:
# len_name by 10-week group: draw the horizontal box plot, then build and
# display the table returned by generate_pval_dataframe for the same groups.
var = 'len_name'
plot_boxplot_horizontal_gray(only_vars, var, '10_week', 2)
week_pvalues = generate_pval_dataframe(
    list(groups_10_week.groups),  # iterating the groups mapping yields its keys
    only_vars,
    '10_week',
    var,
)
week_pvalues

In [None]:
# Keep a reference to the len_name table under its own name (week_pvalues is
# reassigned for every variable; no copy is made), then pretty-print
# generate_dict's result for its 'size' column.
len_name_10_week_df = week_pvalues
len_name_10_week_dict = generate_dict(len_name_10_week_df['size'].to_list())
pp.pprint(len_name_10_week_dict)

In [None]:
# len_summary by 10-week group: draw the horizontal box plot, then build and
# display the table returned by generate_pval_dataframe for the same groups.
var = 'len_summary'
plot_boxplot_horizontal_gray(only_vars, var, '10_week', 2)
week_pvalues = generate_pval_dataframe(
    list(groups_10_week.groups),  # iterating the groups mapping yields its keys
    only_vars,
    '10_week',
    var,
)
week_pvalues

In [None]:
# Keep a reference to the len_summary table under its own name (week_pvalues
# is reassigned for every variable; no copy is made), then pretty-print
# generate_dict's result for its 'size' column.
len_summary_10_week_df = week_pvalues
len_summary_10_week_dict = generate_dict(len_summary_10_week_df['size'].to_list())
pp.pprint(len_summary_10_week_dict)

In [None]:
# len_description by 10-week group: draw the horizontal box plot, then build
# and display the table returned by generate_pval_dataframe for the same groups.
var = 'len_description'
plot_boxplot_horizontal_gray(only_vars, var, '10_week', 2)
week_pvalues = generate_pval_dataframe(
    list(groups_10_week.groups),  # iterating the groups mapping yields its keys
    only_vars,
    '10_week',
    var,
)
week_pvalues

In [None]:
# Keep a reference to the len_description table under its own name
# (week_pvalues is reassigned for every variable; no copy is made), then
# pretty-print generate_dict's result for its 'size' column.
len_description_10_week_df = week_pvalues
len_description_10_week_dict = generate_dict(len_description_10_week_df['size'].to_list())
pp.pprint(len_description_10_week_dict)

In [None]:
# rating by 10-week group: draw the horizontal box plot, then build and
# display the table returned by generate_pval_dataframe for the same groups.
var = 'rating'
plot_boxplot_horizontal_gray(only_vars, var, '10_week', 2)
week_pvalues = generate_pval_dataframe(
    list(groups_10_week.groups),  # iterating the groups mapping yields its keys
    only_vars,
    '10_week',
    var,
)
week_pvalues

In [None]:
# Keep a reference to the rating table under its own name (week_pvalues is
# reassigned for every variable; no copy is made), then pretty-print
# generate_dict's result for its 'size' column.
rating_10_week_df = week_pvalues
rating_10_week_dict = generate_dict(rating_10_week_df['size'].to_list())
pp.pprint(rating_10_week_dict)

In [None]:
# rating_1 by 10-week group: draw the horizontal box plot, then build and
# display the table returned by generate_pval_dataframe for the same groups.
var = 'rating_1'
plot_boxplot_horizontal_gray(only_vars, var, '10_week', 2)
week_pvalues = generate_pval_dataframe(
    list(groups_10_week.groups),  # iterating the groups mapping yields its keys
    only_vars,
    '10_week',
    var,
)
week_pvalues

In [None]:
# Keep a reference to the rating_1 table under its own name (week_pvalues is
# reassigned for every variable; no copy is made), then pretty-print
# generate_dict's result for its 'size' column.
rating_1_10_week_df = week_pvalues
rating_1_10_week_dict = generate_dict(rating_1_10_week_df['size'].to_list())
pp.pprint(rating_1_10_week_dict)

In [None]:
# rating_2 by 10-week group: draw the horizontal box plot, then build and
# display the table returned by generate_pval_dataframe for the same groups.
var = 'rating_2'
plot_boxplot_horizontal_gray(only_vars, var, '10_week', 2)
week_pvalues = generate_pval_dataframe(
    list(groups_10_week.groups),  # iterating the groups mapping yields its keys
    only_vars,
    '10_week',
    var,
)
week_pvalues

In [None]:
# Keep a reference to the rating_2 table under its own name (week_pvalues is
# reassigned for every variable; no copy is made), then pretty-print
# generate_dict's result for its 'size' column.
rating_2_10_week_df = week_pvalues
rating_2_10_week_dict = generate_dict(rating_2_10_week_df['size'].to_list())
pp.pprint(rating_2_10_week_dict)

In [None]:
# rating_3 by 10-week group: draw the horizontal box plot, then build and
# display the table returned by generate_pval_dataframe for the same groups.
var = 'rating_3'
plot_boxplot_horizontal_gray(only_vars, var, '10_week', 2)
week_pvalues = generate_pval_dataframe(
    list(groups_10_week.groups),  # iterating the groups mapping yields its keys
    only_vars,
    '10_week',
    var,
)
week_pvalues

In [None]:
# Keep a reference to the rating_3 table under its own name (week_pvalues is
# reassigned for every variable; no copy is made), then pretty-print
# generate_dict's result for its 'size' column.
rating_3_10_week_df = week_pvalues
rating_3_10_week_dict = generate_dict(rating_3_10_week_df['size'].to_list())
pp.pprint(rating_3_10_week_dict)

In [None]:
# rating_4 by 10-week group: draw the horizontal box plot, then build and
# display the table returned by generate_pval_dataframe for the same groups.
var = 'rating_4'
plot_boxplot_horizontal_gray(only_vars, var, '10_week', 2)
week_pvalues = generate_pval_dataframe(
    list(groups_10_week.groups),  # iterating the groups mapping yields its keys
    only_vars,
    '10_week',
    var,
)
week_pvalues

In [None]:
# Keep a reference to the rating_4 table under its own name (week_pvalues is
# reassigned for every variable; no copy is made), then pretty-print
# generate_dict's result for its 'size' column.
rating_4_10_week_df = week_pvalues
rating_4_10_week_dict = generate_dict(rating_4_10_week_df['size'].to_list())
pp.pprint(rating_4_10_week_dict)

In [None]:
# rating_5 by 10-week group: draw the horizontal box plot, then build and
# display the table returned by generate_pval_dataframe for the same groups.
var = 'rating_5'
plot_boxplot_horizontal_gray(only_vars, var, '10_week', 2)
week_pvalues = generate_pval_dataframe(
    list(groups_10_week.groups),  # iterating the groups mapping yields its keys
    only_vars,
    '10_week',
    var,
)
week_pvalues

In [None]:
# Keep a reference to the rating_5 table under its own name (week_pvalues is
# reassigned for every variable; no copy is made), then pretty-print
# generate_dict's result for its 'size' column.
rating_5_10_week_df = week_pvalues
rating_5_10_week_dict = generate_dict(rating_5_10_week_df['size'].to_list())
pp.pprint(rating_5_10_week_dict)

In [None]:
# last_update_days by 10-week group: draw the horizontal box plot, then build
# and display the table returned by generate_pval_dataframe for the same groups.
var = 'last_update_days'
plot_boxplot_horizontal_gray(only_vars, var, '10_week', 2)
week_pvalues = generate_pval_dataframe(
    list(groups_10_week.groups),  # iterating the groups mapping yields its keys
    only_vars,
    '10_week',
    var,
)
week_pvalues

In [None]:
# Keep a reference to the last_update_days table under its own name
# (week_pvalues is reassigned for every variable; no copy is made), then
# pretty-print generate_dict's result for its 'size' column.
last_update_days_10_week_df = week_pvalues
last_update_days_10_week_dict = generate_dict(last_update_days_10_week_df['size'].to_list())
pp.pprint(last_update_days_10_week_dict)

In [None]:
# price_usd by 10-week group: draw the horizontal box plot, then build and
# display the table returned by generate_pval_dataframe for the same groups.
var = 'price_usd'
plot_boxplot_horizontal_gray(only_vars, var, '10_week', 2)
week_pvalues = generate_pval_dataframe(
    list(groups_10_week.groups),  # iterating the groups mapping yields its keys
    only_vars,
    '10_week',
    var,
)
week_pvalues

In [None]:
# Keep a reference to the price_usd table under its own name (week_pvalues is
# reassigned for every variable; no copy is made), then pretty-print
# generate_dict's result for its 'size' column.
price_usd_10_week_df = week_pvalues
price_usd_10_week_dict = generate_dict(price_usd_10_week_df['size'].to_list())
pp.pprint(price_usd_10_week_dict)

In [None]:
# num_installs_num by 10-week group: draw the horizontal box plot, then build
# and display the table returned by generate_pval_dataframe for the same groups.
var = 'num_installs_num'
plot_boxplot_horizontal_gray(only_vars, var, '10_week', 2)
week_pvalues = generate_pval_dataframe(
    list(groups_10_week.groups),  # iterating the groups mapping yields its keys
    only_vars,
    '10_week',
    var,
)
week_pvalues

In [None]:
# Keep a reference to the num_installs_num table under its own name
# (week_pvalues is reassigned for every variable; no copy is made), then
# pretty-print generate_dict's result for its 'size' column.
num_installs_num_10_week_df = week_pvalues
num_installs_num_10_week_dict = generate_dict(num_installs_num_10_week_df['size'].to_list())
pp.pprint(num_installs_num_10_week_dict)

## Grouped by fifteen weeks

### Categorical variables

In [None]:
# Start the 15-week analysis from a fresh, independent (deep) copy of the
# original frame so only_vars_original itself is left untouched.
only_vars = only_vars_original.copy(deep=True)

In [None]:
# Group the working frame by its '15_week' column.
# A scalar grouper is used instead of the one-element list ['15_week']: recent
# pandas releases treat a length-1 list grouper as producing 1-tuple keys,
# which would break get_group(key) and turn f'weight_{key}' into a
# "weight_(...,)" column name. With a scalar grouper the keys are plain
# scalars on every pandas version.
groups_15_week = only_vars.groupby('15_week')

# Accumulator handed to get_stats: one list per output column, filled in place
# and later converted into the numeric_data_15_week table.
data_15_week = {
    'variable': [],
    'group': [],
    'weighted_mean': [],
    'mean': [],
    'min': [],
    'q1': [],
    'median': [],
    'q3': [],
    'max': [],
    'std': []
}

# Per-group results for the categorical variables; index i of every list
# belongs to the group whose key is labels_15_week[i].
content_rating_dicts_15_week = []
android_version_dicts_15_week = []
has_whats_new_dicts_15_week = []
genre_dicts_15_week = []
labels_15_week = []

# Iterate over week groups to plot variables and get stats
for key in groups_15_week.groups:
    df = groups_15_week.get_group(key)
    # Generate a weight column for each group
    df = generate_weight_column(df, key)

    labels_15_week.append(key)
    ct_dict = generate_dict(df['content_rating'].to_list())
    content_rating_dicts_15_week.append(ct_dict)

    av_dict = generate_dict(df['android_version'].to_list())
    android_version_dicts_15_week.append(av_dict)

    hw_dict = generate_dict(df['has_whats_new'].to_list())
    has_whats_new_dicts_15_week.append(hw_dict)

    g_dict = generate_dict(df['genre'].to_list())
    genre_dicts_15_week.append(g_dict)

    # get_stats receives the accumulator, the group frame, the name of the
    # group's weight column and the group key for each numeric variable.
    for nv in numeric_variables:
        get_stats(nv, data_15_week, df, f'weight_{key}', key)

In [None]:
# Chart the content_rating dicts (one per 15-week group) with the shared
# horizontal-bar helper.
generate_horizontal_bar_chart(
    content_rating_dicts_15_week,
    labels_15_week,
    'Content Rating',
)

In [None]:
# One row per 15-week group: the group's content_rating dict plus a '15_week'
# column holding the group label.
# Each row is a fresh dict, so the entries of content_rating_dicts_15_week
# (also consumed by the chart cell) are not modified in place, and the frame
# is built in a single call because DataFrame.append was removed in pandas 2.0.
content_rating_15_week = pd.DataFrame(
    [
        {**d, '15_week': label}
        for label, d in zip(labels_15_week, content_rating_dicts_15_week)
    ]
)

content_rating_15_week

In [None]:
# Chart the android_version dicts (one per 15-week group) with the shared
# horizontal-bar helper.
generate_horizontal_bar_chart(
    android_version_dicts_15_week,
    labels_15_week,
    'Android Version',
)

In [None]:
# One row per 15-week group: the group's android_version dict plus a '15_week'
# column holding the group label.
# Each row is a fresh dict, so the entries of android_version_dicts_15_week
# (also consumed by the chart cell) are not modified in place, and the frame
# is built in a single call because DataFrame.append was removed in pandas 2.0.
android_version_15_week = pd.DataFrame(
    [
        {**d, '15_week': label}
        for label, d in zip(labels_15_week, android_version_dicts_15_week)
    ]
)

android_version_15_week

In [None]:
# Chart the genre dicts (one per 15-week group) with the shared
# horizontal-bar helper.
generate_horizontal_bar_chart(
    genre_dicts_15_week,
    labels_15_week,
    'Genre',
)

In [None]:
# One row per 15-week group: the group's genre dict plus a '15_week' column
# holding the group label.
# Each row is a fresh dict, so the entries of genre_dicts_15_week (also
# consumed by the chart cell) are not modified in place, and the frame is
# built in a single call because DataFrame.append was removed in pandas 2.0.
genre_15_week = pd.DataFrame(
    [
        {**d, '15_week': label}
        for label, d in zip(labels_15_week, genre_dicts_15_week)
    ]
)

genre_15_week

In [None]:
# Render the has_whats_new dicts (one per 15-week group) through
# generate_pie_chart_dropdown.
generate_pie_chart_dropdown(
    has_whats_new_dicts_15_week,
    labels_15_week,
    'Has Whats New',
)

In [None]:
# Table of has_whats_new counts per 15-week group plus the True/False shares
# in percent.
#  * Each row is a fresh dict, so the entries of has_whats_new_dicts_15_week
#    (also fed to the pie-chart cell) are no longer modified in place.
#  * .get(..., 0) tolerates a group in which only one of True/False occurs.
#  * The frame is built in one call; DataFrame.append no longer exists in
#    pandas >= 2.0.
# NOTE(review): the label column is named 'week' here but '15_week' in the
# sibling tables; kept unchanged in case later cells rely on it.
whats_new_rows_15_week = []
for label, d in zip(labels_15_week, has_whats_new_dicts_15_week):
    true_count = d.get(True, 0)
    false_count = d.get(False, 0)
    total = true_count + false_count
    whats_new_rows_15_week.append({
        **d,
        True: true_count,
        False: false_count,
        'week': label,
        # A group with neither value gets NaN shares rather than a
        # ZeroDivisionError.
        'percentage_true': (true_count / total) * 100 if total else float('nan'),
        'percentage_false': (false_count / total) * 100 if total else float('nan'),
    })

has_whats_new_15_week = pd.DataFrame(whats_new_rows_15_week)

has_whats_new_15_week

### Numerical Variables

In [None]:
# Turn the statistics accumulated in data_15_week into a table ordered by
# variable and, within each variable, by group.
numeric_data_15_week = (
    pd.DataFrame(data_15_week)
    .sort_values(by=['variable', 'group'])
)

numeric_data_15_week

In [None]:
# len_name by 15-week group: draw the horizontal box plot, then build and
# display the table returned by generate_pval_dataframe for the same groups.
var = 'len_name'
plot_boxplot_horizontal_gray(only_vars, var, '15_week', 2)
week_pvalues = generate_pval_dataframe(
    list(groups_15_week.groups),  # iterating the groups mapping yields its keys
    only_vars,
    '15_week',
    var,
)
week_pvalues

In [None]:
# Keep a reference to the len_name table under its own name (week_pvalues is
# reassigned for every variable; no copy is made), then pretty-print
# generate_dict's result for its 'size' column.
len_name_15_week_df = week_pvalues
len_name_15_week_dict = generate_dict(len_name_15_week_df['size'].to_list())
pp.pprint(len_name_15_week_dict)

In [None]:
# len_summary by 15-week group: draw the horizontal box plot, then build and
# display the table returned by generate_pval_dataframe for the same groups.
var = 'len_summary'
plot_boxplot_horizontal_gray(only_vars, var, '15_week', 2)
week_pvalues = generate_pval_dataframe(
    list(groups_15_week.groups),  # iterating the groups mapping yields its keys
    only_vars,
    '15_week',
    var,
)
week_pvalues

In [None]:
# Keep a reference to the len_summary table under its own name (week_pvalues
# is reassigned for every variable; no copy is made), then pretty-print
# generate_dict's result for its 'size' column.
len_summary_15_week_df = week_pvalues
len_summary_15_week_dict = generate_dict(len_summary_15_week_df['size'].to_list())
pp.pprint(len_summary_15_week_dict)

In [None]:
# len_description by 15-week group: draw the horizontal box plot, then build
# and display the table returned by generate_pval_dataframe for the same groups.
var = 'len_description'
plot_boxplot_horizontal_gray(only_vars, var, '15_week', 2)
week_pvalues = generate_pval_dataframe(
    list(groups_15_week.groups),  # iterating the groups mapping yields its keys
    only_vars,
    '15_week',
    var,
)
week_pvalues

In [None]:
# Keep a reference to the len_description table under its own name
# (week_pvalues is reassigned for every variable; no copy is made), then
# pretty-print generate_dict's result for its 'size' column.
len_description_15_week_df = week_pvalues
len_description_15_week_dict = generate_dict(len_description_15_week_df['size'].to_list())
pp.pprint(len_description_15_week_dict)

In [None]:
# rating by 15-week group: draw the horizontal box plot, then build and
# display the table returned by generate_pval_dataframe for the same groups.
var = 'rating'
plot_boxplot_horizontal_gray(only_vars, var, '15_week', 2)
week_pvalues = generate_pval_dataframe(
    list(groups_15_week.groups),  # iterating the groups mapping yields its keys
    only_vars,
    '15_week',
    var,
)
week_pvalues

In [None]:
# Keep a reference to the rating table under its own name (week_pvalues is
# reassigned for every variable; no copy is made), then pretty-print
# generate_dict's result for its 'size' column.
rating_15_week_df = week_pvalues
rating_15_week_dict = generate_dict(rating_15_week_df['size'].to_list())
pp.pprint(rating_15_week_dict)

In [None]:
# rating_1 by 15-week group: draw the horizontal box plot, then build and
# display the table returned by generate_pval_dataframe for the same groups.
var = 'rating_1'
plot_boxplot_horizontal_gray(only_vars, var, '15_week', 2)
week_pvalues = generate_pval_dataframe(
    list(groups_15_week.groups),  # iterating the groups mapping yields its keys
    only_vars,
    '15_week',
    var,
)
week_pvalues

In [None]:
# Keep a reference to the rating_1 table under its own name (week_pvalues is
# reassigned for every variable; no copy is made), then pretty-print
# generate_dict's result for its 'size' column.
rating_1_15_week_df = week_pvalues
rating_1_15_week_dict = generate_dict(rating_1_15_week_df['size'].to_list())
pp.pprint(rating_1_15_week_dict)

In [None]:
# rating_2 by 15-week group: draw the horizontal box plot, then build and
# display the table returned by generate_pval_dataframe for the same groups.
var = 'rating_2'
plot_boxplot_horizontal_gray(only_vars, var, '15_week', 2)
week_pvalues = generate_pval_dataframe(
    list(groups_15_week.groups),  # iterating the groups mapping yields its keys
    only_vars,
    '15_week',
    var,
)
week_pvalues

In [None]:
# Keep a reference to the rating_2 table under its own name (week_pvalues is
# reassigned for every variable; no copy is made), then pretty-print
# generate_dict's result for its 'size' column.
rating_2_15_week_df = week_pvalues
rating_2_15_week_dict = generate_dict(rating_2_15_week_df['size'].to_list())
pp.pprint(rating_2_15_week_dict)

In [None]:
# rating_3 by 15-week group: draw the horizontal box plot, then build and
# display the table returned by generate_pval_dataframe for the same groups.
var = 'rating_3'
plot_boxplot_horizontal_gray(only_vars, var, '15_week', 2)
week_pvalues = generate_pval_dataframe(
    list(groups_15_week.groups),  # iterating the groups mapping yields its keys
    only_vars,
    '15_week',
    var,
)
week_pvalues

In [None]:
# Keep a reference to the rating_3 table under its own name (week_pvalues is
# reassigned for every variable; no copy is made), then pretty-print
# generate_dict's result for its 'size' column.
rating_3_15_week_df = week_pvalues
rating_3_15_week_dict = generate_dict(rating_3_15_week_df['size'].to_list())
pp.pprint(rating_3_15_week_dict)

In [None]:
# rating_4 by 15-week group: draw the horizontal box plot, then build and
# display the table returned by generate_pval_dataframe for the same groups.
var = 'rating_4'
plot_boxplot_horizontal_gray(only_vars, var, '15_week', 2)
week_pvalues = generate_pval_dataframe(
    list(groups_15_week.groups),  # iterating the groups mapping yields its keys
    only_vars,
    '15_week',
    var,
)
week_pvalues

In [None]:
# Keep a reference to the rating_4 table under its own name (week_pvalues is
# reassigned for every variable; no copy is made), then pretty-print
# generate_dict's result for its 'size' column.
rating_4_15_week_df = week_pvalues
rating_4_15_week_dict = generate_dict(rating_4_15_week_df['size'].to_list())
pp.pprint(rating_4_15_week_dict)

In [None]:
# rating_5 by 15-week group: draw the horizontal box plot, then build and
# display the table returned by generate_pval_dataframe for the same groups.
var = 'rating_5'
plot_boxplot_horizontal_gray(only_vars, var, '15_week', 2)
week_pvalues = generate_pval_dataframe(
    list(groups_15_week.groups),  # iterating the groups mapping yields its keys
    only_vars,
    '15_week',
    var,
)
week_pvalues

In [None]:
# Keep a reference to the rating_5 table under its own name (week_pvalues is
# reassigned for every variable; no copy is made), then pretty-print
# generate_dict's result for its 'size' column.
rating_5_15_week_df = week_pvalues
rating_5_15_week_dict = generate_dict(rating_5_15_week_df['size'].to_list())
pp.pprint(rating_5_15_week_dict)

In [None]:
# last_update_days by 15-week group: draw the horizontal box plot, then build
# and display the table returned by generate_pval_dataframe for the same groups.
var = 'last_update_days'
plot_boxplot_horizontal_gray(only_vars, var, '15_week', 2)
week_pvalues = generate_pval_dataframe(
    list(groups_15_week.groups),  # iterating the groups mapping yields its keys
    only_vars,
    '15_week',
    var,
)
week_pvalues

In [None]:
# Keep a reference to the last_update_days table under its own name
# (week_pvalues is reassigned for every variable; no copy is made), then
# pretty-print generate_dict's result for its 'size' column.
last_update_days_15_week_df = week_pvalues
last_update_days_15_week_dict = generate_dict(last_update_days_15_week_df['size'].to_list())
pp.pprint(last_update_days_15_week_dict)

In [None]:
# price_usd by 15-week group: draw the horizontal box plot, then build and
# display the table returned by generate_pval_dataframe for the same groups.
var = 'price_usd'
plot_boxplot_horizontal_gray(only_vars, var, '15_week', 2)
week_pvalues = generate_pval_dataframe(
    list(groups_15_week.groups),  # iterating the groups mapping yields its keys
    only_vars,
    '15_week',
    var,
)
week_pvalues

In [None]:
# Keep a reference to the price_usd table under its own name (week_pvalues is
# reassigned for every variable; no copy is made), then pretty-print
# generate_dict's result for its 'size' column.
price_usd_15_week_df = week_pvalues
price_usd_15_week_dict = generate_dict(price_usd_15_week_df['size'].to_list())
pp.pprint(price_usd_15_week_dict)

In [None]:
# num_installs_num by 15-week group: draw the horizontal box plot, then build
# and display the table returned by generate_pval_dataframe for the same groups.
var = 'num_installs_num'
plot_boxplot_horizontal_gray(only_vars, var, '15_week', 2)
week_pvalues = generate_pval_dataframe(
    list(groups_15_week.groups),  # iterating the groups mapping yields its keys
    only_vars,
    '15_week',
    var,
)
week_pvalues

In [None]:
# Keep a reference to the num_installs_num table under its own name
# (week_pvalues is reassigned for every variable; no copy is made), then
# pretty-print generate_dict's result for its 'size' column.
num_installs_num_15_week_df = week_pvalues
num_installs_num_15_week_dict = generate_dict(num_installs_num_15_week_df['size'].to_list())
pp.pprint(num_installs_num_15_week_dict)