# General EDA 

In [None]:
# Imports
%matplotlib inline
from ast import literal_eval
from collections import OrderedDict
from scipy.stats import zscore
from typing import List
import csv
import datetime
import math
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objs as go
import numpy as np
import pandas as pd
import pymongo
import robustats
import seaborn as sns
import wquantiles as wq
import pprint
from scipy.stats import chi2, mstats, mannwhitneyu, kruskal, chi2_contingency
from statsmodels.stats.multitest import multipletests
from cliffsDelta import cliffsDelta
from matplotlib.patches import PathPatch
pp = pprint.PrettyPrinter(indent=4)

# Jupyter configurations for displaying more columns and rows.
# A value of None removes the corresponding display limit.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # Older pandas releases only accept integers for this option and use -1
    # for "no limit"; newer releases deprecated and then rejected negatives.
    pd.set_option('display.max_colwidth', -1)

In [None]:
# Set palette
seq_col_brew = sns.color_palette("Greys_r", 5)
sns.set_palette(seq_col_brew)
seq_col_brew

In [None]:
# Data gathering: read the CSV file that corresponds to the apps collection
all_apps = pd.read_csv(
   'all_apps.csv', delimiter='|', encoding='utf-8', engine= 'python',converters={'_id':str})

In [None]:
# Remove unnecessary columns: the MongoDB `_id` and the spurious index column
index_mongo = all_apps['_id']
del all_apps['_id']
del all_apps['Unnamed: 0']

In [None]:
all_apps.head(3)

In [None]:
# Original types
all_apps.dtypes

In [None]:
# Parse the date-like columns into pandas datetime values
vars_dates = ['retrieved_date_start','retrieved_date_end', 'last_update', 'last_update_fixed']
all_apps[vars_dates] = all_apps[vars_dates].apply(pd.to_datetime)

In [None]:
# Transformed types
all_apps.dtypes

# General description of numeric and categorical variables

In [None]:
# Describe length of rows and columns
all_apps.shape

In [None]:
# Describe numerical variables for all the dataset
all_apps.describe()

In [None]:
# Describe categorical variables
all_apps.describe(include=['object','datetime', 'bool'])

In [None]:
# NaN / missing values for the whole dataset: absolute count and percentage
# of missing entries per column, most affected columns first.
missing_data = all_apps.isnull()
missing_per_column = missing_data.sum()
total = missing_per_column.sort_values(ascending=False)
# missing_data holds booleans only, so count() is the number of rows
percent = (missing_per_column / missing_data.count()).sort_values(ascending=False)
summary_missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Missing Percent'])
summary_missing_data['Missing Percent'] = summary_missing_data['Missing Percent'] * 100
summary_missing_data

## Different types of values per categorical variables

In [None]:
def get_number_values_categorical_values(df, column_name:str, high:List[str],low:List[str]):
    """Print the cardinality of a column and classify it as high or low.

    The column name is appended to ``low`` when it has fewer than 100 distinct
    values and its name does not contain "retrieved" (its distinct values are
    printed as well); otherwise it is appended to ``high``.

    Args:
        df: DataFrame that contains ``column_name``.
        column_name: Name of the column to inspect.
        high: List collecting the names of high-cardinality columns (mutated).
        low: List collecting the names of low-cardinality columns (mutated).
    """
    print("-/-"*5)
    print(f"Column: {column_name}")

    column = df[column_name]
    try:
        number_values = column.nunique()
    except TypeError:
        # Unhashable cell values (e.g. genre stored as a list) cannot be
        # counted directly, so their string representation is compared instead.
        column = column.astype(str)
        number_values = column.nunique()

    print(f"Number of different values {number_values}")

    if number_values < 100 and "retrieved" not in column_name:
        print("\n")
        print("Values:")
        print(f"{column.unique()}")
        print("\n")

        low.append(column_name)
    else:
        high.append(column_name)
    print("-/-"*5)

# Classify every categorical (object / datetime / bool) column by cardinality
columns_categorical = all_apps.select_dtypes(include=['object','datetime', 'bool']).columns
high_cardinality, low_cardinality = [], []

for column in columns_categorical:
    get_number_values_categorical_values(
        all_apps, column, high=high_cardinality, low=low_cardinality)

print(len(high_cardinality), len(low_cardinality))

# Subset of selected variables

In [None]:
# Length of name, summary and description
all_apps['len_name'] = all_apps['name'].str.len()
all_apps['len_summary'] = all_apps['summary'].str.len()
all_apps['len_description'] = all_apps['description'].str.len()

# Transform num installs from string to number by stripping ',' and '+'.
# .copy() gives an independent frame, so the assignments below do not
# trigger SettingWithCopyWarning.
fixed_num = all_apps[['fixed_num_installs']].copy()
# regex=False: '+' is a regex quantifier, and the default of `regex`
# differs between pandas versions, so treat both patterns as literals.
fixed_num['fixed_num_installs_num'] = (
    fixed_num['fixed_num_installs']
    .str.replace(',', '', regex=False)
    .str.replace('+', '', regex=False)
)
fixed_num['fixed_num_installs_num'] = pd.to_numeric(fixed_num['fixed_num_installs_num'])

all_apps['fixed_num_installs_num'] = fixed_num['fixed_num_installs_num']

# .copy(): only_vars is renamed and extended with new columns further down,
# which must not be done on a slice of all_apps.
only_vars = all_apps[['id', 'len_name', 'len_summary', 'len_description', 
                      'last_update_fixed', 'content_rating', 'rating', 'rating_1','rating_2', 
                      'rating_3', 'rating_4', 'rating_5', 'fixed_num_installs', 
                      'fixed_num_installs_num', 'has_whats_new', 'last_update_days_fixed', 
                      'macro_android_version', 'num_week', 'price', 'price_usd', 'unified_genre', 
                      'clean_category', 'top', 'country']].copy()

only_vars.head(5)

## Subset description of numeric and categorical variables

In [None]:
# Describe length of rows and columns
only_vars.shape

In [None]:
# Describe numerical variables
only_vars.describe()

In [None]:
# Describe categorical variables
only_vars.describe(include=['object','datetime', 'bool'])

In [None]:
# NaN / missing values for the subset: absolute count and percentage of
# missing entries per column, most affected columns first.
missing_data = only_vars.isnull()
missing_per_column = missing_data.sum()
total = missing_per_column.sort_values(ascending=False)
# missing_data holds booleans only, so count() is the number of rows
percent = (missing_per_column / missing_data.count()).sort_values(ascending=False)
summary_missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Missing Percent'])
summary_missing_data['Missing Percent'] = summary_missing_data['Missing Percent'] * 100
summary_missing_data

### Functions

#### For adding weighted column

In [None]:
def generate_weight_dict(df: pd.DataFrame) -> dict:
    """Count how many rows of ``df`` share each value of its ``id`` column.

    Returns:
        A dict mapping every id to its number of occurrences, with keys in
        order of first appearance.
    """
    occurrences = {}
    for app_id in df['id'].to_list():
        occurrences[app_id] = occurrences.get(app_id, 0) + 1
    return occurrences

# Add weight column to compute weighted mean
def generate_weight_column(df: pd.DataFrame, parameter: str) -> pd.DataFrame:
    """Add per-row weight and occurrence-count columns derived from ``id``.

    Two columns are written into ``df`` in place:
    ``weight_<parameter>`` (1 / number of rows sharing the row's id) and
    ``amount_apps_<parameter>`` (number of rows sharing the row's id).

    Returns:
        The same ``df`` object, now carrying the two extra columns.
    """
    occurrences = generate_weight_dict(df)
    ids = df['id']

    df[f'weight_{parameter}'] = ids.apply(lambda app_id: 1/occurrences[app_id])
    df[f'amount_apps_{parameter}'] = ids.apply(lambda app_id: occurrences[app_id])

    return df

#### Plotting categorical variables

In [None]:
# Generate dict for categorical values
def generate_dict(values_list: list) -> dict:
    """Return a frequency table for ``values_list``.

    Returns:
        A dict mapping each distinct value to the number of times it occurs,
        with keys in order of first appearance.
    """
    frequencies = {}
    for item in values_list:
        frequencies[item] = frequencies.get(item, 0) + 1

    return frequencies

def generate_graph(values_dict: dict, feature: str):
    """Draw a horizontal bar chart of ``values_dict``, largest count on top.

    Args:
        values_dict: Mapping of category -> count.
        feature: Feature name; its title-cased form labels the x axis and
            the chart title.
    """
    # Reset matplotlib to its default style before drawing
    plt.rcdefaults()
    _, axis = plt.subplots()

    ranked = OrderedDict(sorted(values_dict.items(), key=lambda item: item[1], reverse=True))
    categories = ranked.keys()
    counts = ranked.values()
    positions = np.arange(len(categories))

    axis.barh(positions, counts, align='center', color=(0.2, 0.2, 0.2, 0.5))
    axis.set_yticks(positions)
    axis.set_yticklabels(categories)
    # Inverting the y axis puts the first (largest) bar at the top
    axis.invert_yaxis()

    pretty_name = feature.title()
    axis.set_xlabel(pretty_name)
    axis.set_title(f'{pretty_name} Distribution')
    
def generate_pie_chart(variable_dict: dict, feature: str):
    """Show a two-tone gray donut chart of ``variable_dict``.

    Args:
        variable_dict: Mapping of category -> count; keys are the wedge
            labels and values the wedge sizes.
        feature: Feature name; its title-cased form is used in the title.
    """
    _, axis = plt.subplots()
    _, _, percent_texts = axis.pie(
        variable_dict.values(),
        labels=variable_dict.keys(),
        autopct='%1.1f%%',
        shadow=True,
        startangle=90,
        colors=['#bebebe','#606060'],
    )
    for percent_text in percent_texts:
        percent_text.set_color('black')

    # A white disc over the centre turns the pie into a donut
    hole = plt.Circle((0,0),0.70,fc='white')
    plt.gcf().gca().add_artist(hole)

    plt.tight_layout()
    axis.axis('equal')
    axis.set_title(f'{feature.title()} Distribution')
    plt.show()
    
def generate_horizontal_bar_chart(list_dicts: List[dict], list_labels: list, variable: str):
    """Plot one horizontal bar trace per group with a dropdown group selector.

    The dropdown offers an 'All' entry plus one entry per label that shows
    only that group's trace. The figure is written to
    ``<variable>-rq1.html`` and then displayed.

    Args:
        list_dicts: One category -> count mapping per group.
        list_labels: Group labels, parallel to ``list_dicts``.
        variable: Chart title, also used as the output file name prefix.
    """
    n_groups = len(list_labels)
    fig = go.Figure()

    # First dropdown entry: every trace visible
    buttons = [dict(
        args=[{"visible": [True]*n_groups}],
        label='All',
        method='update'
    )]

    for position, (counts, label) in enumerate(zip(list_dicts, list_labels)):
        ranked = OrderedDict(sorted(counts.items(), key=lambda item: item[1], reverse=True))

        fig.add_trace(
            go.Bar(
                x=list(ranked.values()),
                y=list(ranked.keys()),
                name=str(label),
                orientation='h'
            )
        )

        # Visibility mask that keeps only the current trace on screen
        only_current = [i == position for i in range(n_groups)]
        buttons.append(dict(
            args=[{"visible": only_current}],
            label=str(label),
            method='update'
        ))

    # Add dropdown
    dropdown = dict(
        buttons=list(buttons),
        direction="down",
        pad={"r": 10, "t": 10},
        showactive=True,
        x=0.1,
        xanchor="left",
        y=1.1,
        yanchor="top"
    )
    fig.update_layout(title=variable, updatemenus=[dropdown])

    plotly.offline.plot(fig, filename= f'{variable}-rq1.html', auto_open=False)
    fig.show()
    
def generate_pie_chart_dropdown(list_dicts: List[dict], list_labels: list, variable: str):
    """Plot one pie trace per group with a dropdown to pick the visible group.

    The figure is written to ``<variable>-rq1.html`` and then displayed.

    Args:
        list_dicts: One category -> count mapping per group.
        list_labels: Group labels, parallel to ``list_dicts``.
        variable: Chart title, also used as the output file name prefix.
    """
    n_groups = len(list_labels)
    fig = go.Figure()
    buttons = []

    for position, (counts, label) in enumerate(zip(list_dicts, list_labels)):
        ranked = OrderedDict(sorted(counts.items(), key=lambda item: item[1], reverse=True))

        fig.add_trace(
            go.Pie(
                values=list(ranked.values()),
                labels=list(ranked.keys()),
                name=label
            )
        )

        # Visibility mask that keeps only the current trace on screen
        only_current = [i == position for i in range(n_groups)]
        buttons.append(dict(
            args=[{"visible": only_current}],
            label=label,
            method='update'
        ))

    # Add dropdown
    dropdown = dict(
        buttons=list(buttons),
        direction="down",
        pad={"r": 10, "t": 10},
        showactive=True,
        x=0.1,
        xanchor="left",
        y=1.1,
        yanchor="top"
    )
    fig.update_layout(title=variable, updatemenus=[dropdown])

    plotly.offline.plot(fig, filename= f'{variable}-rq1.html', auto_open=False)
    fig.show()

#### To get numerical variables stats

In [None]:
def get_stats(variable: str, data: dict, df_big: pd.DataFrame, weight_name: str, group: str):
    """Append summary statistics of one numeric column to ``data``.

    Rows where either ``variable`` or ``weight_name`` is missing are dropped
    before computing anything.

    Args:
        variable: Name of the numeric column to summarise.
        data: Dict of lists with the keys 'variable', 'group',
            'weighted_mean', 'mean', 'min', 'q1', 'median', 'q3' and 'max';
            one value is appended to each list (mutated in place).
        df_big: DataFrame holding ``variable`` and ``weight_name``.
        weight_name: Name of the column holding the row weights.
        group: Label stored in ``data['group']`` for this row of statistics.
    """
    df_variable = df_big[[variable, weight_name]].dropna()
    values = df_variable[variable]
    weights = df_variable[weight_name]

    # A weighted mean is undefined without positive total weight
    weight_total = weights.sum()
    w_mean = (values*weights).sum()/weight_total if weight_total > 0 else None

    data['variable'].append(variable)
    data['group'].append(group)
    data['weighted_mean'].append(w_mean)
    data['mean'].append(values.mean())
    data['min'].append(values.min())
    data['q1'].append(values.quantile(q=0.25))
    data['median'].append(values.median())
    data['q3'].append(values.quantile(q=0.75))
    data['max'].append(values.max())

#### To plot numerical variables

In [None]:
def plot_violin(df_big: pd.DataFrame, variable: str, x_axis: str, score: float, hue_var=''):
    """Draw a violin plot of ``variable`` per ``x_axis`` category.

    Rows with a missing ``variable`` are dropped, and rows whose absolute
    z-score of ``variable`` is not below ``score`` are excluded as outliers.

    Args:
        df_big: Source DataFrame.
        variable: Numeric column plotted on the y axis.
        x_axis: Categorical column plotted on the x axis.
        score: Absolute z-score threshold; only rows below it are plotted.
        hue_var: Optional column used for hue; '' disables hue.
    """
    use_hue = hue_var != ''

    # The hue column has to be part of the plotted frame, otherwise seaborn
    # cannot resolve hue=hue_var against data=df.
    columns = [variable, x_axis, hue_var] if use_hue else [variable, x_axis]
    df = df_big[columns]
    df = df.dropna(subset=[variable])
    z_scores = np.abs(zscore(df[variable]))
    filtered_entries = np.where(z_scores < score)
    df = df.iloc[filtered_entries]
    plt.figure()
    if use_hue:
        sns.violinplot(x=x_axis, y=variable, hue=hue_var, data=df)
    else:
        sns.violinplot(x=x_axis, y=variable, data=df)
        
def plot_boxplot(df_big: pd.DataFrame, variable: str, x_axis: str, score: float, hue_var=''):
    """Draw a vertical boxplot of ``variable`` per ``x_axis`` category.

    Rows with a missing ``variable`` are dropped, and only rows whose
    absolute z-score of ``variable`` is below ``score`` are plotted.
    ``hue_var`` optionally splits each category by a further column
    ('' disables hue).
    """
    use_hue = hue_var != ''
    wanted = [variable, x_axis, hue_var] if use_hue else [variable, x_axis]

    df = df_big[wanted].dropna(subset=[variable])
    within_threshold = np.abs(zscore(df[variable])) < score
    df = df.iloc[np.where(within_threshold)]
    plt.figure()

    if use_hue:
        sns.boxplot(x=x_axis, y=variable, hue=hue_var, data=df, orient="v")
    else:
        sns.boxplot(x=x_axis, y=variable, data=df, orient="v")
        
def plot_boxplot_horizontal(df_big: pd.DataFrame, variable: str, y_axis: str, score: float, hue_var=''):
    """Draw a horizontal boxplot of ``variable`` per ``y_axis`` category.

    Rows with a missing ``variable`` are dropped, and only rows whose
    absolute z-score of ``variable`` is below ``score`` are plotted.

    Args:
        df_big: Source DataFrame.
        variable: Numeric column plotted on the x axis.
        y_axis: Categorical column plotted on the y axis.
        score: Absolute z-score threshold; only rows below it are plotted.
        hue_var: Optional column used for hue; '' disables hue.
    """
    use_hue = hue_var != ''

    # Select the grouping column too: seaborn looks up y=y_axis in data=df.
    columns = [variable, y_axis, hue_var] if use_hue else [variable, y_axis]
    df = df_big[columns]
    df = df.dropna(subset=[variable])
    z_scores = np.abs(zscore(df[variable]))
    filtered_entries = np.where(z_scores < score)
    df = df.iloc[filtered_entries]
    plt.figure()

    if use_hue:
        sns.boxplot(x=variable, y=y_axis, hue=hue_var, data=df, orient="h")
    else:
        sns.boxplot(x=variable, y=y_axis, data=df, orient="h")
        
def plot_boxplot_single(df_big: pd.DataFrame, variable: str, score: float):
    """Draw a single boxplot of ``variable``.

    Rows with a missing ``variable`` are dropped, and only rows whose
    absolute z-score of ``variable`` is below ``score`` are plotted.
    """
    df = df_big[[variable]].dropna(subset=[variable])
    within_threshold = np.where(np.abs(zscore(df[variable])) < score)
    df = df.iloc[within_threshold]

    plt.figure()
    sns.boxplot(y=variable, data=df) 

#### To plot numerical variables in gray palette

In [None]:
def plot_boxplot_gray(df_big: pd.DataFrame, variable: str, x_axis: str, score: float, hue_var=''):
    """Draw a vertical gray-palette boxplot of ``variable`` per ``x_axis``.

    Rows with a missing ``variable`` are dropped, and only rows whose
    absolute z-score of ``variable`` is below ``score`` are plotted.
    ``hue_var`` optionally splits each category by a further column
    ('' disables hue). Uses the notebook-level ``seq_col_brew`` palette
    without its first colour.
    """
    use_hue = hue_var != ''
    wanted = [variable, x_axis, hue_var] if use_hue else [variable, x_axis]

    df = df_big[wanted].dropna(subset=[variable])
    within_threshold = np.abs(zscore(df[variable])) < score
    df = df.iloc[np.where(within_threshold)]

    plt.figure(figsize=(16, 10))
    sns.set_style("whitegrid")

    grays = seq_col_brew[1:]
    if use_hue:
        sns.boxplot(x=x_axis, y=variable, hue=hue_var, data=df, orient="v", palette=grays)
    else:
        sns.boxplot(x=x_axis, y=variable, data=df, orient="v", palette=grays)
        

def adjust_box_widths(g, fac):
    """
    Adjust the widths of the boxes of a seaborn-generated horizontal boxplot.

    Every PathPatch found in the axes of ``g`` is shrunk or grown around its
    centre along the y direction (the thickness of a horizontal box), and the
    median line spanning the box is resized to match.

    Args:
        g: Object exposing an ``axes`` iterable (e.g. a matplotlib Figure).
        fac: Scale factor applied to the current box thickness.
    """

    # iterating through Axes instances
    for ax in g.axes:

        # iterating through axes artists:
        for c in ax.get_children():

            # only PathPatches (the boxes) are resized
            if not isinstance(c, PathPatch):
                continue

            # current extent of the box along y; the slice is a view, so
            # writing to it edits the patch's path in place
            verts_sub = c.get_path().vertices[:-1]
            ymin = np.min(verts_sub[:, 1])
            ymax = np.max(verts_sub[:, 1])
            ymid = 0.5*(ymin+ymax)
            yhalf = 0.5*(ymax - ymin)

            # setting new thickness of box
            ymin_new = ymid-fac*yhalf
            ymax_new = ymid+fac*yhalf
            verts_sub[verts_sub[:, 1] == ymin, 1] = ymin_new
            verts_sub[verts_sub[:, 1] == ymax, 1] = ymax_new

            # setting new length of the median line: it spans the box along
            # the same (y) direction, so it must be matched on its y data.
            # array_equal also copes with lines of a different length.
            for line in ax.lines:
                if np.array_equal(line.get_ydata(), [ymin, ymax]):
                    line.set_ydata([ymin_new, ymax_new])
        
        
def plot_boxplot_horizontal_gray(df_big: pd.DataFrame, variable: str, x_axis: str, score: float, hue_var=''):
    """Draw a horizontal gray-palette boxplot of ``variable`` per ``x_axis``.

    Despite its name, ``x_axis`` is the categorical column shown on the y
    axis. Rows with a missing ``variable`` are dropped, and only rows whose
    absolute z-score of ``variable`` is below ``score`` are plotted. With a
    ``hue_var`` the boxes are additionally narrowed via ``adjust_box_widths``.
    """
    use_hue = hue_var != ''
    wanted = [variable, x_axis, hue_var] if use_hue else [variable, x_axis]

    df = df_big[wanted].dropna(subset=[variable])
    within_threshold = np.abs(zscore(df[variable])) < score
    df = df.iloc[np.where(within_threshold)]

    fig = plt.figure(figsize=(16, 10))
    sns.set_style("whitegrid")

    grays = seq_col_brew[1:]
    if use_hue:
        sns.boxplot(x=variable, y=x_axis, hue=hue_var, data=df, orient="h", palette=grays, width=0.4)
        adjust_box_widths(fig, 0.8)
    else:
        sns.boxplot(x=variable, y=x_axis, data=df, orient="h", palette=grays, width=0.2)

    # Font sizes are written to the global rcParams after the plot is drawn
    size=15
    plt.rcParams.update({
        'legend.fontsize': 'large',
        'figure.figsize': (20,8),
        'axes.labelsize': size,
        'axes.titlesize': size,
        'xtick.labelsize': size*0.95,
        'ytick.labelsize': size*0.95,
        'axes.titlepad': 25,
    })
    
def plot_boxplot_horizontal_gray_top(df_big: pd.DataFrame, variable: str, x_axis: str, score: float, hue_var=''):
    """Draw the horizontal gray boxplot used for the per-"top" comparison.

    Despite its name, ``x_axis`` is the categorical column shown on the y
    axis; its values 'topSelling', 'topFree' and 'editorChoice' are replaced
    by readable labels. Rows with a missing ``variable`` are dropped, and
    only rows whose absolute z-score of ``variable`` is below ``score`` are
    plotted.

    The axis labels are hard-coded: 'Days since last update' when
    ``hue_var`` is given, 'Rating' otherwise.

    Args:
        df_big: Source DataFrame.
        variable: Numeric column plotted on the x axis.
        x_axis: Categorical column plotted on the y axis.
        score: Absolute z-score threshold; only rows below it are plotted.
        hue_var: Optional string column used for hue ('' disables hue); its
            values are upper-cased and the legend title is title-cased.
    """
    use_hue = hue_var != ''

    # .copy(): columns are reassigned below, which must not be done on a
    # slice of df_big (pandas raises SettingWithCopyWarning otherwise).
    if use_hue:
        df = df_big[[variable, x_axis, hue_var]].copy()
        df[hue_var]=df[hue_var].str.upper()
        df.columns = [variable, x_axis, hue_var.title()]
    else:
        df = df_big[[variable, x_axis]].copy()
    df = df.dropna(subset=[variable])
    z_scores = np.abs(zscore(df[variable]))
    filtered_entries = np.where(z_scores < score)
    df = df.iloc[filtered_entries].copy()

    # Human-readable category names; the patterns are plain text, not regexes
    readable_names = (
        ('topSelling','Top selling'),
        ('topFree','Top free'),
        ('editorChoice','Editor choice'),
    )
    for raw_name, readable in readable_names:
        df[x_axis]=df[x_axis].str.replace(raw_name, readable, regex=False)

    fig = plt.figure(figsize=(16, 10))
    sns.set_style("whitegrid")

    if use_hue:
        ax = sns.boxplot(x=variable, y=x_axis, hue=hue_var.title(), data=df, orient="h", palette=seq_col_brew[1:], width=0.4)
        adjust_box_widths(fig, 0.8)
        ax.set(xlabel='Days since last update', ylabel='')
        plt.setp(ax.get_legend().get_texts(), fontsize='20')
        plt.setp(ax.get_legend().get_title(), fontsize='20')
    else:
        ax = sns.boxplot(x=variable, y=x_axis, data=df, orient="h", palette=seq_col_brew[1:], width=0.2)
        ax.set(xlabel='Rating', ylabel='')

    # Font sizes are written to the global rcParams after the plot is drawn
    size=20
    params = {'legend.fontsize': 'large',
          'figure.figsize': (20,8),
          'axes.labelsize': size,
          'axes.titlesize': size,
          'xtick.labelsize': size*0.95,
          'ytick.labelsize': size*0.95,
          'axes.titlepad': 25}
    plt.rcParams.update(params)
        
def plot_boxplot_single_gray(df_big: pd.DataFrame, variable: str, score: float):
    """Draw a single boxplot of ``variable`` with white-filled boxes.

    Rows with a missing ``variable`` are dropped, and only rows whose
    absolute z-score of ``variable`` is below ``score`` are plotted.
    """
    df = df_big[[variable]].dropna(subset=[variable])
    within_threshold = np.where(np.abs(zscore(df[variable])) < score)
    df = df.iloc[within_threshold]

    plt.figure(figsize=(16, 10))
    sns.set_style("whitegrid")

    axis = sns.boxplot(y=variable, data=df, palette=seq_col_brew[1:])
    # NOTE(review): this assumes the boxes are listed in ``ax.artists``;
    # confirm against the installed matplotlib/seaborn versions.
    for box in axis.artists:
        box.set_facecolor("white")

#### To generate possible pairs

In [None]:
def generate_pairs(source: list) -> list:
    """Return every unordered pair of elements of ``source``.

    Pairs are produced in positional order, each as a two-element list
    ``[earlier, later]``; an element is never paired with itself.

    Returns:
        A list of ``[a, b]`` lists; empty when ``source`` has fewer than two
        elements.
    """
    from itertools import combinations

    return [list(pair) for pair in combinations(source, 2)]

#### To generate pvalues and cliffs delta

In [None]:
def generate_pval_dataframe(list_keys: list, df_big: pd.DataFrame, value: str, num_variable: str) -> pd.DataFrame:
    """Compare ``num_variable`` between every pair of groups of ``value``.

    For each pair of keys a two-sided Mann-Whitney U test and Cliff's delta
    are computed on the ``num_variable`` values of the two groups; the
    p-values are then corrected with Holm's method (alpha = 0.05).

    Args:
        list_keys: Group keys, i.e. values of the ``value`` column.
        df_big: DataFrame holding ``value`` and ``num_variable``.
        value: Name of the grouping column.
        num_variable: Name of the numeric column being compared.

    Returns:
        One row per pair with the columns 'c1', 'c2', 'pvalue', 'd', 'size',
        'reject' and 'pvalue_corrected'.
    """
    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []

    for c1, c2 in generate_pairs(list_keys):
        # NOTE(review): missing values are passed to both tests unchanged;
        # confirm that is intended for columns containing NaN.
        x = df_big.loc[df_big[value] == c1, num_variable].to_list()
        y = df_big.loc[df_big[value] == c2, num_variable].to_list()

        _, pval = mannwhitneyu(x, y, alternative='two-sided')
        d, size = cliffsDelta(x, y)

        rows.append(
            {'c1': c1,
             'c2': c2,
             'pvalue': pval,
             'd': d,
             'size': size})

    df = pd.DataFrame(rows, columns=['c1', 'c2', 'pvalue', 'd', 'size'])

    reject, pval_corrected, _, _ = multipletests(df['pvalue'].to_list(), alpha=0.05, method='holm', is_sorted=False, returnsorted=False)
    df['reject'] = reject
    df['pvalue_corrected'] = pval_corrected

    return df

def generate_pval_dataframe_two(list_keys: list, df_big: pd.DataFrame, value_1: str, value_2:str, num_variable: str) -> pd.DataFrame:
    """Compare ``num_variable`` between every pair of two-column groups.

    Each key is a pair ``(v1, v2)`` selecting the rows where
    ``value_1 == v1`` and ``value_2 == v2``. For each pair of keys a
    two-sided Mann-Whitney U test and Cliff's delta are computed; the
    p-values are then corrected with Holm's method (alpha = 0.05).

    Args:
        list_keys: Group keys; each is indexed as key[0] and key[1].
        df_big: DataFrame holding the grouping and numeric columns.
        value_1: Name of the first grouping column.
        value_2: Name of the second grouping column.
        num_variable: Name of the numeric column being compared.

    Returns:
        One row per pair with the columns 'c1', 'c2', 'pvalue', 'd', 'size',
        'reject' and 'pvalue_corrected'.
    """
    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []

    for c1, c2 in generate_pairs(list_keys):
        in_first = (df_big[value_1] == c1[0]) & (df_big[value_2] == c1[1])
        in_second = (df_big[value_1] == c2[0]) & (df_big[value_2] == c2[1])

        # NOTE(review): missing values are passed to both tests unchanged;
        # confirm that is intended for columns containing NaN.
        x = df_big.loc[in_first, num_variable].to_list()
        y = df_big.loc[in_second, num_variable].to_list()

        _, pval = mannwhitneyu(x, y, alternative='two-sided')
        d, size = cliffsDelta(x, y)

        rows.append(
            {'c1': c1,
             'c2': c2,
             'pvalue': pval,
             'd': d,
             'size': size})

    df = pd.DataFrame(rows, columns=['c1', 'c2', 'pvalue', 'd', 'size'])

    reject, pval_corrected, _, _ = multipletests(df['pvalue'].to_list(), alpha=0.05, method='holm', is_sorted=False, returnsorted=False)
    df['reject'] = reject
    df['pvalue_corrected'] = pval_corrected

    return df

# Analysis presented in the paper

In [None]:
# Rename columns
# An explicit old -> new mapping keeps the renaming correct even if the
# column selection or its order changes; columns not listed keep their name.
only_vars = only_vars.rename(columns={
    'last_update_fixed': 'last_update_date',
    'fixed_num_installs': 'num_installs',
    'fixed_num_installs_num': 'num_installs_num',
    'last_update_days_fixed': 'last_update_days',
    'macro_android_version': 'android_version',
    'unified_genre': 'genre',
    'clean_category': 'category',
})

only_vars.head(3)

In [None]:
# Save copy of only vars 
only_vars_original = only_vars.copy()

In [None]:
# Add weight column based on app id
only_vars = generate_weight_column(only_vars, 'general')
only_vars.head(3)

## Categorical variables

### Content Rating

In [None]:
content_rating = only_vars['content_rating']
content_rating_dict = generate_dict(content_rating.to_list())
pp.pprint(content_rating_dict)
generate_graph(content_rating_dict, 'content rating')

### Android Version

In [None]:
macro_android_version = only_vars['android_version']
macro_android_version_dict = generate_dict(macro_android_version.to_list())
pp.pprint(macro_android_version_dict)
generate_graph(macro_android_version_dict, 'android version')

### Genre

In [None]:
genre = only_vars['genre']
genre_dict = generate_dict(genre.to_list())
pp.pprint(genre_dict)
generate_graph(genre_dict, 'genre')

### Has What's New

In [None]:
has_whats_new = only_vars['has_whats_new']
has_whats_new_dict = generate_dict(has_whats_new.to_list())
pp.pprint(has_whats_new_dict)
generate_pie_chart(has_whats_new_dict, "has whats new")

### Num Installs

In [None]:
fixed_num_installs = only_vars['num_installs']
fixed_num_installs_dict = generate_dict(fixed_num_installs.to_list())
pp.pprint(fixed_num_installs_dict)
generate_graph(fixed_num_installs_dict, 'num installs')

## Numerical variables

### Numerical variables stats

In [None]:
# Numeric columns summarised throughout the analysis
numeric_variables = ['len_name', 'len_summary', 'len_description', 
                     'rating', 'rating_1','rating_2', 'rating_3', 'rating_4', 'rating_5', 
                     'last_update_days', 'price_usd','num_installs_num']

# One empty list per statistic; get_stats appends one value to each list
data_general = {
    stat: []
    for stat in ('variable', 'group', 'weighted_mean', 'mean', 'min',
                 'q1', 'median', 'q3', 'max')
}

for nv in numeric_variables:
    get_stats(nv, data_general, only_vars, 'weight_general', 'general')

In [None]:
numeric_general = pd.DataFrame(data_general)
pd.options.display.float_format = "{:.2f}".format
numeric_general

In [None]:
for nv in numeric_variables:
    plot_boxplot_single(only_vars, nv, 2)

## Grouped by Country

### Categorical variables

In [None]:
# Make a copy of original only vars
only_vars = only_vars_original.copy()

In [None]:
# Group by the column name itself rather than a one-element list: newer
# pandas versions treat the keys of a length-1 list grouper as 1-tuples in
# get_group, whereas plain scalar keys are needed here and further below.
groups_country = only_vars.groupby('country')
categorical_variables = ['content_rating', 'android_version']

data_country = {
    'variable': [],
    'group': [],
    'weighted_mean': [],
    'mean': [],
    'min': [],
    'q1': [],
    'median': [],
    'q3': [],
    'max': []
}

content_rating_dicts = []
android_version_dicts = []
has_whats_new_dicts = []
genre_dicts = []
country_labels = []
    
# Iterate over country groups to collect category counts and numeric stats
for key in groups_country.groups.keys():
    # .copy(): generate_weight_column adds columns in place, which must not
    # be done on a slice of only_vars
    df = groups_country.get_group(key).copy()
    # Generate a weight column for each group
    df = generate_weight_column(df, key)
    
    country_labels.append(key)
    ct_dict = generate_dict(df['content_rating'].to_list())
    content_rating_dicts.append(ct_dict)
    
    av_dict = generate_dict(df['android_version'].to_list())
    android_version_dicts.append(av_dict)
    
    hw_dict = generate_dict(df['has_whats_new'].to_list())
    has_whats_new_dicts.append(hw_dict)
    
    g_dict = generate_dict(df['genre'].to_list())
    genre_dicts.append(g_dict)
    
    for nv in numeric_variables:
        get_stats(nv, data_country, df, f'weight_{key}', key)

In [None]:
generate_horizontal_bar_chart(content_rating_dicts, country_labels, 'Content Rating')

In [None]:
generate_horizontal_bar_chart(android_version_dicts, country_labels, 'Android Versions')

In [None]:
generate_horizontal_bar_chart(genre_dicts, country_labels, 'Genre')

In [None]:
generate_pie_chart_dropdown(has_whats_new_dicts, country_labels, 'Has Whats New')

In [None]:
# Share of apps with / without a "what's new" text per country.
# Rows are built as fresh dicts and turned into a frame once: the shared
# count dicts are left untouched (they also feed the pie chart), and
# DataFrame.append, removed in pandas 2.0, is no longer needed.
rows = []
for label, d in zip(country_labels, has_whats_new_dicts):
    # A group may lack one of the two values entirely
    n_true = d.get(True, 0)
    n_false = d.get(False, 0)
    total = n_true + n_false

    rows.append({
        **d,
        True: n_true,
        False: n_false,
        'country': label,
        'percentage_true': (n_true/total)*100 if total else float('nan'),
        'percentage_false': (n_false/total)*100 if total else float('nan'),
    })

has_whats_new_week = pd.DataFrame(rows)

has_whats_new_week

### Numerical variables

In [None]:
numeric_data_country = pd.DataFrame(data_country)
numeric_data_country =numeric_data_country.sort_values(by=['variable', 'group'])
numeric_data_country

### Len name

In [None]:
plot_boxplot_horizontal_gray(only_vars, 'len_name', 'country', 2)
country_pvalues = generate_pval_dataframe(list(groups_country.groups.keys()), only_vars, 'country', 'len_name')
country_pvalues

In [None]:
d = generate_dict(country_pvalues['size'].to_list())
pp.pprint(d)

### Len summary

In [None]:
plot_boxplot_horizontal_gray(only_vars, 'len_summary', 'country', 2)
country_pvalues = generate_pval_dataframe(list(groups_country.groups.keys()), only_vars, 'country', 'len_summary')
country_pvalues

In [None]:
d = generate_dict(country_pvalues['size'].to_list())
pp.pprint(d)

### Len description

In [None]:
plot_boxplot_horizontal_gray(only_vars, 'len_description', 'country', 2)
country_pvalues = generate_pval_dataframe(list(groups_country.groups.keys()), only_vars, 'country', 'len_description')
country_pvalues

In [None]:
d = generate_dict(country_pvalues['size'].to_list())
pp.pprint(d)

### Rating

In [None]:
plot_boxplot_horizontal_gray(only_vars, 'rating', 'country', 2)
country_pvalues = generate_pval_dataframe(list(groups_country.groups.keys()), only_vars, 'country', 'rating')
country_pvalues

In [None]:
d = generate_dict(country_pvalues['size'].to_list())
pp.pprint(d)

### Rating 1

In [None]:
plot_boxplot_horizontal_gray(only_vars, 'rating_1', 'country', 2)
country_pvalues = generate_pval_dataframe(list(groups_country.groups.keys()), only_vars, 'country', 'rating_1')
country_pvalues

In [None]:
d = generate_dict(country_pvalues['size'].to_list())
pp.pprint(d)

### Rating 2

In [None]:
plot_boxplot_horizontal_gray(only_vars, 'rating_2', 'country', 2)
country_pvalues = generate_pval_dataframe(list(groups_country.groups.keys()), only_vars, 'country', 'rating_2')
country_pvalues

In [None]:
d = generate_dict(country_pvalues['size'].to_list())
pp.pprint(d)

### Rating 3

In [None]:
plot_boxplot_horizontal_gray(only_vars, 'rating_3', 'country', 2)
country_pvalues = generate_pval_dataframe(list(groups_country.groups.keys()), only_vars, 'country', 'rating_3')
country_pvalues

In [None]:
d = generate_dict(country_pvalues['size'].to_list())
pp.pprint(d)

### Rating 4

In [None]:
plot_boxplot_horizontal_gray(only_vars, 'rating_4', 'country', 2)
country_pvalues = generate_pval_dataframe(list(groups_country.groups.keys()), only_vars, 'country', 'rating_4')
country_pvalues

In [None]:
d = generate_dict(country_pvalues['size'].to_list())
pp.pprint(d)

### Rating 5

In [None]:
plot_boxplot_horizontal_gray(only_vars, 'rating_5', 'country', 2)
country_pvalues = generate_pval_dataframe(list(groups_country.groups.keys()), only_vars, 'country', 'rating_5')
country_pvalues

In [None]:
d = generate_dict(country_pvalues['size'].to_list())
pp.pprint(d)

### Last update days

In [None]:
plot_boxplot_horizontal_gray(only_vars, 'last_update_days', 'country', 2)
country_pvalues = generate_pval_dataframe(list(groups_country.groups.keys()), only_vars, 'country', 'last_update_days')
country_pvalues

In [None]:
d = generate_dict(country_pvalues['size'].to_list())
pp.pprint(d)

### Price usd

In [None]:
plot_boxplot_horizontal_gray(only_vars, 'price_usd', 'country', 2)
country_pvalues = generate_pval_dataframe(list(groups_country.groups.keys()), only_vars, 'country', 'price_usd')
country_pvalues

In [None]:
d =generate_dict(country_pvalues['size'].to_list())
pp.pprint(d)

### Num installs

In [None]:
plot_boxplot_horizontal_gray(only_vars, 'num_installs_num', 'country', 2)
country_pvalues = generate_pval_dataframe(list(groups_country.groups.keys()), only_vars, 'country', 'num_installs_num')
country_pvalues

In [None]:
d = generate_dict(country_pvalues['size'].to_list())
pp.pprint(d)

## Grouped by top

### Categorical variables

In [None]:
# Make a copy of original only vars
only_vars = only_vars_original.copy()

In [None]:
# Group by the column name itself rather than a one-element list: newer
# pandas versions treat the keys of a length-1 list grouper as 1-tuples in
# get_group, whereas plain scalar keys are needed here and further below.
groups_top = only_vars.groupby('top')
categorical_variables = ['content_rating', 'android_version']

data_top = {
    'variable': [],
    'group': [],
    'weighted_mean': [],
    'mean': [],
    'min': [],
    'q1': [],
    'median': [],
    'q3': [],
    'max': []
}

content_rating_dicts = []
android_version_dicts = []
has_whats_new_dicts = []
genre_dicts = []
top_labels = []
    
# Iterate over top groups to collect category counts and numeric stats
for key in groups_top.groups.keys():
    # .copy(): generate_weight_column adds columns in place, which must not
    # be done on a slice of only_vars
    df = groups_top.get_group(key).copy()
    # Generate a weight column for each group
    df = generate_weight_column(df, key)
    
    top_labels.append(key)
    content_rating_dicts.append(generate_dict(df['content_rating'].to_list()))
    android_version_dicts.append(generate_dict(df['android_version'].to_list()))
    has_whats_new_dicts.append(generate_dict(df['has_whats_new'].to_list()))
    genre_dicts.append(generate_dict(df['genre'].to_list()))
    
    for nv in numeric_variables:
        get_stats(nv, data_top, df, f'weight_{key}', key)

In [None]:
generate_horizontal_bar_chart(content_rating_dicts, top_labels, 'Content Rating')

In [None]:
generate_horizontal_bar_chart(android_version_dicts, top_labels, 'Android Versions')

In [None]:
generate_horizontal_bar_chart(genre_dicts, top_labels, 'Genre')

In [None]:
generate_pie_chart_dropdown(has_whats_new_dicts, top_labels, 'Has Whats New')

In [None]:
# Share of apps with / without a "what's new" text per top list.
# Rows are built as fresh dicts and turned into a frame once: the shared
# count dicts are left untouched (they also feed the pie chart), and
# DataFrame.append, removed in pandas 2.0, is no longer needed.
rows = []
for label, d in zip(top_labels, has_whats_new_dicts):
    # A group may lack one of the two values entirely
    n_true = d.get(True, 0)
    n_false = d.get(False, 0)
    total = n_true + n_false

    rows.append({
        **d,
        True: n_true,
        False: n_false,
        'top': label,
        'percentage_true': (n_true/total)*100 if total else float('nan'),
        'percentage_false': (n_false/total)*100 if total else float('nan'),
    })

has_whats_new_week = pd.DataFrame(rows)

has_whats_new_week

### Numerical variables

In [None]:
numeric_data_top = pd.DataFrame(data_top)
numeric_data_top = numeric_data_top.sort_values(by=['variable', 'group'])

numeric_data_top

In [None]:
# Numeric variables compared across the "top" groups, in display order.
# Spelled out here so this cell does not depend on which definition of
# numeric_variables is currently live in the notebook.
top_numeric_variables = [
    'len_name', 'len_summary', 'len_description',
    'rating', 'rating_1', 'rating_2', 'rating_3', 'rating_4', 'rating_5',
    'last_update_days', 'price_usd', 'num_installs_num',
]
# Variables drawn with the *_top variant of the box-plot helper.
top_variant_plots = {'rating'}
# The group keys do not change between variables, so compute them once.
top_group_keys = list(groups_top.groups.keys())

# One pass per variable replaces the former copy-pasted cell pairs and mirrors
# the loop used for the category section: box plot, comparison table, then the
# generate_dict() summary of the table's 'size' column.
for nv in top_numeric_variables:
    if nv in top_variant_plots:
        plot_boxplot_horizontal_gray_top(only_vars, nv, 'top', 2)
    else:
        plot_boxplot_horizontal_gray(only_vars, nv, 'top', 2)
    print(f'Numeric variable {nv}')
    top_pvalues = generate_pval_dataframe(top_group_keys, only_vars, 'top', nv)
    pp.pprint(top_pvalues)
    d = generate_dict(top_pvalues['size'].to_list())
    pp.pprint(d)

## Grouped by category

### Categorical variables

In [None]:
# Start the category analysis from a fresh copy of only_vars_original, so the
# row filtering applied to only_vars in this section leaves the original
# frame untouched.
only_vars = only_vars_original.copy()

In [None]:
# 'editorChoice' and 'general' are not real categories, so their rows are
# dropped before grouping by category.
not_a_category = only_vars['category'].isin(['editorChoice', 'general'])
only_vars = only_vars[~not_a_category]

In [None]:
# Group by the column name itself rather than a one-element list so the group
# keys stay plain category values. Grouping by a length-1 list makes newer
# pandas releases expect 1-tuples in get_group(), which would also leak into
# the f'weight_{key}' column name used below.
groups_category = only_vars.groupby('category')
categorical_variables = ['content_rating', 'android_version']

# Accumulator for the per-category statistics; it is handed to get_stats() for
# every numeric variable and turned into a DataFrame further down.
data_category = {
    'variable': [],
    'group': [],
    'weighted_mean': [],
    'mean': [],
    'min': [],
    'q1': [],
    'median': [],
    'q3': [],
    'max': []
}

# One generate_dict() result per category, index-aligned with category_labels.
content_rating_dicts = []
android_version_dicts = []
has_whats_new_dicts = []
category_labels = []

# Iterate over the category groups to collect the categorical tallies and the
# numeric statistics of each group
for key in groups_category.groups.keys():
    df = groups_category.get_group(key)
    # Generate a weight column for each group
    df = generate_weight_column(df, key)

    category_labels.append(key)
    content_rating_dicts.append(generate_dict(df['content_rating'].to_list()))
    android_version_dicts.append(generate_dict(df['android_version'].to_list()))
    has_whats_new_dicts.append(generate_dict(df['has_whats_new'].to_list()))

    for nv in numeric_variables:
        get_stats(nv, data_category, df, f'weight_{key}', key)

In [None]:
# Chart the content-rating dictionaries collected in the loop above; there is
# one generate_dict() result per entry in category_labels.
generate_horizontal_bar_chart(content_rating_dicts, category_labels, 'Content Rating')

In [None]:
# Same chart for the Android-version dictionaries of each category.
generate_horizontal_bar_chart(android_version_dicts, category_labels, 'Android Versions')

In [None]:
# has_whats_new is passed to the pie-chart helper (with a dropdown, judging by
# its name) instead of the bar-chart helper used for the other variables.
generate_pie_chart_dropdown(has_whats_new_dicts, category_labels, 'Has Whats New')

In [None]:
# Build one summary row per category with the share of apps that do / do not
# have a "what's new" entry. The True/False values are treated as counts.
whats_new_rows = []
for label, d in zip(category_labels, has_whats_new_dicts):
    # Work on a copy: the dictionaries in has_whats_new_dicts are also fed to
    # the pie-chart cell and must not pick up the summary keys added here.
    row = dict(d)
    # A category may contain only True or only False values; the missing key
    # then counts as zero instead of raising KeyError.
    n_true = row.setdefault(True, 0)
    n_false = row.setdefault(False, 0)
    total = n_true + n_false

    row['category'] = label
    # Guard against an empty tally so one odd group cannot abort the table.
    row['percentage_true'] = (n_true / total) * 100 if total else float('nan')
    row['percentage_false'] = (n_false / total) * 100 if total else float('nan')
    whats_new_rows.append(row)

# DataFrame.append() was removed in pandas 2.0; building the frame once from
# the list of rows works on every pandas version and avoids repeated copying.
has_whats_new_week = pd.DataFrame(whats_new_rows)

has_whats_new_week

### Numerical variables

In [None]:
# Table of the per-category statistics, ordered by variable and then by group.
numeric_data_category = pd.DataFrame(data_category).sort_values(by=['variable', 'group'])
pp.pprint(numeric_data_category)

numeric_variables = [
    'len_name', 'len_summary', 'len_description',
    'rating', 'rating_1', 'rating_2', 'rating_3', 'rating_4', 'rating_5',
    'last_update_days', 'price_usd', 'num_installs_num',
]

# For every numeric variable: draw the box plot by category, print the
# comparison table and then the generate_dict() summary of its 'size' column.
for nv in numeric_variables:
    plot_boxplot_horizontal_gray(only_vars, nv, 'category', 2)
    print(f'Numeric variable {nv}')
    category_keys = list(groups_category.groups.keys())
    category_values = generate_pval_dataframe(category_keys, only_vars, 'category', nv)
    pp.pprint(category_values)
    sizes = category_values['size'].to_list()
    d = generate_dict(sizes)
    pp.pprint(d)

## Grouped by country-top

### Categorical variables

In [None]:
# Restore only_vars from only_vars_original: the category section removed the
# 'editorChoice' and 'general' rows from only_vars, and this section starts
# again from the unfiltered copy.
only_vars = only_vars_original.copy()

In [None]:
groups_top_country = only_vars.groupby(['top', 'country'])
categorical_variables = ['content_rating', 'android_version']

# One empty list per statistic column; the dictionary is handed to get_stats()
# for every group and numeric variable below.
stat_columns = ('variable', 'group', 'weighted_mean', 'mean',
                'min', 'q1', 'median', 'q3', 'max')
data_top_country = {column: [] for column in stat_columns}

# One generate_dict() result per (top, country) group, index-aligned with
# top_country_labels.
content_rating_dicts, android_version_dicts = [], []
has_whats_new_dicts, genre_dicts = [], []
top_country_labels = []

# Walk every (top, country) group to collect its categorical tallies and its
# numeric statistics
for key in groups_top_country.groups:
    # Generate a weight column for each group
    df = generate_weight_column(groups_top_country.get_group(key), key)

    top_country_labels.append(key)
    for column, collected in (('content_rating', content_rating_dicts),
                              ('android_version', android_version_dicts),
                              ('has_whats_new', has_whats_new_dicts),
                              ('genre', genre_dicts)):
        collected.append(generate_dict(df[column].to_list()))

    for nv in numeric_variables:
        get_stats(nv, data_top_country, df, f'weight_{key}', key)

In [None]:
# Chart the content-rating dictionaries collected in the loop above; there is
# one generate_dict() result per (top, country) key in top_country_labels.
generate_horizontal_bar_chart(content_rating_dicts, top_country_labels, 'Content Rating')

In [None]:
# Same chart for the Android-version dictionaries of each (top, country) group.
generate_horizontal_bar_chart(android_version_dicts, top_country_labels, 'Android Versions')

In [None]:
# Same chart for the genre dictionaries of each (top, country) group.
generate_horizontal_bar_chart(genre_dicts, top_country_labels, 'Genre')

In [None]:
# Build one summary row per (top, country) group with the share of apps that
# do / do not have a "what's new" entry. The True/False values are treated as
# counts.
whats_new_rows = []
for label, d in zip(top_country_labels, has_whats_new_dicts):
    # Work on a copy so the dictionaries stored in has_whats_new_dicts are not
    # polluted with the summary keys added here.
    row = dict(d)
    # A group may contain only True or only False values; the missing key then
    # counts as zero instead of raising KeyError.
    n_true = row.setdefault(True, 0)
    n_false = row.setdefault(False, 0)
    total = n_true + n_false

    row['top-country'] = label
    # Guard against an empty tally so one odd group cannot abort the table.
    row['percentage_true'] = (n_true / total) * 100 if total else float('nan')
    row['percentage_false'] = (n_false / total) * 100 if total else float('nan')
    whats_new_rows.append(row)

# DataFrame.append() was removed in pandas 2.0; building the frame once from
# the list of rows works on every pandas version and avoids repeated copying.
has_whats_new_week = pd.DataFrame(whats_new_rows)

has_whats_new_week

In [None]:
# Tabulate the statistics gathered in data_top_country, ordered by variable
# and then by group, and show the result.
numeric_data_top_country = pd.DataFrame(data_top_country).sort_values(by=['variable', 'group'])

numeric_data_top_country

In [None]:
# The (top, country) keys do not change between variables, so compute them once.
top_country_keys = list(groups_top_country.groups.keys())

# One pass per variable replaces the former copy-pasted cell pairs and mirrors
# the loop used for the category section: box plot split by country, comparison
# table, then the generate_dict() summary of the table's 'size' column.
# numeric_variables is the list defined in the category section, whose order
# matches the order the individual cells used to run in.
for nv in numeric_variables:
    if nv == 'last_update_days':
        # NOTE(review): this variable was only ever plotted (with the *_top
        # helper) and had no comparison table; kept that way here. Confirm
        # whether the omission is intentional.
        plot_boxplot_horizontal_gray_top(only_vars, nv, 'top', 2, hue_var='country')
        continue

    plot_boxplot_horizontal_gray(only_vars, nv, 'top', 2, hue_var='country')
    print(f'Numeric variable {nv}')
    top_country_pvalues = generate_pval_dataframe_two(top_country_keys, only_vars, 'top', 'country', nv)
    pp.pprint(top_country_pvalues)
    d = generate_dict(top_country_pvalues['size'].to_list())
    pp.pprint(d)