In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

In [2]:
def analyze(data_frame, col):
    """
    Count the occurrences of each distinct value in one column.

    :param data_frame: DataFrame to summarise
    :param col: name of the column to group by
    :return: DataFrame holding the group value in ``col`` and the number of
        non-null entries of that group in ``num``, ordered by ``num`` from
        largest to smallest
    """
    per_value = data_frame.groupby([col])[col].count()
    # Turn the grouped Series back into a two-column frame: <col>, 'num'.
    counted = per_value.reset_index(name='num')
    return counted.sort_values(by='num', ascending=False)


def multi(data_frame, col):
    """
    Build one value-count table per requested column.

    :param data_frame: DataFrame to summarise
    :param col: list of column names; each one is grouped independently
    :return: dict mapping every column name to a DataFrame that holds the
        group value and its count in ``num``, ordered by ``num`` from
        largest to smallest
    """
    count_label = 'num'
    return {
        name: (
            data_frame.groupby([name])[name].count()
            .reset_index(name=count_label)
            .sort_values(by=count_label, ascending=False)
        )
        for name in col
    }


def mixed_and_percent(data_frame, col, top=10, is_rate=True):
    """
    Count the values of one column and optionally add each value's share,
    collapsing everything beyond the ``top`` most frequent values into a
    single ``'other'`` row.

    :param data_frame: DataFrame to summarise
    :param col: name of the column to group by
    :param top: number of leading rows to keep; only applied when
        ``is_rate`` is true
    :param is_rate: when true, add a ``rate`` column (share of the total
        count, rounded to 4 decimals) and apply the ``top`` cut-off; when
        false, the plain count table is returned unchanged
    :return: DataFrame with the columns ``col`` and ``num``, plus ``rate``
        when ``is_rate`` is true
    """
    row = analyze(data_frame, col)

    if not is_rate:
        return row

    # Sum only the numeric count column. Summing the whole frame would also
    # try to add up the group-key column, which is wasted work and fails for
    # key dtypes that cannot be summed.
    total = row['num'].sum()
    row['rate'] = row['num'] / total
    row['rate'] = row['rate'].map(lambda x: round(x, 4))

    # Nothing to collapse: return the full table with its rates.
    if row.shape[0] <= top:
        return row

    row_top = row.head(top)

    # Build the aggregated 'other' row. Every column except 'rate' is filled
    # with the literal 'other' (this includes 'num', which therefore becomes
    # an object column).
    # NOTE(review): 'num' could carry the remaining count instead; kept as
    # is so that existing consumers see the same values.
    obj = {k: 'other' for k in row_top.columns.values}
    # Rounded like every other rate so no floating-point noise leaks out.
    obj['rate'] = round(1 - row_top['rate'].sum(), 4)
    obj = pd.DataFrame(obj, index=[0])

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat([row_top, obj], ignore_index=True)


def rename(name, max_len=31, prefix_len=18):
    """
    Shorten a sheet name so that it never exceeds ``max_len`` characters.

    Names that already fit are returned unchanged. Longer names first lose
    their leading ``prefix_len`` characters; if the remainder is still too
    long, only its last ``max_len`` characters are kept.

    NOTE(review): 31 is presumably the worksheet-name limit of the
    spreadsheet format being written, and 18 the length of a common prefix
    in the caller's names -- confirm against the caller.

    :param name: string, the candidate sheet name
    :param max_len: maximum length of the returned name
    :param prefix_len: number of leading characters dropped from names that
        are too long
    :return: string of at most ``max_len`` characters
    """
    if len(name) <= max_len:
        return name

    trimmed = name[prefix_len:]
    if len(trimmed) > max_len:
        # Dropping the prefix was not enough: keep the tail, in line with
        # the prefix-dropping strategy above.
        trimmed = trimmed[len(trimmed) - max_len:]
    return trimmed