In [1]:
import pyarrow.parquet as pq
import pandas as pd
from concurrent import futures
#from tqdm import tqdm
import os
import json
import time
import gc
from collections import defaultdict
from datetime import datetime 
import matplotlib.pyplot as plt
import matplotlib.style as psl
%matplotlib inline

%config InlineBakend.figure_format = 'svg'
plt.rcParams["axes.unicode_minus"] = False
plt.rcParams["font.sans-serif"]=["Microsoft YaHei"]
psl.use('ggplot')

定义分批读取 Parquet 文件并汇总计数的工具函数，汇总结果按键（例如用户的行为时间）排序

In [2]:
def apply(file_names,operation,batchsize,columns=None):
    """
    Run `operation` over every batch of every Parquet file and merge the counts.

    Each file is streamed in record batches of at most `batchsize` rows. Every
    batch is converted to a pandas DataFrame and handed to `operation`, which
    must return a mapping of key -> count. The mappings are summed per key,
    first within each file and then across all files.

    Args:
        file_names: sequence of paths to ``.parquet`` files.
        operation: callable taking a DataFrame and returning a dict-like of counts.
        batchsize: maximum number of rows per batch read from a file.
        columns: optional list of column names to load. ``None`` (the default)
            loads every column; passing only the columns `operation` reads
            avoids decoding and converting the rest.

    Returns:
        A list of ``(key, total)`` tuples sorted by key. An empty list is
        returned when `file_names` is empty.

    Raises:
        AssertionError: if a file does not exist or does not end in ``.parquet``.
    """
    def file_process(file_name):
        """Aggregate the counts of all batches of a single file."""
        assert os.path.exists(file_name),f"文件{file_name}访问错误"
        assert file_name.endswith('.parquet'),f"文件{file_name}格式错误"
        file_data = pq.ParquetFile(file_name)
        # Lazily evaluate the batches so only one DataFrame per worker is alive at a time.
        batch_results = (
            operation(batch.to_pandas())
            for batch in file_data.iter_batches(batch_size=batchsize, columns=columns)
        )
        return merge_dic_data(batch_results)

    # ThreadPoolExecutor rejects max_workers=0, so handle "nothing to do" up front.
    if len(file_names) == 0:
        return []

    workers = min(20, len(file_names))
    with futures.ThreadPoolExecutor(workers) as executor:
        # Materialise inside the `with` so worker exceptions surface here.
        per_file_results = list(executor.map(file_process, file_names))
    all_results = merge_dic_data(per_file_results)
    # Sort by key (the first element of each pair).
    return sorted(all_results.items(), key=lambda item: item[0])
def merge_dic_data(dict_list):
    """
    Sum several count mappings key by key.

    Args:
        dict_list: iterable of dict-like objects; it is traversed exactly once,
            so a generator is acceptable.

    Returns:
        A ``defaultdict(int)`` mapping each key to the sum of its values across
        all input mappings.
    """
    totals = defaultdict(int)
    # Flatten every mapping into one stream of (key, value) pairs.
    pairs = (pair for mapping in dict_list for pair in mapping.items())
    for key, amount in pairs:
        totals[key] += amount
    return totals
# Parquet part-file paths for each dataset: part-00000 ... part-00007 for the
# 1G and 10G sets, part-00000 ... part-00015 for the 30G set.
file_names_1G = [
    f'./data/1G_data/part-000{part:02d}.parquet'
    for part in range(8)
]
file_names_10G = [
    f'./data/10G_data/part-000{part:02d}.parquet'
    for part in range(8)
]
file_names_30G = [
    f'./data/30G_data/part-000{part:02d}.parquet'
    for part in range(16)
]


下面进行流量分析

先是对日期维度进行分析

PV

In [3]:
# File list passed to apply() by the analysis cells below; assign
# file_names_1G or file_names_10G instead to analyse another dataset.
data_name = file_names_30G

In [4]:
def last_login(df):
    """
    Count logins per calendar day for one batch.

    Args:
        df: table-like object whose ``"last_login"`` column yields timestamps
            as strings parseable by ``datetime.fromisoformat``.

    Returns:
        A ``defaultdict(int)`` mapping ``datetime.date`` -> number of rows whose
        last login falls on that day. Rows that cannot be parsed are reported
        on stdout and skipped.
    """
    date_count = defaultdict(int)
    for date_str in df["last_login"]:
        try:
            day = datetime.fromisoformat(date_str).date()
        except (TypeError, ValueError):
            # TypeError: the value is not a string (e.g. a missing value);
            # ValueError: the string is not a valid ISO timestamp.
            # Either way skip the row instead of aborting the whole run.
            print(f"Date parsing error for date_str: {date_str}")
            continue
        date_count[day] += 1
    return date_count

# Aggregate logins per day over the selected dataset and report the elapsed time.
# perf_counter is a monotonic clock, so the interval is not distorted by
# system clock adjustments during this long run.
start = time.perf_counter()
date_data = apply(data_name,last_login,100000)
end = time.perf_counter()
print(f"Time taken: {end - start}")


Time taken: 417.31183528900146


In [5]:
# Split the key-sorted (day, count) pairs into the two plotted series.
dates = [date for date, _ in date_data]
login_counts = [count for _, count in date_data]

# Line chart of logins per day.
plt.close("all")
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(dates, login_counts, color='blue')
ax.set_title('单日登录')
ax.set_xlabel('日期')
ax.set_ylabel('登录人次')
ax.tick_params(axis='x', labelrotation=45)
ax.grid(True)
fig.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("./images", exist_ok=True)
fig.savefig("./images/day_30G.png")

In [6]:
def login_time(df):
    """
    Count logins per hour of day for one batch.

    Args:
        df: table-like object whose ``"last_login"`` column yields timestamps
            as strings parseable by ``datetime.fromisoformat``.

    Returns:
        A ``defaultdict(int)`` mapping hour (the ``hour`` attribute of the
        parsed datetime) -> number of rows whose last login falls in that hour.
        Rows that cannot be parsed are reported on stdout and skipped.
    """
    hour_count = defaultdict(int)
    for date_str in df["last_login"]:
        try:
            hour = datetime.fromisoformat(date_str).hour
        except (TypeError, ValueError):
            # TypeError: the value is not a string (e.g. a missing value);
            # ValueError: the string is not a valid ISO timestamp.
            # Either way skip the row instead of aborting the whole run.
            print(f"Date parsing error for date_str: {date_str}")
            continue
        hour_count[hour] += 1
    return hour_count

# Aggregate logins per hour of day over the selected dataset and report the
# elapsed time. perf_counter is a monotonic clock, so the interval is not
# distorted by system clock adjustments during this long run.
start = time.perf_counter()
hour_data = apply(data_name,login_time,100000)
end = time.perf_counter()
print(f"Time taken: {end - start}")

Time taken: 365.02767515182495


In [7]:
# Split the key-sorted (hour, count) pairs into the two plotted series.
hours = [hour for hour, _ in hour_data]
hour_login_counts = [count for _, count in hour_data]

# Line chart of logins per hour of day.
plt.close("all")
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(hours, hour_login_counts, color='blue')
ax.set_title('单小时登录')
ax.set_xlabel('时间点（小时）')
ax.set_ylabel('登录人次')
ax.tick_params(axis='x', labelrotation=45)
ax.grid(True)
fig.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("./images", exist_ok=True)
fig.savefig("./images/hour_30G.png")

#### 用户活跃度分析 

In [8]:
def active_user(df):
    """
    Bucket users by the number of days between registration and last login.

    Args:
        df: table-like object with a ``"last_login"`` column (strings parseable
            by ``datetime.fromisoformat``) and a ``"registration_date"`` column
            (strings in ``%Y-%m-%d`` format).

    Returns:
        A dict with the keys ``"one month"``, ``"three month"``, ``"six month"``,
        ``"one year"`` and ``"longer"``, counting rows whose gap is below 30,
        90, 180, 365 days, or at least 365 days respectively. Rows with an
        unparseable or missing date are reported on stdout and skipped.
    """
    active_count = {"one month":0,"three month":0,"six month":0,"one year":0,"longer":0}
    # Exclusive upper bounds in days, checked in ascending order.
    thresholds = ((30, "one month"), (90, "three month"), (180, "six month"), (365, "one year"))
    for login, register in zip(df["last_login"], df["registration_date"]):
        try:
            login_date = datetime.fromisoformat(login).date()
            register_date = datetime.strptime(register, "%Y-%m-%d").date()
        except (TypeError, ValueError):
            # TypeError: a value is not a string (e.g. a missing value);
            # ValueError: a string does not match the expected format.
            # Report both fields, since either one may be the offender.
            print(f"Date parsing error for last_login: {login}, registration_date: {register}")
            continue
        gap_days = (login_date - register_date).days
        # Note: a negative gap (login before registration) lands in "one month".
        bucket = next((name for limit, name in thresholds if gap_days < limit), "longer")
        active_count[bucket] += 1
    return active_count

# Aggregate the registration-to-last-login buckets over the selected dataset
# and report the elapsed time. perf_counter is a monotonic clock, so the
# interval is not distorted by system clock adjustments during this long run.
start = time.perf_counter()
active_data = apply(data_name,active_user,100000)
end = time.perf_counter()
print(f"Time taken: {end - start}")

Time taken: 3404.5142107009888


In [9]:
# Display label for each bucket, listed from shortest to longest gap. The pie
# follows this order rather than the alphabetical key order of `active_data`.
bucket_labels = {"one month":"一月内","three month":"三月内","six month":"六月内","one year":"一年内","longer":"长期"}
active_totals = dict(active_data)
ordered_keys = [key for key in bucket_labels if key in active_totals]
labels = [bucket_labels[key] for key in ordered_keys]
x = [active_totals[key] for key in ordered_keys]

# Pie chart of the share of users in each bucket.
plt.close('all')
fig, ax = plt.subplots()
ax.pie(x,
       labels = labels,
       autopct = '%.1f%%',
       pctdistance = 0.6,
       startangle = 90,
       textprops = {'fontsize':12,'color':'k'}
      )
ax.set_title("活跃用户占比")

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("./images", exist_ok=True)
fig.savefig("./images/active_30G.png")
