In [1]:
import pandas as pd
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.types import *
from pyspark.sql import functions as F, Window

# Configure the Spark session.
# NOTE(review): 'spark.master' is 'local[4]', so the YARN / executor sizing keys
# below presumably only matter if the master is switched to a real cluster — confirm.
config = SparkConf().setAll([
    ('spark.dynamicAllocation.maxExecutors', '8'),
    ('spark.driver.memory', '4G'),
    ('spark.executor.memory', '8G'),
    ('spark.executor.cores', '4'),
    # Fixed key: was misspelled 'spark.yarn.executora.memoryOverhead', which Spark
    # stores as an unknown key, so the overhead setting never took effect.
    # (Newer Spark releases name this 'spark.executor.memoryOverhead'.)
    ('spark.yarn.executor.memoryOverhead', '4G'),
    # Small shuffle/parallelism values, matching the tiny demo dataset used below.
    ('spark.sql.shuffle.partitions', '10'),
    ('spark.default.parallelism', '10'),
    ('spark.port.maxRetries', '1000'),
    ('spark.sql.sources.partitionOverwriteMode', 'dynamic'),
    ('spark.master', 'local[4]'),
])

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = SparkSession.builder.config(conf=config).getOrCreate()

In [2]:
# Demo data registered as temp view 'base': one row per (uid, date) for
# 2020-02-01 .. 2020-02-08. Each character of the flag string is that day's
# is_flag value (kept as a string, like uid and date).
spark.createDataFrame(
    [
        {"uid": uid, "date": "2020-02-{:02d}".format(day), "is_flag": flag}
        for uid, daily_flags in (("1", "10110111"), ("2", "10011111"))
        for day, flag in enumerate(daily_flags, start=1)
    ]
).createOrReplaceTempView('base')



In [3]:
# Method 1: find runs of consecutive flagged days per uid.
# To get the longest streak per uid, take the max of `num`.
#
# How it works: among the flagged rows of one uid, `index` is the 0-based rank by
# date. Within a run of consecutive days both the day offset (datediff from a fixed
# anchor date) and `index` grow by 1 per row, so their difference is constant;
# a gap in the dates increases the difference. Grouping by (uid, difference)
# therefore yields one row per streak. The anchor date itself is arbitrary.
#
# `is_flag` is stored as a string in `base`, so `is_flag=1` relies on Spark's
# implicit string-to-number cast.
#
# ORDER BY makes the displayed row order deterministic (a grouped result has no
# guaranteed order). The result is collected exactly once, so it is not cached:
# caching here would only pin blocks in memory that are never reused or unpersisted.
spark.sql("""
SELECT
    uid
    ,(datediff(t1.date, date('2020-02-01')) - index) as date_index_diff
    ,min(date) as start_date
    ,max(date) as end_date
    ,count(1) as num
FROM (
    SELECT
        uid
        ,date
        ,row_number() over(partition by uid order by date) - 1 as index
    FROM base
    WHERE is_flag=1
) as t1
GROUP BY 1, 2
ORDER BY uid, start_date
""").toPandas()

Unnamed: 0,uid,date_index_diff,start_date,end_date,num
0,1,0,2020-02-01,2020-02-01,1
1,1,1,2020-02-03,2020-02-04,2
2,1,2,2020-02-06,2020-02-08,3
3,2,0,2020-02-01,2020-02-01,1
4,2,2,2020-02-04,2020-02-08,5
