In [9]:
from pyspark.sql import SparkSession
# Build (or reuse) a local-mode Spark session for the demos in this notebook.
# The `.config("spark.some.config.option", "some-value")` call that used to be
# here was placeholder boilerplate copied from the Spark getting-started guide;
# it configured nothing meaningful, so it has been removed.
spark = (
    SparkSession.builder
    .master("local")
    .appName("dataframe_split")
    .getOrCreate()
)

# Handle on the underlying SparkContext; later cells use it for RDD examples.
sc = spark.sparkContext

# Load the sample CSV (first line is a header; column types are inferred).
# The displayed rows show a `gid` column and a `score` column holding a
# space-separated string of numbers.
df = spark.read.csv(
    'hdfs://master:9000/dataset/dataframe_split.csv',
    inferSchema=True,
    header=True,
)
df.show(3)

+---+-----------+
|gid|      score|
+---+-----------+
| a1|90 80 79 80|
| a2|79 89 45 60|
| a3|57 56 89 75|
+---+-----------+



### split

In [8]:
from pyspark.sql.functions import split, explode, concat, concat_ws
# Tokenise the space-separated `score` string into an array column named "s".
score_tokens = split(df.score, " ")
df_split = df.withColumn("s", score_tokens)
df_split.show()

+---+-----------+----------------+
|gid|      score|               s|
+---+-----------+----------------+
| a1|90 80 79 80|[90, 80, 79, 80]|
| a2|79 89 45 60|[79, 89, 45, 60]|
| a3|57 56 89 75|[57, 56, 89, 75]|
+---+-----------+----------------+



### zipWithIndex:给每个元素生成一个索引
#### 排序首先基于分区索引，然后是每个分区内的元素顺序。因此，第一个分区中的第一个 item 索引为 0，最后一个分区中的最后一个 item 的索引最大。当 RDD 包含多个分区时，此方法需要触发一个 Spark 作业。

In [31]:
# Spread four letters over 3 partitions, then pair every element with its
# global index. The bare last expression is what the cell displays.
letters_rdd = sc.parallelize(['a', 'b', 'c', 'd'], 3)
letters_rdd.zipWithIndex().collect()

[('a', 0), ('b', 1), ('c', 2), ('d', 3)]

### 将一列分割成多列

In [52]:
# Decide how many score_N columns to create by counting the space-separated
# tokens in the first row's `score` string.
# NOTE(review): only the first row is inspected; this assumes every row carries
# the same number of scores and that the DataFrame is non-empty — confirm.
first_row = df.first()
numAttrs = len(first_row['score'].split(" "))
print("新增列的个数", numAttrs)
# Pair each new column name with its position in the array, locally.
# Previously this small list was shipped through
# sc.parallelize(...).zipWithIndex().collect(), running Spark work just to
# number a handful of strings; the resulting list of (name, index) tuples is
# the same.
attrs = [("score_" + str(i), i) for i in range(numAttrs)]
print("列名：", attrs)
# Project element `index` of the array column "s" into its own column `name`.
for name, index in attrs:
    df_split = df_split.withColumn(name, df_split['s'].getItem(index))
df_split.show()

新增列的个数 4
列名： [('score_0', 0), ('score_1', 1), ('score_2', 2), ('score_3', 3)]
+---+-----------+----------------+-------+-------+-------+-------+
|gid|      score|               s|score_0|score_1|score_2|score_3|
+---+-----------+----------------+-------+-------+-------+-------+
| a1|90 80 79 80|[90, 80, 79, 80]|     90|     80|     79|     80|
| a2|79 89 45 60|[79, 89, 45, 60]|     79|     89|     45|     60|
| a3|57 56 89 75|[57, 56, 89, 75]|     57|     56|     89|     75|
+---+-----------+----------------+-------+-------+-------+-------+



### explode

In [16]:
# Split `score` into an array, then emit one output row per array element
# in a new column "e" (the other columns are repeated on every row).
score_tokens = split(df.score, " ")
df_explode = df.withColumn("e", explode(score_tokens))
df_explode.show()

+---+-----------+---+
|gid|      score|  e|
+---+-----------+---+
| a1|90 80 79 80| 90|
| a1|90 80 79 80| 80|
| a1|90 80 79 80| 79|
| a1|90 80 79 80| 80|
| a2|79 89 45 60| 79|
| a2|79 89 45 60| 89|
| a2|79 89 45 60| 45|
| a2|79 89 45 60| 60|
| a3|57 56 89 75| 57|
| a3|57 56 89 75| 56|
| a3|57 56 89 75| 89|
| a3|57 56 89 75| 75|
+---+-----------+---+



### concat

In [49]:
# Join the four per-score columns back into one string with no separator.
score_parts = [
    df_split[col_name]
    for col_name in ("score_0", "score_1", "score_2", "score_3")
]
df_concat = df_split.withColumn("score_concat", concat(*score_parts))
df_concat.show()

+---+-----------+----------------+-------+-------+-------+-------+------------+
|gid|      score|               s|score_0|score_1|score_2|score_3|score_concat|
+---+-----------+----------------+-------+-------+-------+-------+------------+
| a1|90 80 79 80|[90, 80, 79, 80]|     90|     80|     79|     80|    90807980|
| a2|79 89 45 60|[79, 89, 45, 60]|     79|     89|     45|     60|    79894560|
| a3|57 56 89 75|[57, 56, 89, 75]|     57|     56|     89|     75|    57568975|
+---+-----------+----------------+-------+-------+-------+-------+------------+



### concat_ws

In [51]:
# Same as the plain concat example, but concat_ws puts '-' between the pieces.
score_parts = [
    df_split[col_name]
    for col_name in ("score_0", "score_1", "score_2", "score_3")
]
df_ws = df_split.withColumn("score_concat", concat_ws('-', *score_parts))
df_ws.show()

+---+-----------+----------------+-------+-------+-------+-------+------------+
|gid|      score|               s|score_0|score_1|score_2|score_3|score_concat|
+---+-----------+----------------+-------+-------+-------+-------+------------+
| a1|90 80 79 80|[90, 80, 79, 80]|     90|     80|     79|     80| 90-80-79-80|
| a2|79 89 45 60|[79, 89, 45, 60]|     79|     89|     45|     60| 79-89-45-60|
| a3|57 56 89 75|[57, 56, 89, 75]|     57|     56|     89|     75| 57-56-89-75|
+---+-----------+----------------+-------+-------+-------+-------+------------+



### pivot: 旋转当前 DataFrame 的某一列并执行指定的聚合

In [59]:
# Sample data: each user's rating for each movie, as [userID, movieID, rating].
rating_rows = [
    [15, 399, 2],
    [15, 1401, 5],
    [15, 1608, 4],
    [15, 20, 4],
    [18, 100, 3],
    [18, 1401, 3],
    [18, 399, 1],
]
# NOTE(review): this rebinds `df`, shadowing the score DataFrame loaded in the
# first cell; re-running the earlier split/explode cells after this one will
# fail. Consider a dedicated name once no other cell depends on `df` here.
df = spark.sparkContext.parallelize(rating_rows).toDF(["userID", "movieID", "rating"])
# Pivot: turn the distinct movieID values into columns (rows -> columns),
# summing `rating` per user.
pivoted = df.groupBy("userID").pivot("movieID").sum("rating")
# Missing (user, movie) combinations come back as null; show them as -1.
resultDF = pivoted.na.fill(-1)
# Display the result.
resultDF.show()

+------+---+---+---+----+----+
|userID| 20|100|399|1401|1608|
+------+---+---+---+----+----+
|    18| -1|  3|  1|   3|  -1|
|    15|  4| -1|  2|   5|   4|
+------+---+---+---+----+----+

