In [1]:
##################################################################
#           《Python人工智能编程实践（2024年度版）》开源代码
#-----------------------------------------------------------------
#            @章节号：7.3.2.2（特征向量化）           
#            @作者：范淼、徐晟桐 
#            @购书链接：暂无
#            @电子邮箱：fm12@tsinghua.org.cn             
#            @官方交流QQ群号：561500762                        
##################################################################

In [2]:
from pyspark.sql import SparkSession


# Obtain a SparkSession through the builder (reuses an existing one if present).
spark = SparkSession.builder.getOrCreate()

# Read the header-less CSV and, in the same chained expression, keep only the
# column that Spark auto-named `_c0`.
df = (
    spark.read
    .csv('../Datasets/news/news_sentiment.csv', header=False)
    .select('_c0')
)

# Preview the first 5 rows of the selected column.
df.show(5)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/31 17:33:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/08/31 17:33:11 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/08/31 17:33:11 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


+--------+
|     _c0|
+--------+
| neutral|
| neutral|
|negative|
|positive|
|positive|
+--------+
only showing top 5 rows



In [3]:
from pyspark.ml.feature import StringIndexer


# Build the estimator that maps each category string in `_c0` to a numeric
# index stored in `label_idx`.
si = StringIndexer(inputCol='_c0', outputCol='label_idx')

# Keep this cell re-runnable: `df` is overwritten below with a frame that
# already contains the output column, and StringIndexer rejects an input whose
# schema already has that column. DataFrame.drop is a no-op when the column is
# absent, so the first run is unaffected.
df = df.drop(si.getOutputCol())

# Learn the category-to-index mapping from the data.
si_model = si.fit(df)

# Append the numeric index column (in the recorded run: neutral -> 0.0,
# positive -> 1.0, negative -> 2.0).
df = si_model.transform(df)

# Preview the first 5 rows.
df.show(5)

+--------+---------+
|     _c0|label_idx|
+--------+---------+
| neutral|      0.0|
| neutral|      0.0|
|negative|      2.0|
|positive|      1.0|
|positive|      1.0|
+--------+---------+
only showing top 5 rows



In [4]:
from pyspark.ml.feature import OneHotEncoder


# Set up the estimator that turns the numeric index column into a one-hot
# vector column.
ohe = OneHotEncoder(
    inputCol='label_idx',
    outputCol='labels',
)

# Fit the encoder, then write the encoded frame to a new name so `df` itself
# is left untouched.
# NOTE: in the recorded output the vectors have size 2 and index 2.0 appears
# as the empty sparse vector (2,[],[]); this matches the encoder's default of
# dropping the last category (see the `dropLast` parameter).
ohe_model = ohe.fit(df)
result = ohe_model.transform(df)

# Preview the first 5 rows.
result.show(5)

+--------+---------+-------------+
|     _c0|label_idx|       labels|
+--------+---------+-------------+
| neutral|      0.0|(2,[0],[1.0])|
| neutral|      0.0|(2,[0],[1.0])|
|negative|      2.0|    (2,[],[])|
|positive|      1.0|(2,[1],[1.0])|
|positive|      1.0|(2,[1],[1.0])|
+--------+---------+-------------+
only showing top 5 rows

