In [1]:
##################################################################
#           《Python人工智能编程实践（2024年度版）》开源代码
#-----------------------------------------------------------------
#            @章节号：7.3.2.1（特征标准化）              
#            @作者：范淼、徐晟桐 
#            @购书链接：暂无
#            @电子邮箱：fm12@tsinghua.org.cn             
#            @官方交流QQ群号：561500762                        
##################################################################

In [2]:
from pyspark.sql import SparkSession


# Obtain the active SparkSession, creating one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Load the CSV into a DataFrame. The file is read with header=False, so the
# columns get Spark's positional default names (the second one is _c1).
df = spark.read.csv(path='./datasets/news/news_sentiment.csv', header=False)

# Pick the column named _c1 and display its first 5 rows.
df.select('_c1').show(n=5)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/31 17:32:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/08/31 17:32:59 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


+--------------------+
|                 _c1|
+--------------------+
|According to Gran...|
|Technopolis plans...|
|The international...|
|With the new prod...|
|According to the ...|
+--------------------+
only showing top 5 rows



In [3]:
import pyspark.sql.functions as func


# Tokenize the text in column _c1 by splitting on single spaces, keeping only
# the resulting array column under the new name "words".
#
# `df` is rebound to the tokenized frame, which no longer has a _c1 column.
# The guard makes re-running this cell a no-op instead of an error on the
# missing column.
if '_c1' in df.columns:
    df = df.select(func.split(df['_c1'], ' ').alias('words'))

df.show(5)

+--------------------+
|               words|
+--------------------+
|[According, to, G...|
|[Technopolis, pla...|
|[The, internation...|
|[With, the, new, ...|
|[According, to, t...|
+--------------------+
only showing top 5 rows



In [4]:
from pyspark.ml.feature import Word2Vec


# Configure the word-embedding feature extractor: 3-dimensional vectors, and
# minCount=0 so that no token is discarded for being too rare.
# Word2Vec training starts from a random initialization; passing an explicit
# seed is meant to make a fresh "Restart & Run All" reproduce the same vectors
# (NOTE(review): confirm determinism also holds for your partitioning setup).
word2Vec = Word2Vec(vectorSize=3, minCount=0, seed=42,
                    inputCol="words", outputCol="features")

# Train the embedding model on the tokenized corpus.
model = word2Vec.fit(df)

# Append a "features" column holding one vector per row; per the Spark docs
# this is the average of the row's word vectors.
word2vec_df = model.transform(df)

word2vec_df.show(5)

23/08/31 17:33:02 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


CodeCache: size=131072Kb used=20360Kb max_used=20477Kb free=110711Kb
 bounds [0x0000000104a18000, 0x0000000105e38000, 0x000000010ca18000]
 total_blobs=8630 nmethods=7529 adapters=1013
 compilation: disabled (not enough contiguous free space left)
+--------------------+--------------------+
|               words|            features|
+--------------------+--------------------+
|[According, to, G...|[-0.0747085128724...|
|[Technopolis, pla...|[-0.1309216530813...|
|[The, internation...|[-0.1186600272501...|
|[With, the, new, ...|[-0.0238454712724...|
|[According, to, t...|[-0.2550367626656...|
+--------------------+--------------------+
only showing top 5 rows



In [5]:
from pyspark.ml.feature import StandardScaler


# Set up the feature scaler. withStd=True rescales each feature dimension to
# unit standard deviation; withMean=False means the features are NOT centered
# first, so the output is scaled but not zero-mean.
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeatures",
)

# Fit on the embedded data to compute the per-dimension statistics.
scalerModel = scaler.fit(word2vec_df)

# Append the "scaledFeatures" column next to the original features.
scaled_df = scalerModel.transform(word2vec_df)

scaled_df.show(n=5)

+--------------------+--------------------+--------------------+
|               words|            features|      scaledFeatures|
+--------------------+--------------------+--------------------+
|[According, to, G...|[-0.0747085128724...|[-0.4666851224387...|
|[Technopolis, pla...|[-0.1309216530813...|[-0.8178343450959...|
|[The, internation...|[-0.1186600272501...|[-0.7412390799474...|
|[With, the, new, ...|[-0.0238454712724...|[-0.1489566081896...|
|[According, to, t...|[-0.2550367626656...|[-1.5931499401440...|
+--------------------+--------------------+--------------------+
only showing top 5 rows

