In [1]:
##################################################################
#《Python机器学习及实践：从零开始通往Kaggle竞赛之路（2024年度版）》开源代码
#-----------------------------------------------------------------
#                 @章节号：7.3.2.1（特征标准化）                           
#                 @作者：范淼、徐晟桐 
#                 @购书链接：https://item.jd.com/13482761.html
#                 @电子邮箱：fanmiao.cslt.thu@hotmail.com               
#                 @官方交流QQ群号：561500762                        
##################################################################

In [2]:
from pyspark.sql import SparkSession


#创建SparkSession。
spark = SparkSession.builder.getOrCreate()

#读取文件并存储到DataFrame中。
df = spark.read.csv('../Datasets/news/news_sentiment.csv', header=False)

#选取名称为_c1的列，并展示该列的前5行。
df.select(df._c1).show(5)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/02/21 12:55:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

+--------------------+
|                 _c1|
+--------------------+
|According to Gran...|
|Technopolis plans...|
|The international...|
|With the new prod...|
|According to the ...|
+--------------------+
only showing top 5 rows



In [3]:
import pyspark.sql.functions as func


#选取名称为_c1的列，将该列的数据文本进行分词，并修改该列的名称为words。
df = df.select(func.split(df._c1, ' ').alias('words'))

df.show(5)

+--------------------+
|               words|
+--------------------+
|[According, to, G...|
|[Technopolis, pla...|
|[The, internation...|
|[With, the, new, ...|
|[According, to, t...|
+--------------------+
only showing top 5 rows



In [4]:
from pyspark.ml.feature import Word2Vec


#初始化词向量特征的抽取模型。
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="words", outputCol="features")

model = word2Vec.fit(df)

word2vec_df = model.transform(df)

word2vec_df.show(5)

23/02/21 12:55:21 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/02/21 12:55:21 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
                                                                                

+--------------------+--------------------+
|               words|            features|
+--------------------+--------------------+
|[According, to, G...|[-0.1210632427595...|
|[Technopolis, pla...|[-0.1742415400297...|
|[The, internation...|[-0.1379980493285...|
|[With, the, new, ...|[-0.1015461966480...|
|[According, to, t...|[-0.3125891150656...|
+--------------------+--------------------+
only showing top 5 rows



In [5]:
from pyspark.ml.feature import StandardScaler


#初始化特征标准化模型。
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)

scalerModel = scaler.fit(word2vec_df)

scaled_df = scalerModel.transform(word2vec_df)

scaled_df.show(5)

+--------------------+--------------------+--------------------+
|               words|            features|      scaledFeatures|
+--------------------+--------------------+--------------------+
|[According, to, G...|[-0.1210632427595...|[-0.8783140231768...|
|[Technopolis, pla...|[-0.1742415400297...|[-1.2641226563868...|
|[The, internation...|[-0.1379980493285...|[-1.0011760723857...|
|[With, the, new, ...|[-0.1015461966480...|[-0.7367178218855...|
|[According, to, t...|[-0.3125891150656...|[-2.2678345383488...|
+--------------------+--------------------+--------------------+
only showing top 5 rows

