In [1]:
from pyspark.sql import SparkSession


# Obtain the active SparkSession, creating one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Load the CSV file into a DataFrame; the file is read without a header row.
df = spark.read.option('header', False).csv('../Datasets/news/news_sentiment.csv')

# Pick the column named _c1 and print its first 5 rows.
df.select('_c1').show(n=5)

21/10/12 19:53:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/10/12 19:53:23 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
21/10/12 19:53:23 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
                                                                                

+--------------------+
|                 _c1|
+--------------------+
|According to Gran...|
|Technopolis plans...|
|The international...|
|With the new prod...|
|According to the ...|
+--------------------+
only showing top 5 rows



In [2]:
import pyspark.sql.functions as func


# Split the text held in column _c1 on single spaces and keep only the
# resulting token array, exposed under the column name "words".
# Note: this rebinds df, so afterwards df no longer has a _c1 column; the
# loading cell must be re-run before this cell can be executed again.
df = df.select(
    func.split(df['_c1'], ' ').alias('words')
)

df.show(n=5)

+--------------------+
|               words|
+--------------------+
|[According, to, G...|
|[Technopolis, pla...|
|[The, internation...|
|[With, the, new, ...|
|[According, to, t...|
+--------------------+
only showing top 5 rows



In [3]:
from pyspark.ml.feature import Word2Vec


# Configure the word-vector feature extractor: it reads the token arrays in
# "words" and writes 3-dimensional vectors to "features". minCount=0 means no
# token is dropped for being too rare.
# The seed is fixed explicitly so that re-training after a kernel restart
# yields the same vectors; 42 is an arbitrary choice.
word2Vec = Word2Vec(vectorSize=3, minCount=0, seed=42,
                    inputCol="words", outputCol="features")

# Train the Word2Vec model on the tokenised text.
model = word2Vec.fit(df)

# Apply the fitted model, adding the "features" column next to "words".
word2vec_df = model.transform(df)

word2vec_df.show(5)

21/10/12 19:53:37 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
21/10/12 19:53:37 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

+--------------------+--------------------+
|               words|            features|
+--------------------+--------------------+
|[According, to, G...|[-0.0177427710080...|
|[Technopolis, pla...|[-0.0224827738899...|
|[The, internation...|[-0.0251569747043...|
|[With, the, new, ...|[-0.1424257911628...|
|[According, to, t...|[0.03129597339869...|
+--------------------+--------------------+
only showing top 5 rows



In [4]:
from pyspark.ml.feature import StandardScaler


# Set up the feature standardisation estimator: "features" is read and the
# result is written to "scaledFeatures". Values are scaled by the standard
# deviation (withStd=True) but are not mean-centred (withMean=False).
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeatures",
)

# Fit the scaler on the Word2Vec output.
scalerModel = scaler.fit(word2vec_df)

# Add the "scaledFeatures" column using the fitted scaler.
scaled_df = scalerModel.transform(word2vec_df)

scaled_df.show(n=5)

+--------------------+--------------------+--------------------+
|               words|            features|      scaledFeatures|
+--------------------+--------------------+--------------------+
|[According, to, G...|[-0.0177427710080...|[-0.1282418935499...|
|[Technopolis, pla...|[-0.0224827738899...|[-0.1625018715843...|
|[The, internation...|[-0.0251569747043...|[-0.1818305647191...|
|[With, the, new, ...|[-0.1424257911628...|[-1.0294306983274...|
|[According, to, t...|[0.03129597339869...|[0.22620225934933...|
+--------------------+--------------------+--------------------+
only showing top 5 rows

