In [1]:
from pyspark.sql import SparkSession


# Obtain the active SparkSession, creating one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Load the CSV file into a DataFrame. The file is read without a header row,
# so columns are referred to by positional names such as _c1.
df = spark.read.csv(path='../Datasets/news/news_sentiment.csv', header=False)

# Project only the _c1 column and display its first 5 rows.
df.select('_c1').show(5)

21/10/12 19:32:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/10/12 19:32:12 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
21/10/12 19:32:12 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
21/10/12 19:32:12 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
                                                                                

+--------------------+
|                 _c1|
+--------------------+
|According to Gran...|
|Technopolis plans...|
|The international...|
|With the new prod...|
|According to the ...|
+--------------------+
only showing top 5 rows



In [2]:
import pyspark.sql.functions as func


# Tokenize the text column _c1 by splitting on single spaces and expose the
# resulting token arrays as a column named words.
# df is rebound to the tokenized frame, so on a second run of this cell the
# _c1 column is already gone; the guard turns that re-run into a no-op
# instead of a failure.
if '_c1' in df.columns:
    df = df.select(func.split(df['_c1'], ' ').alias('words'))

# Display the first 5 rows of the tokenized DataFrame.
df.show(5)

+--------------------+
|               words|
+--------------------+
|[According, to, G...|
|[Technopolis, pla...|
|[The, internation...|
|[With, the, new, ...|
|[According, to, t...|
+--------------------+
only showing top 5 rows



In [3]:
from pyspark.ml.feature import CountVectorizer


# Configure the term-frequency extractor: it reads token arrays from "words",
# writes count vectors to "word_counts", and caps the vocabulary at 100 terms.
cv = CountVectorizer(vocabSize=100, inputCol="words", outputCol="word_counts")

# Fit the vocabulary on df, then apply the fitted model to the same DataFrame.
model = cv.fit(df)
result_df = model.transform(df)

# Bring every row of the distributed dataset into the in-memory list results.
results = result_df.collect()

# Print the term-frequency features of the first 3 rows.
for row in results[:3]:
    print(row)

                                                                                

Row(words=['According', 'to', 'Gran', ',', 'the', 'company', 'has', 'no', 'plans', 'to', 'move', 'all', 'production', 'to', 'Russia', ',', 'although', 'that', 'is', 'where', 'the', 'company', 'is', 'growing', '.'], word_counts=SparseVector(100, {0: 1.0, 1: 2.0, 2: 2.0, 6: 3.0, 11: 2.0, 14: 2.0, 18: 1.0, 31: 1.0, 85: 1.0}))
Row(words=['Technopolis', 'plans', 'to', 'develop', 'in', 'stages', 'an', 'area', 'of', 'no', 'less', 'than', '100,000', 'square', 'meters', 'in', 'order', 'to', 'host', 'companies', 'working', 'in', 'computer', 'technologies', 'and', 'telecommunications', ',', 'the', 'statement', 'said', '.'], word_counts=SparseVector(100, {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 3.0, 5: 1.0, 6: 2.0, 21: 1.0, 39: 1.0, 90: 1.0, 96: 1.0}))
Row(words=['The', 'international', 'electronic', 'industry', 'company', 'Elcoteq', 'has', 'laid', 'off', 'tens', 'of', 'employees', 'from', 'its', 'Tallinn', 'facility', ';', 'contrary', 'to', 'earlier', 'layoffs', 'the', 'company', 'contracted', 'the', 