In [1]:
##################################################################
#           《Python人工智能编程实践（2024年度版）》开源代码
#-----------------------------------------------------------------
#            @章节号：7.3.1.2（TF-IDF特征）                
#            @作者：范淼、徐晟桐 
#            @购书链接：暂无
#            @电子邮箱：fm12@tsinghua.org.cn             
#            @官方交流QQ群号：561500762                        
##################################################################

In [2]:
from pyspark.sql import SparkSession


# Obtain a SparkSession, reusing the active one when it already exists.
spark = SparkSession.builder.getOrCreate()

# Load the news-sentiment CSV. With header=False the first line is treated as
# data; the columns are addressed below as _c0 and _c1.
df = spark.read.csv('../Datasets/news/news_sentiment.csv', header=False)

# Keep two columns and rename them: _c0 -> labels, _c1 -> texts.
# No tokenization happens at this step; it only selects and renames.
df = df.select(
    df['_c0'].alias('labels'),
    df['_c1'].alias('texts'),
)

# Preview the first 5 rows of the renamed DataFrame.
df.show(5)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/31 14:44:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/08/31 14:44:14 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/08/31 14:44:14 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/08/31 14:44:14 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


+--------+--------------------+
|  labels|               texts|
+--------+--------------------+
| neutral|According to Gran...|
| neutral|Technopolis plans...|
|negative|The international...|
|positive|With the new prod...|
|positive|According to the ...|
+--------+--------------------+
only showing top 5 rows



In [3]:
from pyspark.ml.feature import CountVectorizer, IDF, Tokenizer


# Step 1: split each sentence in the `texts` column into tokens, written to a
# new `words` column.
tokenizer = Tokenizer(inputCol='texts', outputCol='words')
wordsData = tokenizer.transform(df)

# Step 2: term-frequency (tf) features. The vocabulary is limited to 100
# terms, so every tf vector has 100 dimensions.
countVec = CountVectorizer(
    inputCol='words',
    outputCol='tf_features',
    vocabSize=100,
)
tf_model = countVec.fit(wordsData)
featurizedData = tf_model.transform(wordsData)

# Step 3: fit the inverse-document-frequency (idf) model on the tf vectors and
# apply it to obtain the tf-idf features.
idf = IDF(inputCol='tf_features', outputCol='tfidf_features')
idfModel = idf.fit(featurizedData)
result_df = idfModel.transform(featurizedData)

# Step 4: bring the selected columns of the distributed DataFrame into the
# in-memory variable `results`.
# NOTE(review): collect() returns every row; if `results` is not reused
# elsewhere, take(3) would be enough for the preview below -- confirm.
feature_cols = ('texts', 'tf_features', 'tfidf_features')
results = result_df.select(*feature_cols).collect()

# Show the tf and tf-idf features of the first 3 texts.
for items in results[0:3]:
    print(items)

                                                                                

Row(texts='According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .', tf_features=SparseVector(100, {0: 2.0, 1: 1.0, 2: 2.0, 6: 3.0, 11: 2.0, 13: 2.0, 17: 1.0, 32: 1.0, 83: 1.0, 88: 1.0}), tfidf_features=SparseVector(100, {0: 0.6811, 1: 0.0137, 2: 1.248, 6: 2.6302, 11: 3.4011, 13: 3.6055, 17: 2.1564, 32: 2.4822, 83: 3.6903, 88: 3.6821}))
Row(texts='Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .', tf_features=SparseVector(100, {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 3.0, 5: 1.0, 6: 2.0, 20: 1.0, 38: 1.0, 91: 1.0}), tfidf_features=SparseVector(100, {0: 0.3406, 1: 0.0137, 2: 0.624, 3: 0.7476, 4: 2.3765, 5: 0.9008, 6: 1.7535, 20: 2.1872, 38: 2.7335, 91: 3.6986}))
Row(texts='The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility