In [21]:
sc

In [22]:
sc.master

'yarn'

In [23]:
sc.setCheckpointDir('hdfs://zh:9000/checkpoint')

In [583]:
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer
from pyspark.mllib.feature import HashingTF, IDF, Word2Vec
from pyspark.mllib.linalg import Vector, Vectors
import math
import numpy as np
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.evaluation import MulticlassMetrics 
from pyspark.mllib.regression import LabeledPoint

* nltk还是要用sudo的方式安装，否则worker又找不到

### 从数据中抽取合适的特征

* TF-IDF

* 特征哈希

### 从20新闻组数据集中提取TF-IDF特征

#### 1、载入新闻组数据

In [25]:
newsgroups = sc.wholeTextFiles('file:///home/zh/Spark_ML_Experiment/20news-bydate/20news-bydate-train/*')

* 并没有将文件放入hdfs中读取，大量的小文件不好放进去
* 载入本地文件必须使用file:///前缀， 否则按照默认的hdfs schema读取文件

In [27]:
newsgroups.take(1)

[('file:/home/zh/Spark_ML_Experiment/20news-bydate/20news-bydate-train/talk.politics.guns/54722',
  "From: cathyf@is.rice.edu (Catherine Anne Foulston)\nSubject: Re: WACO: Clinton press conference, part 1\nOrganization: Rice University\nLines: 6\n\nCould y'all PLEASE stop posting this stuff to tx.general.  tx.politics\nis sufficient and is where this stuff belongs.  Thanks.\n\n\tCathy\n-- \nCathy Foulston + Rice University + Network & Systems Support + cathyf@rice.edu\n")]

In [None]:
newsgroups.count()

In [28]:
countByGroup = newsgroups.map(lambda x:x[0].split('/')[-2]).map(lambda x:(x, 1)).countByKey()

In [29]:
countByGroup

defaultdict(int,
            {'alt.atheism': 480,
             'comp.graphics': 584,
             'comp.os.ms-windows.misc': 591,
             'comp.sys.ibm.pc.hardware': 590,
             'comp.sys.mac.hardware': 578,
             'comp.windows.x': 593,
             'misc.forsale': 585,
             'rec.autos': 594,
             'rec.motorcycles': 598,
             'rec.sport.baseball': 597,
             'rec.sport.hockey': 600,
             'sci.crypt': 595,
             'sci.electronics': 591,
             'sci.med': 594,
             'sci.space': 593,
             'soc.religion.christian': 599,
             'talk.politics.guns': 546,
             'talk.politics.mideast': 564,
             'talk.politics.misc': 465,
             'talk.religion.misc': 377})

#### 2、应用基本的分词方法

In [30]:
newsgroups_short_name = newsgroups.map(lambda x:(x[0].split('/')[-2], x[1]))

In [31]:
newsgroups_short_name.take(1)

[('talk.politics.guns',
  "From: cathyf@is.rice.edu (Catherine Anne Foulston)\nSubject: Re: WACO: Clinton press conference, part 1\nOrganization: Rice University\nLines: 6\n\nCould y'all PLEASE stop posting this stuff to tx.general.  tx.politics\nis sufficient and is where this stuff belongs.  Thanks.\n\n\tCathy\n-- \nCathy Foulston + Rice University + Network & Systems Support + cathyf@rice.edu\n")]

In [32]:
whiteSpaceSplit = newsgroups_short_name.flatMapValues(lambda x:[i.lower() for i in x.split(" ")])

In [33]:
whiteSpaceSplit.take(2)

[('talk.politics.guns', 'from:'), ('talk.politics.guns', 'cathyf@is.rice.edu')]

In [49]:
whiteSpaceSplit.map(lambda x:x[1]).distinct().count()

402978

* 直接输入4会进入markdown模式并打四个井号

#### 3、改进分词效果 

In [34]:
import re
s = "string. With. Punctuation?"
s = re.sub(r'[^\w\s]','',s)
s#去掉非字和非空格的符号

'string With Punctuation'

In [35]:
import re
s = "string. With. Punctuation?"
s = re.sub(r'[^\w]','',s)
s#去掉非字的符号

'stringWithPunctuation'

In [36]:
non_word_space = re.compile(r'[^\w\s]')

In [37]:
newsgroups_non_word_space = newsgroups_short_name.flatMapValues(lambda x:re.split(non_word_space, x)).mapValues(lambda x:x.lower())

In [38]:
newsgroups_non_word_space.take(10)

[('talk.politics.guns', 'from'),
 ('talk.politics.guns', ' cathyf'),
 ('talk.politics.guns', 'is'),
 ('talk.politics.guns', 'rice'),
 ('talk.politics.guns', 'edu '),
 ('talk.politics.guns', 'catherine anne foulston'),
 ('talk.politics.guns', '\nsubject'),
 ('talk.politics.guns', ' re'),
 ('talk.politics.guns', ' waco'),
 ('talk.politics.guns', ' clinton press conference')]

In [39]:
non_word_space = re.compile(r'\W+')

In [40]:
newsgroups_non_word_space = newsgroups_short_name.flatMapValues(lambda x:re.split(non_word_space, x)).mapValues(lambda x:x.lower())

In [41]:
newsgroups_non_word_space.take(110)

[('talk.politics.guns', 'from'),
 ('talk.politics.guns', 'cathyf'),
 ('talk.politics.guns', 'is'),
 ('talk.politics.guns', 'rice'),
 ('talk.politics.guns', 'edu'),
 ('talk.politics.guns', 'catherine'),
 ('talk.politics.guns', 'anne'),
 ('talk.politics.guns', 'foulston'),
 ('talk.politics.guns', 'subject'),
 ('talk.politics.guns', 're'),
 ('talk.politics.guns', 'waco'),
 ('talk.politics.guns', 'clinton'),
 ('talk.politics.guns', 'press'),
 ('talk.politics.guns', 'conference'),
 ('talk.politics.guns', 'part'),
 ('talk.politics.guns', '1'),
 ('talk.politics.guns', 'organization'),
 ('talk.politics.guns', 'rice'),
 ('talk.politics.guns', 'university'),
 ('talk.politics.guns', 'lines'),
 ('talk.politics.guns', '6'),
 ('talk.politics.guns', 'could'),
 ('talk.politics.guns', 'y'),
 ('talk.politics.guns', 'all'),
 ('talk.politics.guns', 'please'),
 ('talk.politics.guns', 'stop'),
 ('talk.politics.guns', 'posting'),
 ('talk.politics.guns', 'this'),
 ('talk.politics.guns', 'stuff'),
 ('talk.poli

 ('talk.politics.guns', 'edu'),
 ('talk.politics.guns', ''),
 ('talk.politics.guns', 'from'),
 两个文件之间产生的''

In [42]:
newsgroups_non_word_space.cache()
newsgroups_non_word_space.checkpoint()

In [114]:
newsgroups_non_word_space.map(lambda x:x[1]).distinct().count()

130126

In [43]:
contain_numeric = re.compile(r'[^0-9]*')

In [44]:
contain_numeric.fullmatch('a')

<_sre.SRE_Match object; span=(0, 1), match='a'>

In [45]:
newsgroups_non_word_space_numeric = newsgroups_non_word_space.filter(lambda x:contain_numeric.fullmatch(x[1]))

In [46]:
newsgroups_non_word_space_numeric.cache()

PythonRDD[28] at RDD at PythonRDD.scala:48

In [141]:
newsgroups_non_word_space_numeric.map(lambda x:x[1]).distinct().count()

84912

#### 4、移除停用词 

In [47]:
newsgroups_non_word_space_numeric.take(2)

[('talk.politics.guns', 'from'), ('talk.politics.guns', 'cathyf')]

In [48]:
word_count_sorted = newsgroups_non_word_space_numeric.map(lambda x:(x[1], 1)).reduceByKey(lambda x, y:x+y).sortBy(lambda x:x[1], False)

In [49]:
word_count_sorted.cache()

PythonRDD[41] at RDD at PythonRDD.scala:48

In [50]:
stop_words = [i for i, j in word_count_sorted.take(20)]

In [174]:
stop_words

['the',
 'to',
 'of',
 'a',
 'ax',
 'and',
 'i',
 'in',
 'is',
 'that',
 'it',
 'for',
 'you',
 'from',
 's',
 'edu',
 'on',
 'this',
 'be',
 't']

In [307]:
stop_words = ['the','a','an', 'to','of','and','in','is','that','it','for','from','on','this','be','or', 'by',
             'but', 'not', 'with', 'as', 'was', 'if', 'they', 'are', 'have', 'at', 'my'
             ]

In [308]:
sc.broadcast(stop_words)

<pyspark.broadcast.Broadcast at 0x7ff45844d3c8>

In [181]:
word_count_sorted.take(1)

[('the', 146532)]

In [309]:
word_count_sorted_filtered = word_count_sorted.filter(lambda x:x[0] not in stop_words)

In [310]:
word_count_sorted_filtered.cache()
word_count_sorted_filtered.checkpoint()

In [312]:
word_count_sorted_filtered.take(20)

[('ax', 62406),
 ('i', 53036),
 ('you', 26682),
 ('s', 22337),
 ('edu', 21321),
 ('t', 18728),
 ('m', 12756),
 ('subject', 12264),
 ('com', 12133),
 ('lines', 11835),
 ('can', 11355),
 ('', 11330),
 ('organization', 11233),
 ('re', 10534),
 ('what', 9861),
 ('there', 9689),
 ('x', 9332),
 ('all', 9310),
 ('will', 9279),
 ('we', 9227)]

#### 5、删除单长单词

In [313]:
word_count_sorted_filtered_rm_len1 = word_count_sorted_filtered.filter(lambda x:len(x[0])>=2 )

In [314]:
word_count_sorted_filtered_rm_len1.count()

84857

In [315]:
word_count_sorted_filtered_rm_len1.take(20)

[('ax', 62406),
 ('you', 26682),
 ('edu', 21321),
 ('subject', 12264),
 ('com', 12133),
 ('lines', 11835),
 ('can', 11355),
 ('organization', 11233),
 ('re', 10534),
 ('what', 9861),
 ('there', 9689),
 ('all', 9310),
 ('will', 9279),
 ('we', 9227),
 ('one', 9008),
 ('would', 8905),
 ('do', 8674),
 ('he', 8441),
 ('about', 8336),
 ('writes', 7844)]

####  6、基于频率去除单词（预先提取词干， 然后获得只在文档中出现过一次的单词）

In [338]:
stemmer = SnowballStemmer('english')

In [339]:
sc.broadcast(stemmer)

<pyspark.broadcast.Broadcast at 0x7ff45841c518>

In [340]:
bareWords = word_count_sorted_filtered_rm_len1.map(
    lambda x:(stemmer.stem(x[0]), x[1])
    ).reduceByKey(lambda x, y:x+y).filter(lambda x:x[1]<2)

In [341]:
bareWords.take(2)

[('suffuc', 1), ('shograph', 1)]

In [342]:
bareWords.cache()
bareWords.checkpoint()

In [343]:
bareWords.count()

26459

In [None]:
bareWords_dict = bareWords.collectAsMap()

In [349]:
bareWords_dict

{'suffuc': 1,
 'rxt': 1,
 'eopn': 1,
 'njszo': 1,
 'ogil': 1,
 'anastasio': 1,
 'tussad': 1,
 'boeckman': 1,
 'beecher': 1,
 'terrel': 1,
 'eqm': 1,
 'lisgollan': 1,
 'kiriazi': 1,
 'portat': 1,
 'peltz': 1,
 'slaughterbeck': 1,
 'zrchzrlk': 1,
 'vortec': 1,
 'uumo': 1,
 'shograph': 1,
 'mccw': 1,
 'jydc': 1,
 'nlbxo': 1,
 'mkfbq': 1,
 'happ': 1,
 'ela': 1,
 'azerinform': 1,
 'mmdf': 1,
 'lubkin': 1,
 'eastwick': 1,
 'ipz': 1,
 'keyser': 1,
 'kost': 1,
 'jouney': 1,
 'gospels_': 1,
 'brahma': 1,
 'charcel': 1,
 'reded': 1,
 '_gyuc': 1,
 'xjewel': 1,
 'xrhf': 1,
 'suncd': 1,
 'iwv': 1,
 'zoologist': 1,
 'lhoai': 1,
 'mj_nazi': 1,
 'dickman': 1,
 'awok': 1,
 'devprogram': 1,
 'mbr': 1,
 'cahng': 1,
 'inself': 1,
 'vstern': 1,
 'yet_': 1,
 'avetik': 1,
 'misrael': 1,
 'loukid': 1,
 'peterd': 1,
 'sedet': 1,
 'r_orq': 1,
 'zibsaq': 1,
 'checkbox': 1,
 'ninassup': 1,
 'sidiqui': 1,
 'umrlk': 1,
 'lanzerotti': 1,
 'mneh': 1,
 'witten': 1,
 'william_mosco': 1,
 'illan': 1,
 'espo': 1,
 'qzhjc

* _many_估计是方法，phone___________________________________________不知道哪里出问题 

In [344]:
sc.broadcast(bareWords_dict)

<pyspark.broadcast.Broadcast at 0x7ff458445518>

* 停用词、低频词会比较少，broadcast没关系；但是比较大的rdd.collect就非常地低效
* bareWords有26458个单词×平均8个字母大概不到1MB，broadcast将其作为字典的key，利用hash的功能而不会逐个比较

#### 7、提取词干 

In [58]:
SnowballStemmer('english').stem('like')

'like'

In [59]:
LancasterStemmer().stem('like')

'lik'

In [60]:
PorterStemmer().stem('like')

'like'

In [109]:
PorterStemmer().stem("we're")

"we'r"

In [110]:
PorterStemmer().stem("we's")

"we'"

In [427]:
PorterStemmer().stem("you")

'you'

In [428]:
PorterStemmer().stem("youth")

'youth'

#### 8、训练TF-IDF模型 

* 停用词、低频词之类的,这里靠的全部文档的统计完成的，因此需要先把所有的文档中的单词全局统计
* 停用词可以靠已有的corpus获取；但是低频词可能是特定文本中奇怪的表示造成的，属于根本不存在的单词，不能基于已有的corpus去查找；可以只根据确定的词典来选词，不在字典的一律认为是不合理的单词，单词本更新需要保证，避免流行词汇被直接排除出去，造成非常差的效果
* 提取词干在停用词、低频率单词删除之前比较合理，但影响应该不大

In [70]:
newsgroups.take(2)

[('file:/home/zh/Spark_ML_Experiment/20news-bydate/20news-bydate-train/talk.politics.guns/54722',
  "From: cathyf@is.rice.edu (Catherine Anne Foulston)\nSubject: Re: WACO: Clinton press conference, part 1\nOrganization: Rice University\nLines: 6\n\nCould y'all PLEASE stop posting this stuff to tx.general.  tx.politics\nis sufficient and is where this stuff belongs.  Thanks.\n\n\tCathy\n-- \nCathy Foulston + Rice University + Network & Systems Support + cathyf@rice.edu\n"),
 ('file:/home/zh/Spark_ML_Experiment/20news-bydate/20news-bydate-train/talk.politics.guns/54217',
  'From: Seth Adam Eliot <se08+@andrew.cmu.edu>\nSubject: Re: 2ND AMENDMENT DEAD - GOOD !\nOrganization: Doctoral student, Materials Science and Engineering, Carnegie Mellon, Pittsburgh, PA\nLines: 58\nNNTP-Posting-Host: po3.andrew.cmu.edu\nIn-Reply-To: <1993Apr18.001319.2340@gnv.ifas.ufl.edu>\n\nExcerpts from netnews.talk.politics.guns: 18-Apr-93 2ND AMENDMENT DEAD -\nGOOD ! by jrm@gnv.ifas.ufl.edu \n> Yea, there are mi

In [393]:
wordsByArticles = newsgroups.map(
    lambda x:(x[0].split('/')[-2:], x[1])
    ).flatMapValues(
    lambda x:re.split(non_word_space, x)
    ).mapValues(
    lambda x:x.lower()
    ).filter(
    lambda x:contain_numeric.fullmatch(x[1])
    ).map(
    lambda x:(x[0], stemmer.stem(x[1]))
    ).filter(
    lambda x:len(x[1])>=2 
    ).filter(
    lambda x:x[1] not in stop_words
    )

In [394]:
wordsByArticles.cache()
wordsByArticles.checkpoint()

In [395]:
wordsByArticles.take(10)

[(['talk.politics.guns', '54722'], 'cathyf'),
 (['talk.politics.guns', '54722'], 'rice'),
 (['talk.politics.guns', '54722'], 'edu'),
 (['talk.politics.guns', '54722'], 'catherin'),
 (['talk.politics.guns', '54722'], 'ann'),
 (['talk.politics.guns', '54722'], 'foulston'),
 (['talk.politics.guns', '54722'], 'subject'),
 (['talk.politics.guns', '54722'], 're'),
 (['talk.politics.guns', '54722'], 'waco'),
 (['talk.politics.guns', '54722'], 'clinton')]

In [485]:
wordsByArticles.count()

2329332

##### 过滤barewords 

In [396]:
wordsByArticles_rm_lowfrequency = wordsByArticles.mapValues(
    lambda x:(x, bareWords_dict.get(x, 0) )
    ).filter(lambda x:x[1][1]==0
    ).mapValues(lambda x:x[0])

In [397]:
wordsByArticles_rm_lowfrequency.cache()
wordsByArticles_rm_lowfrequency.checkpoint()

In [486]:
wordsByArticles_rm_lowfrequency.count()

2302877

In [487]:
wordsByArticles_rm_lowfrequency.take(2)

[(['talk.politics.guns', '54722'], 'cathyf'),
 (['talk.politics.guns', '54722'], 'rice')]

##### 文档词汇聚合 

In [490]:
def x_append(x, y):
    x.append(y)
    return x

In [491]:
wordsByArticles_rm_lowfrequency.take(2)

[(['talk.politics.guns', '54722'], 'cathyf'),
 (['talk.politics.guns', '54722'], 'rice')]

In [492]:
wordsByArticles_rm_lowfrequency_aggregate = wordsByArticles_rm_lowfrequency.map(
    lambda x:(x[0][0]+'/'+x[0][1], x[1])
    ).aggregateByKey(
    [],lambda x, y:x_append(x, y), lambda x, y:x+y
    )

In [493]:
wordsByArticles_rm_lowfrequency_aggregate.take(2)

[('talk.politics.misc/178957',
  ['muellerm',
   'vuse',
   'vanderbilt',
   'edu',
   'marc',
   'mueller',
   'subject',
   're',
   'pork',
   're',
   'abolish',
   'select',
   'servic',
   'nntp',
   'post',
   'host',
   'organ',
   'vanderbilt',
   'univers',
   'school',
   'engin',
   'nashvill',
   'tn',
   'usa',
   'line',
   'articl',
   'ra',
   'msstate',
   'edu',
   'ra',
   'msstate',
   'edu',
   'fletcher',
   'adam',
   'write',
   'muellerm',
   'vuse',
   'vanderbilt',
   'edu',
   'marc',
   'mueller',
   'write',
   'trumpet',
   'cc',
   'msstate',
   'edu',
   'fletcher',
   'adam',
   'write',
   'elimin',
   'transport',
   'wrong',
   'we',
   'need',
   'capabl',
   'sure',
   'has',
   'problem',
   'you',
   'read',
   'aviat',
   'week',
   'line',
   'can',
   'reopen',
   'would',
   'deliv',
   'year',
   'earlier',
   'cost',
   'billion',
   'less',
   'program',
   'polit',
   'though',
   'popular',
   'pork',
   'do',
   'read',
   'av',
   'w

In [499]:
wordsByArticles_rm_lowfrequency_aggregate.filter(lambda x: 'a' in x[1]).take(1)

[]

##### hashTF转换文档词汇 

In [500]:
hashingTF = HashingTF(numFeatures=26000)

In [501]:
sc.broadcast(hashingTF)

<pyspark.broadcast.Broadcast at 0x7ff466a8ff98>

In [504]:
tf = wordsByArticles_rm_lowfrequency_aggregate.map(lambda x:(x[0], hashingTF.transform(x[1])))

In [505]:
tf.take(1)

[('talk.politics.misc/178957',
  SparseVector(26000, {119: 2.0, 165: 1.0, 220: 2.0, 237: 1.0, 321: 2.0, 466: 2.0, 779: 2.0, 783: 1.0, 824: 2.0, 1181: 7.0, 1190: 1.0, 1651: 1.0, 1972: 1.0, 2093: 1.0, 2582: 1.0, 3547: 3.0, 3808: 1.0, 3838: 1.0, 4322: 1.0, 4486: 1.0, 4714: 1.0, 4730: 2.0, 4773: 2.0, 4871: 1.0, 4962: 1.0, 5118: 1.0, 5627: 1.0, 5846: 2.0, 5880: 1.0, 6176: 1.0, 6639: 1.0, 6646: 4.0, 7449: 1.0, 7451: 1.0, 7593: 1.0, 7667: 1.0, 7709: 1.0, 8065: 1.0, 8175: 1.0, 8297: 3.0, 8390: 1.0, 8549: 2.0, 8708: 1.0, 8916: 1.0, 8935: 1.0, 8945: 3.0, 9036: 1.0, 9304: 5.0, 9387: 1.0, 9566: 1.0, 9614: 3.0, 9732: 1.0, 9860: 1.0, 10024: 2.0, 10064: 3.0, 10399: 1.0, 10447: 1.0, 10575: 1.0, 10585: 1.0, 11050: 1.0, 11151: 1.0, 11190: 2.0, 11292: 2.0, 11486: 1.0, 11529: 2.0, 11559: 1.0, 11676: 2.0, 11792: 1.0, 12008: 3.0, 12126: 1.0, 12282: 3.0, 12370: 1.0, 12449: 1.0, 12530: 2.0, 12553: 1.0, 12726: 3.0, 13191: 1.0, 13285: 1.0, 13978: 2.0, 14165: 1.0, 14260: 1.0, 14321: 2.0, 14411: 1.0, 14612: 1.0, 

##### 用idf去fit全部文档的词频 

In [508]:
labels = tf.map(lambda x:x[0])

docs = tf.map(lambda x:x[1])

In [510]:
idf = IDF().fit(docs)

In [511]:
tf_idf = idf.transform(docs)

##### zip文档名称和文档tf-idf

In [512]:
tf_idf_ziped = labels.zip(tf_idf)

In [513]:
tf_idf_ziped.take(1)

[('talk.politics.misc/178957',
  SparseVector(26000, {119: 12.5787, 165: 4.4976, 220: 3.5135, 237: 4.0356, 321: 2.6312, 466: 10.8437, 779: 8.5939, 783: 3.0648, 824: 3.3019, 1181: 42.2663, 1190: 1.2097, 1651: 1.7415, 1972: 2.0015, 2093: 3.0648, 2582: 4.7592, 3547: 3.1763, 3808: 3.0995, 3838: 3.3251, 4322: 1.3299, 4486: 2.745, 4714: 1.1745, 4730: 0.0057, 4773: 14.7759, 4871: 1.8535, 4962: 2.4659, 5118: 6.5007, 5627: 1.4846, 5846: 7.7316, 5880: 0.8474, 6176: 1.5756, 6639: 3.5079, 6646: 8.0667, 7449: 2.6089, 7451: 4.3851, 7593: 3.4285, 7667: 3.1073, 7709: 4.4511, 8065: 3.7318, 8175: 0.9241, 8297: 8.5754, 8390: 2.4251, 8549: 6.164, 8708: 2.6077, 8916: 4.1028, 8935: 1.8213, 8945: 8.7662, 9036: 1.4842, 9304: 2.1273, 9387: 0.6582, 9566: 2.8538, 9614: 3.7964, 9732: 3.982, 9860: 4.8341, 10024: 3.6934, 10064: 16.6491, 10399: 2.5315, 10447: 1.5841, 10575: 2.7165, 10585: 3.3963, 11050: 3.6038, 11151: 3.3755, 11190: 5.2105, 11292: 12.0761, 11486: 2.8898, 11529: 10.4791, 11559: 4.1465, 11676: 11.5007

In [408]:
tf_idf_ziped.take(1)

[('talk.politics.misc/178957',
  SparseVector(26000, {119: 12.5787, 165: 4.4976, 220: 3.5135, 237: 4.0356, 321: 2.6312, 466: 10.8437, 779: 8.5939, 783: 3.0648, 824: 3.3019, 1181: 42.2663, 1190: 1.2097, 1651: 1.7415, 1972: 2.0015, 2093: 3.0648, 2582: 4.7592, 3547: 3.1763, 3808: 3.0995, 3838: 3.3251, 4322: 1.3299, 4486: 2.745, 4714: 1.1745, 4730: 0.0057, 4773: 14.7759, 4871: 1.8535, 4962: 2.4659, 5118: 6.5007, 5627: 1.4846, 5846: 7.7316, 5880: 0.8474, 6176: 1.5756, 6639: 3.5079, 6646: 8.0667, 7449: 2.6089, 7451: 4.3851, 7593: 3.4285, 7667: 3.1073, 7709: 4.4511, 8065: 3.7318, 8175: 0.9241, 8297: 8.5754, 8390: 2.4251, 8549: 6.164, 8708: 2.6077, 8916: 4.1028, 8935: 1.8213, 8945: 8.7662, 9036: 1.4842, 9304: 2.1273, 9387: 0.6582, 9566: 2.8538, 9614: 3.7964, 9732: 3.982, 9860: 4.8341, 10024: 3.6934, 10064: 16.6491, 10399: 2.5315, 10447: 1.5841, 10575: 2.7165, 10585: 3.3963, 11050: 3.6038, 11151: 3.3755, 11190: 5.2105, 11292: 12.0761, 11486: 2.8898, 11529: 10.4791, 11559: 4.1465, 11676: 11.5007

In [514]:
tf_idf_ziped.count()

11314

In [410]:
# tf_idf_ziped.saveAsTextFile('hdfs://zh:9000/tf_idf_ziped', )
# tf_idf_ziped.saveAsTextFile('file:///home/zh/Spark_ML_Experiment/tf_idf_ziped', )
# 存储以text的形式，读取全都成了str，失败

##### 使用pickleFile 

In [411]:
# ss = sc.parallelize([((1,2,2), enumerate([1,2,2,])), ((1,3), enumerate([1,1 ]))])

# ss.take(1)

# ss.saveAsPickleFile('hdfs://zh:9000/save_test')

# ss = sc.pickleFile('hdfs://zh:9000/save_test')

# ss.take(1)

In [413]:
tf_idf_ziped.saveAsPickleFile('hdfs://zh:9000/TF_IDF_ByKey')

In [414]:
tf_idf_ziped.saveAsPickleFile('file:///home/zh/Spark_ML_Experiment/TF_IDF_ByKey')

* pickle存储下来

#### 9、分析TF-IDF权重 

In [415]:
tf_idf_ziped = sc.pickleFile('hdfs://zh:9000/TF_IDF_ByKey', )

In [416]:
tf_idf_ziped.take(1)

[('talk.politics.misc/178957',
  SparseVector(26000, {119: 12.5787, 165: 4.4976, 220: 3.5135, 237: 4.0356, 321: 2.6312, 466: 10.8437, 779: 8.5939, 783: 3.0648, 824: 3.3019, 1181: 42.2663, 1190: 1.2097, 1651: 1.7415, 1972: 2.0015, 2093: 3.0648, 2582: 4.7592, 3547: 3.1763, 3808: 3.0995, 3838: 3.3251, 4322: 1.3299, 4486: 2.745, 4714: 1.1745, 4730: 0.0057, 4773: 14.7759, 4871: 1.8535, 4962: 2.4659, 5118: 6.5007, 5627: 1.4846, 5846: 7.7316, 5880: 0.8474, 6176: 1.5756, 6639: 3.5079, 6646: 8.0667, 7449: 2.6089, 7451: 4.3851, 7593: 3.4285, 7667: 3.1073, 7709: 4.4511, 8065: 3.7318, 8175: 0.9241, 8297: 8.5754, 8390: 2.4251, 8549: 6.164, 8708: 2.6077, 8916: 4.1028, 8935: 1.8213, 8945: 8.7662, 9036: 1.4842, 9304: 2.1273, 9387: 0.6582, 9566: 2.8538, 9614: 3.7964, 9732: 3.982, 9860: 4.8341, 10024: 3.6934, 10064: 16.6491, 10399: 2.5315, 10447: 1.5841, 10575: 2.7165, 10585: 3.3963, 11050: 3.6038, 11151: 3.3755, 11190: 5.2105, 11292: 12.0761, 11486: 2.8898, 11529: 10.4791, 11559: 4.1465, 11676: 11.5007

In [417]:
max_min_tf_idf = tf_idf_ziped.mapValues(lambda x:(min(x), max(x)))

In [418]:
max_min_tf_idf.take(2)

[('talk.politics.misc/178957', (0.0, 42.26633384444316))]

In [419]:
max_min_tf_idf.cache()

PythonRDD[614] at RDD at PythonRDD.scala:48

In [570]:
# max_tf_idf = max_min_tf_idf.map(lambda x:x[1][1]).reduce(lambda x, y:max(x, y))
# max_tf_idf
# 很慢

In [515]:
common_not_stop = sc.parallelize([['you', 'we', 'do', 'am', ], ])
idf.transform(hashingTF.transform(common_not_stop)).take(2)

[SparseVector(26000, {4322: 1.3299, 11706: 1.6028, 14321: 0.5457, 18916: 0.9272})]

In [516]:
common_stop = sc.parallelize([stop_words, ])
idf.transform(hashingTF.transform(common_stop)).take(2)

[SparseVector(26000, {1080: 7.388, 1870: 6.2428, 2695: 9.3339, 4287: 6.4435, 5495: 7.9476, 5800: 7.388, 8242: 7.2544, 8770: 5.3085, 9696: 7.7244, 12777: 7.5421, 14373: 6.1558, 15003: 6.3382, 15140: 7.5421, 15898: 8.2353, 16043: 6.936, 18300: 7.7244, 19224: 7.388, 19583: 6.4435, 20052: 5.9327, 20365: 9.3339, 21101: 7.2544, 21119: 3.6884, 21441: 7.9476, 23069: 6.936, 23317: 8.6407, 23903: 7.9476, 24827: 9.3339, 25289: 5.8999})]

In [517]:
word_not_exist = sc.parallelize([['a', 'g', 's', 'j', ], ])
idf.transform(hashingTF.transform(word_not_exist)).take(2)

[SparseVector(26000, {11264: 7.388, 12506: 5.4021, 21568: 9.3339, 25289: 5.8999})]

* 可能有哈希冲突,导致不存在的词和stopwords的tf-idf较高。

In [518]:
uncommon = sc.parallelize([['legislation', 'investment', 'telescope', 'json', 'fancy', 'python', 'spark'], ])

In [519]:
idf.transform(hashingTF.transform(uncommon)).take(2)

[SparseVector(26000, {2015: 7.7244, 5051: 7.9476, 5649: 6.0017, 17120: 6.038, 17276: 9.3339, 22510: 7.0313, 24212: 9.3339})]

In [520]:
hockey_vectors_f2

[SparseVector(26000, {64: 5.8374, 100: 4.0995, 216: 10.5469, 220: 3.5135, 332: 5.72, 422: 3.1436, 440: 3.7617, 1078: 1.8213, 1127: 5.3636, 1262: 9.8061, 1631: 3.1256, 1815: 1.6952, 1903: 6.6257, 2093: 3.0648, 2283: 3.207, 2495: 13.872, 2722: 2.4392, 2808: 7.9254, 2910: 4.3758, 3219: 23.1195, 3499: 0.9092, 3621: 2.3128, 3639: 3.1518, 4730: 0.0057, 4902: 5.8839, 4962: 2.4659, 5606: 3.7208, 5611: 3.8871, 5627: 1.4846, 5629: 2.4372, 5772: 10.4135, 5880: 0.8474, 6260: 4.0077, 6318: 5.6963, 6322: 3.2655, 6455: 1.5717, 6728: 12.8142, 6986: 3.2771, 7185: 3.0442, 7331: 1.744, 7580: 5.3449, 7627: 6.9144, 7667: 3.1073, 7727: 2.554, 8057: 1.5872, 8172: 10.6531, 8175: 2.7722, 8185: 1.5159, 8226: 15.4489, 8301: 15.769, 8408: 4.3877, 8487: 3.0446, 8564: 2.9589, 8711: 5.5052, 8820: 1.8631, 8824: 5.6985, 9357: 3.4561, 9387: 1.3163, 9466: 10.8041, 9498: 14.4691, 9614: 2.531, 9740: 1.0706, 9766: 4.2905, 10144: 3.4411, 10260: 19.3503, 10353: 3.2818, 10470: 2.5449, 10497: 2.1335, 10509: 3.6435, 10710: 4.14

### 使用TF-IDF模型 

* TF-IDF向量基于“词出现”和“词-文档的紧密程度”，能够成为衡量相似度的标志。特定词和两个文档的紧密程度都很高，说明两个文档很相似  
* 词-文档的紧密程度 ==> 总体出现的文档少并且文档中出现的次数多

#### 1、20Newsgroups 数据集的文本相似度和TF-IDF特征

In [521]:
'ss' in 'assdrf'

True

In [522]:
sparse_sample = Vectors.sparse(100, {1:234, 2:3242})

In [523]:
sparse_sample.dot

<bound method SparseVector.dot of SparseVector(100, {1: 234.0, 2: 3242.0})>

In [524]:
def sin_sparser_vectors(A, B):
    return A.dot(B)/(math.pow(A.dot(A), 0.5)*math.pow(B.dot(B), 0.5))

In [574]:
hockey_vectors_f2 = tf_idf_ziped.filter(lambda x: 'hockey' in x[0]).map(lambda x:x[1]).take(2)

In [575]:
graphics_vectors_f2 = tf_idf_ziped.filter(lambda x: 'comp.graphics' in x[0]).map(lambda x:x[1]).take(2)

In [576]:
baseball_vectors_f2 = tf_idf_ziped.filter(lambda x: 'baseball' in x[0]).map(lambda x:x[1]).take(2)

In [577]:
mideast_vectors_f2 = tf_idf_ziped.filter(lambda x: 'mideast' in x[0]).map(lambda x:x[1]).take(2)

In [578]:
guns_vectors_f2 = tf_idf_ziped.filter(lambda x: 'guns' in x[0]).map(lambda x:x[1]).take(2)

In [579]:
def _four_result(a, b, _func):
    _list = [(i,i==1) for i in [0, 1]] + [(i,i==0) for i in [0, 1]]
    return [_func(a[i], b[j]) for i, j in _list], np.mean([_func(a[i], b[j]) for i, j in _list])

In [580]:
_four_result(hockey_vectors_f2, graphics_vectors_f2, sin_sparser_vectors)

([0.024869347842747889,
  0.015500642203138706,
  0.020516871539784143,
  0.016427577609365054],
 0.019328609798758948)

In [581]:
_four_result(hockey_vectors_f2, baseball_vectors_f2, sin_sparser_vectors)

([0.051320290941893644,
  0.00043835791784942592,
  0.062988578439905102,
  0.0057253367655224555],
 0.030118141016292656)

In [533]:
_four_result(graphics_vectors_f2, baseball_vectors_f2, sin_sparser_vectors)

([0.0069106511365626721,
  7.6418629066606484e-07,
  0.0079231986924085222,
  0.00069995969467140633],
 0.003883643427483317)

In [534]:
_four_result(graphics_vectors_f2, guns_vectors_f2, sin_sparser_vectors)

([0.018187981784742898,
  0.015197643387922392,
  0.01646622077718006,
  0.030017562728845454],
 0.0199673521696727)

In [535]:
_four_result(baseball_vectors_f2, baseball_vectors_f2, sin_sparser_vectors)

([1.0, 1.0000000000000002, 0.033874742476862063, 0.033874742476862063],
 0.51693737123843109)

In [537]:
_four_result(hockey_vectors_f2, hockey_vectors_f2, sin_sparser_vectors)

([1.0000000000000002,
  1.0000000000000002,
  0.032947772601927335,
  0.032947772601927335],
 0.5164738863009638)

In [536]:
_four_result(mideast_vectors_f2, baseball_vectors_f2, sin_sparser_vectors)

([0.0010546136869315258,
  0.047770096671491305,
  0.0026129931185484268,
  0.0053129163921758525],
 0.014187654967286779)

In [582]:
_four_result(mideast_vectors_f2, graphics_vectors_f2, sin_sparser_vectors)

([0.011342715045459147,
  0.024822413052300381,
  0.0051595465832042714,
  0.014634210379319449],
 0.013989721265070813)

#### 2、基于20Newsgroups使用TF-IDF训练文本分类器

In [539]:
regex_dot_sep = re.compile(r'[./]')

In [540]:
re.split(regex_dot_sep,  'asd/asd.dsas')

['asd', 'asd', 'dsas']

In [541]:
topics_vectors = tf_idf_ziped.map(lambda x:(re.split(regex_dot_sep, x[0])[1], x[1]))

In [542]:
topics_vectors.take(10)

[('politics',
  SparseVector(26000, {119: 12.5787, 165: 4.4976, 220: 3.5135, 237: 4.0356, 321: 2.6312, 466: 10.8437, 779: 8.5939, 783: 3.0648, 824: 3.3019, 1181: 42.2663, 1190: 1.2097, 1651: 1.7415, 1972: 2.0015, 2093: 3.0648, 2582: 4.7592, 3547: 3.1763, 3808: 3.0995, 3838: 3.3251, 4322: 1.3299, 4486: 2.745, 4714: 1.1745, 4730: 0.0057, 4773: 14.7759, 4871: 1.8535, 4962: 2.4659, 5118: 6.5007, 5627: 1.4846, 5846: 7.7316, 5880: 0.8474, 6176: 1.5756, 6639: 3.5079, 6646: 8.0667, 7449: 2.6089, 7451: 4.3851, 7593: 3.4285, 7667: 3.1073, 7709: 4.4511, 8065: 3.7318, 8175: 0.9241, 8297: 8.5754, 8390: 2.4251, 8549: 6.164, 8708: 2.6077, 8916: 4.1028, 8935: 1.8213, 8945: 8.7662, 9036: 1.4842, 9304: 2.1273, 9387: 0.6582, 9566: 2.8538, 9614: 3.7964, 9732: 3.982, 9860: 4.8341, 10024: 3.6934, 10064: 16.6491, 10399: 2.5315, 10447: 1.5841, 10575: 2.7165, 10585: 3.3963, 11050: 3.6038, 11151: 3.3755, 11190: 5.2105, 11292: 12.0761, 11486: 2.8898, 11529: 10.4791, 11559: 4.1465, 11676: 11.5007, 11792: 3.0185, 

In [543]:
tf_idf_ziped.map(lambda x:x[0]).take(100)

['talk.politics.misc/178957',
 'alt.atheism/53199',
 'talk.politics.misc/176973',
 'talk.politics.misc/178320',
 'soc.religion.christian/20740',
 'comp.os.ms-windows.misc/9654',
 'sci.space/61128',
 'rec.sport.hockey/53555',
 'soc.religion.christian/20975',
 'comp.sys.mac.hardware/51637',
 'talk.politics.mideast/76277',
 'sci.space/61134',
 'talk.politics.misc/178464',
 'rec.autos/101592',
 'comp.os.ms-windows.misc/9828',
 'sci.space/60985',
 'comp.graphics/38694',
 'talk.politics.guns/54316',
 'sci.space/60188',
 'comp.windows.x/67111',
 'rec.sport.baseball/104577',
 'alt.atheism/53660',
 'comp.windows.x/67374',
 'rec.sport.hockey/53529',
 'soc.religion.christian/20502',
 'sci.med/58769',
 'sci.electronics/53529',
 'rec.sport.hockey/53540',
 'sci.crypt/15402',
 'rec.motorcycles/103198',
 'sci.electronics/53636',
 'sci.crypt/15707',
 'comp.windows.x/67306',
 'rec.motorcycles/104669',
 'talk.politics.mideast/76183',
 'rec.sport.baseball/104928',
 'sci.space/60847',
 'sci.crypt/15397',
 

##### 给类编号 

In [544]:
topics_20 = topics_vectors.map(lambda x:x[0]).distinct().take(40)

In [545]:
topics_20_dict = {j:i for i, j in enumerate(topics_20)}
topics_20_dict

{'atheism': 7,
 'autos': 11,
 'crypt': 0,
 'electronics': 10,
 'forsale': 5,
 'graphics': 9,
 'med': 6,
 'motorcycles': 8,
 'os': 3,
 'politics': 4,
 'religion': 2,
 'space': 1,
 'sport': 12,
 'sys': 14,
 'windows': 13}

In [546]:
train_data = topics_vectors.map(lambda x:LabeledPoint( topics_20_dict.get(x[0]), x[1]))

In [547]:
train_data.cache()
train_data.checkpoint()

In [548]:
NaiveBayesModel = NaiveBayes.train(train_data, lambda_=0.1)

##### 载入测试文件 

In [549]:
newsgroups_test = sc.wholeTextFiles('file:///home/zh/Spark_ML_Experiment/20news-bydate/20news-bydate-test/*')

In [550]:
tf_test = newsgroups_test.map(
    lambda x:(x[0].split('/')[-2:], x[1])
    ).flatMapValues(
    lambda x:re.split(non_word_space, x)
    ).mapValues(
    lambda x:x.lower()
    ).filter(
    lambda x:contain_numeric.fullmatch(x[1])
    ).map(
    lambda x:(x[0], stemmer.stem(x[1]))
    ).filter(
    lambda x:x[1] not in stop_words
    ).filter(
    lambda x:len(x[1])>=2 
    ).mapValues(
    lambda x:(x, bareWords_dict.get(x, 0) )
    ).filter(
    lambda x:x[1][1]==0
    ).mapValues(
    lambda x:x[0]
    ).map(
    lambda x:(x[0][0]+'/'+x[0][1], x[1])
    ).aggregateByKey(
    [],lambda x, y:x_append(x, y), lambda x, y:x+y
    ).map(
    lambda x:(x[0], hashingTF.transform(x[1]))
    )

In [551]:
tf_test.cache()
tf_test.checkpoint()

In [552]:
tf_test.take(1)

[('sci.med/59506',
  SparseVector(26000, {204: 1.0, 1672: 1.0, 1967: 1.0, 2539: 1.0, 4730: 1.0, 5117: 1.0, 5880: 1.0, 6488: 1.0, 7439: 1.0, 7664: 1.0, 9281: 1.0, 9304: 1.0, 9387: 1.0, 10019: 1.0, 10056: 1.0, 10600: 1.0, 11986: 1.0, 13729: 1.0, 13934: 2.0, 15560: 1.0, 17649: 1.0, 17656: 1.0, 22590: 1.0, 23397: 1.0, 23460: 1.0, 24470: 1.0}))]

In [553]:
tf_test.count()

7532

In [554]:
idf_test = IDF().fit(tf_test.map(lambda x:x[1]))

In [555]:
tf_test_fields = tf_test.map(lambda x:x[0])

In [556]:
tf_test_transformed = idf_test.transform(tf_test.map(lambda x:x[1]))

In [557]:
tf_idf_ziped_test = tf_test_fields.zip(tf_test_transformed)  

In [558]:
test_ = tf_idf_ziped_test.map(
    lambda x:(re.split(regex_dot_sep, x[0])[1], x[1])
    ).map(
    lambda x:LabeledPoint( topics_20_dict.get(x[0]), x[1])
    )

In [559]:
test_.take(1)

[LabeledPoint(6.0, (26000,[204,1672,1967,2539,4730,5117,5880,6488,7439,7664,9281,9304,9387,10019,10056,10600,11986,13729,13934,15560,17649,17656,22590,23397,23460,24470],[5.74899481748,7.1352891786,2.49571756589,7.31761073539,0.00239234563862,4.91971546259,0.820534131571,4.19966082911,4.27308829767,4.70754094265,8.23390146727,0.405066939678,0.652946645533,3.40558772996,2.47185008449,4.26360955371,3.67477521978,3.174476009,15.6568727183,0.876664194759,1.65604010954,0.0,7.54075428671,0.0366380958514,6.21899844672,0.831145114865]))]

In [560]:
test_label = test_.map(lambda x:x.label)
test_features = test_.map(lambda x:x.features)

In [561]:
predict = NaiveBayesModel.predict(test_features)

In [562]:
predict.count()

7532

In [563]:
predict_label_zip = predict.zip(test_label).map(lambda x:(float(x[0]), float(x[1])))

In [569]:
predict_label_zip.take(10)

[(6.0, 6.0),
 (2.0, 2.0),
 (1.0, 1.0),
 (12.0, 12.0),
 (4.0, 4.0),
 (14.0, 5.0),
 (6.0, 6.0),
 (14.0, 14.0),
 (14.0, 3.0),
 (4.0, 2.0)]

In [565]:
MultiClassiEva =  MulticlassMetrics(predict_label_zip)

In [566]:
MultiClassiEva.accuracy

0.7866436537440255

In [567]:
MultiClassiEva.weightedFMeasure()

0.7743397989051573

* 以上两个数值比书中的略少，可能原因是采用了stemming。

### Word2Vec模型

#### 1、基于20Newsgroups数据集训练Word2Vec 

In [589]:
labels = wordsByArticles_rm_lowfrequency_aggregate.map(lambda x:x[0])

docs = wordsByArticles_rm_lowfrequency_aggregate.map(lambda x:x[1])

In [590]:
docs.take(1)

[['muellerm',
  'vuse',
  'vanderbilt',
  'edu',
  'marc',
  'mueller',
  'subject',
  're',
  'pork',
  're',
  'abolish',
  'select',
  'servic',
  'nntp',
  'post',
  'host',
  'organ',
  'vanderbilt',
  'univers',
  'school',
  'engin',
  'nashvill',
  'tn',
  'usa',
  'line',
  'articl',
  'ra',
  'msstate',
  'edu',
  'ra',
  'msstate',
  'edu',
  'fletcher',
  'adam',
  'write',
  'muellerm',
  'vuse',
  'vanderbilt',
  'edu',
  'marc',
  'mueller',
  'write',
  'trumpet',
  'cc',
  'msstate',
  'edu',
  'fletcher',
  'adam',
  'write',
  'elimin',
  'transport',
  'wrong',
  'we',
  'need',
  'capabl',
  'sure',
  'has',
  'problem',
  'you',
  'read',
  'aviat',
  'week',
  'line',
  'can',
  'reopen',
  'would',
  'deliv',
  'year',
  'earlier',
  'cost',
  'billion',
  'less',
  'program',
  'polit',
  'though',
  'popular',
  'pork',
  'do',
  'read',
  'av',
  'week',
  'don',
  'rememb',
  'could',
  'you',
  'suppli',
  'date',
  'magazin',
  'aviat',
  'week',
  'march'

In [591]:
Word2VecModel = Word2Vec().fit(docs)

In [595]:
# Word2VecModel.findSynonyms('Word2VecModel', 20) 报错

In [597]:
list(Word2VecModel.findSynonyms('hockey', 20) )

[('playoff', 0.74765855073928833),
 ('team', 0.74734270572662354),
 ('leagu', 0.738808274269104),
 ('nhl', 0.72746217250823975),
 ('basebal', 0.71577584743499756),
 ('espn', 0.70459455251693726),
 ('player', 0.70285177230834961),
 ('play', 0.69389545917510986),
 ('coach', 0.69043582677841187),
 ('ncaa', 0.68854153156280518),
 ('roster', 0.68808853626251221),
 ('tournament', 0.68377602100372314),
 ('finnish', 0.67263495922088623),
 ('fan', 0.66924643516540527),
 ('brave', 0.66703492403030396),
 ('yanke', 0.66160780191421509),
 ('award', 0.66098582744598389),
 ('canadien', 0.65468335151672363),
 ('montreal', 0.65365475416183472),
 ('season', 0.65193116664886475)]

In [601]:
list(Word2VecModel.findSynonyms('legisl', 20) )

[('enact', 0.77955901622772217),
 ('subcommitte', 0.76623046398162842),
 ('feder', 0.75973087549209595),
 ('mandatori', 0.75446760654449463),
 ('chafe', 0.74882221221923828),
 ('manslaught', 0.74765241146087646),
 ('ownership', 0.74207735061645508),
 ('semiautomat', 0.73753988742828369),
 ('discretionari', 0.7356451153755188),
 ('banker', 0.73050183057785034),
 ('prohibit', 0.72958952188491821),
 ('repeal', 0.72452002763748169),
 ('endors', 0.72298043966293335),
 ('moynihan', 0.71978044509887695),
 ('conduct', 0.71869361400604248),
 ('pursuant', 0.71502089500427246),
 ('bradi', 0.71234768629074097),
 ('commiss', 0.70534729957580566),
 ('deleg', 0.70498549938201904),
 ('surveil', 0.70306593179702759)]

* 提取词根导致单词被截断，返回原形或许是好的选择。

### 总结 

** 1、TF-IDF **           
** 2、Word2Vec **