# 朴素贝叶斯用于imdb文本分类
https://www.kaggle.com/c/word2vec-nlp-tutorial

## 数据导入

In [15]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords # Import the stop word list
import re
from bs4 import BeautifulSoup             

In [2]:
file_train='/home/wwww/Datasets/imdb/labeledTrainData.tsv'


In [3]:
train = pd.read_csv(file_train, header=0, \
                    delimiter="\t", quoting=3)

In [4]:
train.shape

(25000, 3)

In [5]:
train.columns.values

array(['id', 'sentiment', 'review'], dtype=object)

In [6]:
file_test='/home/wwww/Datasets/imdb/testData.tsv'
test=pd.read_csv(file_test,sep='\t',quoting=3)

In [7]:
test.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [8]:
train['type']='tr'
test['type']='te'
data=pd.concat([train,test],axis=0)
data.head()

Unnamed: 0,id,sentiment,review,type
0,"""5814_8""",1.0,"""With all this stuff going down at the moment ...",tr
1,"""2381_9""",1.0,"""\""The Classic War of the Worlds\"" by Timothy ...",tr
2,"""7759_3""",0.0,"""The film starts with a manager (Nicholas Bell...",tr
3,"""3630_4""",0.0,"""It must be assumed that those who praised thi...",tr
4,"""9495_8""",1.0,"""Superbly trashy and wondrously unpretentious ...",tr


## 文本数据处理

In [10]:
def review_to_words(raw_review):
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text() 
    # 2. Remove non-letters       
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                     
    # Remove stopwords
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if not w in stops]   
    # 6. Join the words back into one string separated by space and return the result.
    return( " ".join( meaningful_words ))   

In [13]:
def text_preprocess(data):
    '''
    文本数据预处理
    return：清理完成的string类型列表
    '''
    num_reviews = data["review"].size
    clean_reviews = []
    for i in range( 0, num_reviews ):
        # If the index is evenly divisible by 1000, print a message
        if( (i+1)%1000 == 0 ):
            print("Review %d of %d\n" % ( i+1, num_reviews )  )                                                                  
        clean_reviews.append( review_to_words( data["review"][i] ))
    return clean_reviews

## Sklearn bag-of-words模型

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 

In [16]:
clean_train_reviews=text_preprocess(train)
clean_test_reviews=text_preprocess(test)

Review 1000 of 25000

Review 2000 of 25000

Review 3000 of 25000

Review 4000 of 25000

Review 5000 of 25000

Review 6000 of 25000

Review 7000 of 25000

Review 8000 of 25000

Review 9000 of 25000

Review 10000 of 25000

Review 11000 of 25000

Review 12000 of 25000

Review 13000 of 25000

Review 14000 of 25000

Review 15000 of 25000

Review 16000 of 25000

Review 17000 of 25000

Review 18000 of 25000

Review 19000 of 25000

Review 20000 of 25000

Review 21000 of 25000

Review 22000 of 25000

Review 23000 of 25000

Review 24000 of 25000

Review 25000 of 25000

Review 1000 of 25000

Review 2000 of 25000

Review 3000 of 25000

Review 4000 of 25000

Review 5000 of 25000

Review 6000 of 25000

Review 7000 of 25000

Review 8000 of 25000

Review 9000 of 25000

Review 10000 of 25000

Review 11000 of 25000

Review 12000 of 25000

Review 13000 of 25000

Review 14000 of 25000

Review 15000 of 25000

Review 16000 of 25000

Review 17000 of 25000

Review 18000 of 25000

Review 19000 of 25000

Review

In [24]:
#bag  of words
train_data_features = vectorizer.fit_transform(clean_train_reviews)
train_data_features = train_data_features.toarray()

In [25]:
train_data_features.shape

(25000, 5000)

In [26]:
#bag  of words
test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()
test_data_features.shape

(25000, 5000)

In [27]:
vocab = vectorizer.get_feature_names()


## Baseline 随机森林

In [42]:
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 100) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest = forest.fit( train_data_features, train["sentiment"] )

### 随机森林算法提交结果

In [45]:
clean_test_reviews = [] 

for i in range(0,num_reviews):
    if( (i+1) % 1000 == 0 ):
        print("Review %d of %d\n" % (i+1, num_reviews))
    clean_review = review_to_words( test["review"][i] )
    clean_test_reviews.append( clean_review )

# Get a bag of words for the test set, and convert to a numpy array
test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()

# Use the random forest to make sentiment label predictions
result = forest.predict(test_data_features)

Review 1000 of 25000

Review 2000 of 25000

Review 3000 of 25000

Review 4000 of 25000

Review 5000 of 25000

Review 6000 of 25000

Review 7000 of 25000

Review 8000 of 25000

Review 9000 of 25000

Review 10000 of 25000

Review 11000 of 25000

Review 12000 of 25000

Review 13000 of 25000

Review 14000 of 25000

Review 15000 of 25000

Review 16000 of 25000

Review 17000 of 25000

Review 18000 of 25000

Review 19000 of 25000

Review 20000 of 25000

Review 21000 of 25000

Review 22000 of 25000

Review 23000 of 25000

Review 24000 of 25000

Review 25000 of 25000



In [46]:
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
output.head()

Unnamed: 0,id,sentiment
0,"""12311_10""",1
1,"""8348_2""",0
2,"""5828_4""",1
3,"""7186_2""",1
4,"""12128_7""",1


In [47]:
output.to_csv( "Bag_of_Words_RF_model.csv", index=False, quoting=3 )

## navie bayes 测试

In [29]:
from text_nbc import TextNBClassifier

In [30]:
nbc=TextNBClassifier()

In [31]:
nbc.fit(train_data_features,train["sentiment"] )

<text_nbc.TextNBClassifier at 0x7f7409b84820>

In [32]:
result=nbc.predict(test_data_features)

In [34]:
result.shape

(25000,)

In [37]:
output = pd.DataFrame( data={"id":test["id"], "sentiment":1-result} )
output.head()

Unnamed: 0,id,sentiment
0,"""12311_10""",1
1,"""8348_2""",0
2,"""5828_4""",0
3,"""7186_2""",1
4,"""12128_7""",1


In [38]:
output.to_csv( "Bag_of_Words_NB_model.csv", index=False, quoting=3 )

## nb代码测试

In [167]:
X=train_data_features 
y=train["sentiment"]
# 简化成词语是否出现
X= np.minimum(X, 1)

array([2, 3, 2, ..., 2, 3, 3])

In [168]:
X_0 = X[y == 0, :] 
X_1 = X[y == 1, :]

In [146]:
X_0.shape

(12500, 5000)

In [147]:
X_1.shape

(12500, 5000)

In [148]:
X_0.sum(axis=0)

array([ 99,  38,  55, ..., 510, 408,  73])

In [169]:
parameters = {
            "likelihod": None,  # shape: (2, M) 2：2个分类，M: Vacb size 词典大小
            "prior": None,  # shape: (1)
        }
P=parameters

In [170]:
N, M = X.shape
P["likelihod"] = np.zeros((2, M))
#训练集非常两类，y=0和y=1 单独计算参数
X_0 = X[y == 0, :] 
X_1 = X[y == 1, :]
P["prior"] = X_1.shape[0] / N #prior 即为y=1的概率

In [171]:
P["prior"] 

0.5

In [174]:
#debug
P["likelihod"][0,2912]

0.6615741481362982

In [None]:
np.sum(X_0[:,2912])

24956

In [158]:
t1=np.sum(X_0,axis=0)+1

In [159]:
t1[2912]

24957

In [173]:
P["likelihod"][0,:]=(np.sum(X_0,axis=0)+1)/(X_0.shape[0]+2)
P["likelihod"][1,:]=(np.sum(X_1,axis=0)+1)/(X_1.shape[0]+2)

In [175]:
labels=[0,1]

In [188]:
#predict
X=test_data_features 
X= np.minimum(X, 1)

In [113]:
P["likelihod"][0,:].shape

(5000,)

In [114]:
X.shape

(25000, 5000)

In [80]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [81]:
X*P["likelihod"][0,:]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [189]:
tmp=(1-X)*(1-P["likelihod"][0,:])+X*P["likelihod"][0,:]  

In [120]:
tmp

array([[0.99200128, 0.9968805 , 0.99552072, ..., 0.95912654, 0.96728523,
        0.99408095],
       [0.99200128, 0.9968805 , 0.99552072, ..., 0.95912654, 0.96728523,
        0.99408095],
       [0.99200128, 0.9968805 , 0.99552072, ..., 0.95912654, 0.96728523,
        0.99408095],
       ...,
       [0.99200128, 0.9968805 , 0.99552072, ..., 0.95912654, 0.96728523,
        0.99408095],
       [0.99200128, 0.9968805 , 0.99552072, ..., 0.95912654, 0.96728523,
        0.99408095],
       [0.99200128, 0.9968805 , 0.99552072, ..., 0.95912654, 0.96728523,
        0.99408095]])

In [178]:
prior = P["prior"]

In [95]:
tmp.shape

(25000, 5000)

In [123]:
X[2384,1381]

1

In [136]:
for idx in range(5000):
    print (idx,np.log(tmp[0][idx]))

0 -0.00803088158193438
1 -0.0031243766653808185
2 -0.00448934536259444
3 -0.019220489940107652
4 -0.04365221027749922
5 -0.0024024997541451745
6 -0.004569695690065297
7 -0.00400737893025892
8 -0.016045496431588747
9 -0.06981021755846631
10 -0.01824249475213084
11 -0.008192158898629384
12 -0.00328486454889008
13 -0.006983212549490653
14 -0.00843412365361079
15 -0.02469861164851944
16 -0.011099607239750686
17 -0.009887142959947368
18 -0.0064195373725093804
19 -0.00553440353882008
20 -0.00384677505936034
21 -0.012961579038574074
22 -0.009079649376018798
23 -0.0031243766653808185
24 -0.0034453781928943973
25 -0.012313543373935308
26 -0.007547205635382966
27 -0.0028034781467826593
28 -0.008595466022466731
29 -0.004168008598904009
30 -0.006580555004287884
31 -0.0035256446776668364
32 -0.002562871817675574
33 -0.0047304157161264385
34 -0.039399900538968885
35 -0.06016480113839432
36 -0.02633970498475026
37 -0.3921956749978687
38 -0.13382853208797219
39 -0.010371952277781926
40 -0.002803478146

  print (idx,np.log(tmp[0][idx]))


2961 -0.0035256446776668364
2962 -0.006017106685763091
2963 -0.003927073770609215
2964 -0.005051933281793075
2965 -0.034089125464449435
2966 -0.004569695690065297
2967 -0.0331791696585995
2968 -0.004810785416792608
2969 -0.006661073543725125
2970 -0.009967928192450853
2971 -0.006017106685763091
2972 -0.0036059176056650313
2973 -2.570885997440356
2974 -0.03193965373739133
2975 -0.006983212549490653
2976 -0.03251790330949092
2977 -0.01344788152830057
2978 -0.00448934536259444
2979 -0.00553440353882008
2980 -0.0026430674951714718
2981 -0.0048911615772666275
2982 -0.028559461723063107
2983 -0.0064195373725093804
2984 -0.004569695690065297
2985 -0.0024024997541451745
2986 -0.005051933281793075
2987 -0.0036861969779233878
2988 -0.007466615717138072
2989 -0.3021303190615459
2990 -0.006097579868118445
2991 -0.16051655041681007
2992 -0.002562871817675574
2993 -0.0016010249977693376
2994 -0.01458350786708942
2995 -0.004087690539345106
2996 -0.07118349045474173
2997 -0.07307482526888397
2998 -0.0

In [None]:
tmp=(1-X)*(1-P["likelihod"][0,:])+X*P["likelihod"][0,:]  

In [190]:
np.where(tmp<=0)

(array([], dtype=int64), array([], dtype=int64))

In [186]:
P["likelihod"][0][2959]

0.010638297872340425

In [194]:
X[0][2959]

1

In [195]:
tmp[0][2959]

0.010638297872340425

In [180]:
np.sum(np.log(tmp),axis=1)

  np.sum(np.log(tmp),axis=1)


array([nan, nan, nan, ..., nan, nan, nan])

In [191]:
p_0=np.sum(np.log(tmp),axis=1)+np.log(prior) #shape(N*1)

In [192]:
p_0

array([-284.47707052, -256.47413668, -258.61006863, ..., -264.75211583,
       -297.1967909 , -241.89510546])

In [90]:
p_0.shape

(25000,)

In [85]:
np.log(

  np.log((1-X)*(1-P["likelihod"][0,:])+X*P["likelihod"][0,:])
  np.log((1-X)*(1-P["likelihod"][0,:])+X*P["likelihod"][0,:])


array([[-0.00795153, -0.00304463, -0.00440971, ..., -0.04165568,
        -0.03318457, -0.00585712],
       [-0.00795153, -0.00304463, -0.00440971, ..., -0.04165568,
        -0.03318457, -0.00585712],
       [-0.00795153, -0.00304463, -0.00440971, ..., -0.04165568,
        -0.03318457, -0.00585712],
       ...,
       [-0.00795153, -0.00304463, -0.00440971, ..., -0.04165568,
        -0.03318457, -0.00585712],
       [-0.00795153, -0.00304463, -0.00440971, ..., -0.04165568,
        -0.03318457, -0.00585712],
       [-0.00795153, -0.00304463, -0.00440971, ..., -0.04165568,
        -0.03318457, -0.00585712]])

In [139]:
tmp[0,2912]

-0.9962406015037595

In [196]:
tmp=(1-X)*(1-P["likelihod"][1,:])+X*P["likelihod"][1,:]  
p_1=np.sum(np.log(tmp),axis=1)+np.log(prior) #shape(N*1)


In [197]:
p_1.shape,p_0.shape

((25000,), (25000,))

In [None]:
zip(p_0,p_1)

In [200]:
res=[0 if x else 1 for x in p_0<p_1]


In [202]:
res.shape

AttributeError: 'list' object has no attribute 'shape'