In [1]:
import pyprind
import pandas as pd
import os
# Read every review file under <basepath>/{test,train}/{pos,neg} into one DataFrame.
# Each row is [review text, label]; the label comes from the folder name (pos -> 1, neg -> 0).
basepath = 'D:/My Documents/GitHub/Quantitative/Python Machin Learning_code/aclImdb'
labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)

# Collect rows in a plain list and build the frame once at the end.
# Growing a DataFrame with df.append() inside the loop copies the whole frame on
# every iteration (quadratic time), and DataFrame.append was removed in pandas 2.0.
records = []
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        # os.listdir() order is platform-dependent; sort it so the seeded shuffle
        # applied later yields the same row order on every machine.
        for fname in sorted(os.listdir(path)):
            with open(os.path.join(path, fname), 'r', encoding='utf8') as infile:
                records.append([infile.read(), labels[l]])
            pbar.update()

# Default integer column labels (0, 1) are kept; they are renamed in the next cell.
df = pd.DataFrame(records)

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:46


In [2]:
# Give the two positional columns meaningful names: the review text and its 0/1 sentiment label.
df.columns = ['review','sentiment']
# Preview the first rows to confirm the frame was assembled as expected.
df.head()

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1


In [3]:
import numpy as np
# Shuffle the rows with a fixed seed so the order is repeatable, then persist the result as CSV.
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)
df.to_csv('./movie_data.csv', encoding='utf8', index=False)

In [4]:
# Reload the shuffled data set from the CSV file; this also resets the index to 0..n-1
# because the file was written with index=False.
df = pd.read_csv('./movie_data.csv')
# Preview the first rows of the reloaded frame.
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


## 词袋模型

In [5]:
# Convert words into feature vectors (bag-of-words model)
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
# Three tiny example documents; the third one is the concatenation of the first two.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet'
])
# Learn the vocabulary and build the sparse document-term count matrix in one step.
bag = count.fit_transform(docs)
# vocabulary_ maps each word to its column index in the count matrix.
print(count.vocabulary_)

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}


In [6]:
# Dense view of the count matrix: one row per document, one column per vocabulary index.
print(bag.toarray())

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


### 通过词频-逆文档频率(term frequency-inverse document frequency, tf-idf)计算单词关联度

$$\text{tf-idf}(t,d) = tf(t,d) \times idf(t,d)$$
tf: 词频
idf: 逆文档频率
$$idf(t,d)=\log\frac{n_d}{1+df(d,t)}$$
其中，n<sub>d</sub>为文档总数,df(d,t)为包含词汇t的文档d的数量<p>
scikit-learn计算有所不同，
$$idf(t,d)=\log\frac{1+n_d}{1+df(d,t)}$$
$$\text{tf-idf}(t,d) = tf(t,d)\times (idf(t,d)+1)$$
通常的做法是在计算tf-idf之前先对原始词频做归一化；而scikit-learn的TfidfTransformer默认(norm='l2')是直接对计算得到的tf-idf向量做l2归一化：
$$v_{norm}=\frac{v}{\left\|v\right\|_2} = \frac{v}{\sqrt{v_1^2 + v_2^2 + \dots + v_n^2}}$$

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the raw counts in `bag` with tf-idf and show the result in both
# sparse (coordinate) and dense form.
tfidf = TfidfTransformer()
np.set_printoptions(precision=2)
# Fit and transform once and reuse the matrix; the original ran fit_transform
# twice only to print two views of the same result.
tfidf_matrix = tfidf.fit_transform(bag)
print(tfidf_matrix)
print(tfidf_matrix.toarray())

  (0, 5)	0.433707859509
  (0, 3)	0.558477835371
  (0, 1)	0.433707859509
  (0, 2)	0.558477835371
  (1, 5)	0.433707859509
  (1, 1)	0.433707859509
  (1, 6)	0.558477835371
  (1, 4)	0.558477835371
  (2, 5)	0.478101718197
  (2, 3)	0.307821505665
  (2, 1)	0.478101718197
  (2, 2)	0.307821505665
  (2, 6)	0.307821505665
  (2, 4)	0.307821505665
  (2, 0)	0.404748288093
[[ 0.    0.43  0.56  0.56  0.    0.43  0.  ]
 [ 0.    0.43  0.    0.    0.56  0.43  0.56]
 [ 0.4   0.48  0.31  0.31  0.31  0.48  0.31]]


In [8]:
# Clean the text data
# Look at the last 50 characters of the first review to see what needs cleaning (e.g. HTML tags).
df.loc[0,'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [9]:
import re
def preprocessor(text):
    """Strip markup and punctuation from a review while keeping its emoticons.

    Steps:
      1. Remove HTML/XML tags (anything between ``<`` and ``>``).
      2. Collect emoticons such as ``:)``, ``:-(``, ``;D`` or ``=P`` from the
         tag-free text.
      3. Lower-case the text and collapse every run of non-word characters
         into a single space.
      4. Append the emoticons (with the ``-`` nose removed), separated by
         single spaces, to the end of the cleaned text.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned, lower-cased text followed by any emoticons found.
    """
    # Raw strings: '\)' and '\W' are invalid escapes in ordinary string literals.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    # Make sure the first emoticon is not glued to the last word when the
    # cleaned text happens to end with a word character.
    if emoticons and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + ' '.join(emoticons).replace('-', '')

# Sanity check: clean the last 50 characters of the first review.
preprocessor(df.loc[0,'review'][-50:])

'is seven title brazil not available'

In [10]:
# Sanity check: a tag is removed and the emoticons are moved to the end of the string.
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [11]:
# Clean every review in place. Note: this overwrites the raw text, so re-running
# this cell applies the preprocessor to already-cleaned text.
df['review'] = df['review'].apply(preprocessor)

In [12]:
# Tokenize documents; word stemming reduces words to their root form
def tokenizer(text):
    """Split `text` on whitespace and return the resulting list of tokens."""
    return text.split()

from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_poter(text):
    """Split `text` on whitespace and return the Porter stem of each token, in order."""
    tokens = text.split()
    return list(map(porter.stem, tokens))
# Compare the two tokenizers on the same sentence. Only the value of the last
# expression (the stemmed tokens) is displayed as the cell output.
tokenizer('runners like running and thus they run')
tokenizer_poter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [13]:
# Stop-word removal: common stop words include "is", "and", "has", etc.
import nltk
# Fetch the NLTK stop-word corpus (skipped by NLTK if it is already up to date).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\donaldxu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
from nltk.corpus import stopwords
# English stop-word list; also reused later as a grid-search option.
stop = stopwords.words('english')
# Stem the sample sentence, then keep only the tokens that are not stop words.
stemmed = tokenizer_poter('a runner likes running and runs a lot')
[token for token in stemmed if token not in stop]

['runner', 'like', 'run', 'run', 'lot']

## 训练模型

In [None]:
# Split the shuffled reviews: the first 25,000 rows are for training, the rest for testing.
# Positional .iloc slicing is end-exclusive. The previous label-based
# df.loc[:25000, ...] is end-INCLUSIVE, so it produced 25,001 training rows and
# put row 25000 into both the training and the test set (data leakage).
n_train = 25000
X_train = df['review'].iloc[:n_train].values
y_train = df['sentiment'].iloc[:n_train].values
X_test = df['review'].iloc[n_train:].values
y_test = df['sentiment'].iloc[n_train:].values

In [None]:
# Grid-search a TF-IDF + logistic-regression pipeline with 5-fold cross-validated accuracy.
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module was deprecated in scikit-learn 0.18 and removed in 0.20.
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# The reviews were already lower-cased and cleaned by `preprocessor`, so the
# vectorizer's own lower-casing / preprocessing is switched off.
tfidf = TfidfVectorizer(strip_accents=None,lowercase=False,preprocessor=None)

# Two sub-grids over the same tokenizer / stop-word / regularization choices:
#   1) default tf-idf weighting with normalization;
#   2) idf and normalization disabled (use_idf=False, norm=None), i.e. raw term frequencies.
param_grid = [{'vect__ngram_range':[(1,1)],
              'vect__stop_words':[stop,None],
              'vect__tokenizer':[tokenizer,tokenizer_poter],
              'clf__penalty':['l1','l2'],
              'clf__C':[1.0,10.0,100.0]},
             {'vect__ngram_range':[(1,1)],
              'vect__stop_words':[stop,None],
              'vect__use_idf':[False],
              'vect__norm':[None],
              'vect__tokenizer':[tokenizer,tokenizer_poter],
              'clf__penalty':['l1','l2'],
              'clf__C':[1.0,10.0,100.0]}]

# The grid includes penalty='l1', which the current default solver (lbfgs) does
# not support. liblinear handles both 'l1' and 'l2' and was the default solver
# in older scikit-learn releases, so pinning it keeps the original behavior.
lr_tfidf = Pipeline([('vect',tfidf),('clf',LogisticRegression(random_state=0,solver='liblinear'))])

# NOTE(review): the recorded output stops right after the "Fitting ..." line; if the
# parallel search hangs (e.g. on Windows with notebook-defined tokenizers), try n_jobs=1 -- unverified.
gs_lr_tfidf = GridSearchCV(lr_tfidf,param_grid,scoring='accuracy',cv=5,verbose=1,n_jobs=-1)
gs_lr_tfidf.fit(X_train,y_train)
print('Best parameter set: %s' % gs_lr_tfidf.best_params_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits




In [None]:
# Show the best parameter combination found by the grid search (requires the fit above to have finished).
gs_lr_tfidf.best_params_