# 获取电影评论数据集

In [1]:
import pyprind
import pandas as pd
import os
import numpy as np
pbar = pyprind.ProgBar(50000) #生成一个50000次迭代的进度条
labels = {'pos':1,'neg':0}
df = pd.DataFrame()
home_path='/Users/fannian/'

对语料库文件夹进行遍历，将测试集test\训练集train\正例样本集pos\负例样本集neg导入dataFrame
os.listdir(path) 会返回包含路径path下所有文件的列表
os.path.join(path,file) 会返回一个path和file的组合地址(用/连接)

In [2]:
for s in ('test','train'):
    for l in ('pos','neg'):
        path = home_path+'tool_data/aclImdb/%s/%s' % (s,l)
        for file in os.listdir(path):
            with open(os.path.join(path,file), 'r') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()#更新进度条

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:05:28


np.random.permutation()对数据结构(df)的索引编号进行随机打乱
df.reindex(newindex) 用新的索引编号替换原有索引，若原有索引数据不存在，则为NaN

In [3]:
df.columns = ['review', 'sentiment']
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv(home_path+'tool_data/outdata/movie_data.csv',index = False)

# 词袋模型

In [4]:
import nltk
import re
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

## 文本转稀疏向量矩阵、TF-IDF计算单词重要性

In [5]:
count = CountVectorizer()#词频：训练后生成一个词频字典
tfidf = TfidfTransformer()
np.set_printoptions(precision=2)#设置numpy显示
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet'
])
bag = count.fit_transform(docs)#生成特征词向量的压缩行稀疏矩阵
print(count.vocabulary_) #返回一个词频字典
tff = tfidf.fit_transform(bag) #计算词向量矩阵的TFIDF矩阵，并进行归一化处理
bag.toarray()#将压缩行系数矩阵转为ndarray格式

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}


array([[0, 1, 1, 1, 0, 1, 0],
       [0, 1, 0, 0, 1, 1, 1],
       [1, 2, 1, 1, 1, 2, 1]], dtype=int64)

In [6]:
tff.toarray()

array([[0.  , 0.43, 0.56, 0.56, 0.  , 0.43, 0.  ],
       [0.  , 0.43, 0.  , 0.  , 0.56, 0.43, 0.56],
       [0.4 , 0.48, 0.31, 0.31, 0.31, 0.48, 0.31]])

## 清洗文本数据

In [7]:
df = pd.read_csv(home_path+'tool_data/outdata/movie_data.csv')
dr = df.loc[0,'review'][-50:]
def preprocessor(text):
    text = re.sub('<[^>]*>','',text)#正则表达式将Html符号替换掉
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text)#找出表情符号并以列表形式存储
    #替换所有非单词字符,将文本转换为小写，并将表情符号追加到文本之后（删除代表鼻子的字符-）
    text = re.sub('[\W]+',' ',text.lower()) + ''.join(emoticons).replace('-','')
    return text
preprocessor(dr)

'to star cinema way to go jericho and claudine '

In [8]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :):(:)'

In [9]:
df['review'] = df['review'].apply(preprocessor)

## 标记文档

In [10]:
porter = PorterStemmer() #词干提取算法

def tokenizer_porter(text):
    """以空格符分割字符串，返回一个单词列表,并对该列表每个元素进行词干提取"""
    return [porter.stem(word) for word in text.split()]
def tokenizer(text):
    """以空格符分割字符串，返回一个单词列表"""
    return text.split()

tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [11]:
stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

# 逻辑回归应用于文档分类

In [12]:
X = df.loc[:,'review'].values
y = df.loc[:,'sentiment'].values
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.5)
nb = MultinomialNB(alpha=0.1)
lr = LogisticRegression(C=10.0,penalty='l2')
tfidf_vect =  TfidfVectorizer(stop_words=None,tokenizer=tokenizer)
nb_tfidf = Pipeline([('vect',tfidf_vect),('clf',lr)])
nb_tfidf.fit(X_train,y_train)
nb_tfidf.score(X_test,y_test)



0.89872

# 在线算法与外存学习

In [13]:
def tokenizer(text):
    '''清理movie_data.csv文件中未经处理的文本'''
    text =re.sub('<[^>]*>','',text)
    emoticons=re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text.lower())
    text=re.sub('[\W]+',' ',text.lower()) + ' '.join(emoticons).replace('-','')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

def stream_docs(path):
    '''每次读取且返回一个文档的内容'''
    with open(path,'r') as csv:
        next(csv)
        for line in csv:
            text,label = line[:-3],int(line[-2])
            yield text,label
 
print (next(stream_docs(path=home_path+'tool_data/outdata/movie_data.csv'))[0][:10])

def get_minibatch(doc_stream,size):
    '''以stream_doc函数得到的文档数据流作为输入，
    并通过参数size返回指定数量的文档内容'''
    docs,y=[],[]
    try:
        for _ in range(size):
            text,label=next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None,None
    return docs,y

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
vect = HashingVectorizer(decode_error='ignore',n_features=2**21,
                         preprocessor=None,tokenizer=tokenizer)
clf = SGDClassifier(loss='log',random_state=1)
doc_stream = stream_docs(path=home_path+'tool_data/outdata/movie_data.csv')

pbar=pyprind.ProgBar(45)
classes = np.array([0,1])
for _ in range(45):
    X_train,y_train = get_minibatch(doc_stream,size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train,y_train,classes=classes)
    pbar.update
    
X_test,y_test=get_minibatch(doc_stream,size=5000)
X_test =vect.transform(X_test)
print ('Accuracy: %.3f' % clf.score(X_test,y_test))

"My family
Accuracy: 0.868


# 序列化通过scikit-learn拟合的模型

In [17]:
import pickle
import os
dest = os.path.join('movieclassifier','pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)
pickle.dump(stop,open(os.path.join(dest,'stopwords.pkl'),'wb'),protocol=4)#protocol协议
pickle.dump(clf,open(os.path.join(dest,'classifier.pkl'),'wb'),protocol=4)