# Sentiment classification on large movie reviews

这里我们使用 __[imdb Moview Reviews](http://ai.stanford.edu/~amaas/data/sentiment/)__ 的数据集，通过 Logistic Regression 算法进行情感分类(**Postive / Negative**)的划分。

## 初始化

In [23]:
import datetime

import numpy as np
from pandas import DataFrame
from os import listdir
from os.path import join

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from nltk.stem.snowball import EnglishStemmer

from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

## 准备 Training set & Test set

In [2]:
# 根据路径(full path)，读取文件中的文本内容
def read_text(path):
    return open(path, 'r').read()


# 遍历指定目录，返回该目录下所有文件的 Path, Text 文本内容键值对的列表
def read_files(directory):
    for path in [join(directory, f) for f in listdir(directory)]:
        yield path, read_text(path)


# 根据指定目录下所有文件和分类定义，构建 DataFrame 数据集
def build_data_frame(path, classification):
    rows = []
    index = []
    for file_name, text in read_files(path):
        rows.append({'text': text, 'class': classification})
        index.append(file_name)

    data_frame = DataFrame(rows, index=index)
    return data_frame

In [5]:
# 定义情感分类的 Labels
POS = 'positive'
NEG = 'negative'

SOURCES = [
    ('aclImdb/train/pos/', POS),
    ('aclImdb/train/neg/', NEG)
]
# 准备训练数据
train = DataFrame({'text': [], 'class': []})
for path, classification in SOURCES:
    train = train.append(build_data_frame(path, classification))

# 将训练数据随机打乱
train = train.reindex(np.random.permutation(train.index))

TEST_SOURCES = [
    ('aclImdb/test/pos/', POS),
    ('aclImdb/test/neg/', NEG)
]
# 准备测试数据
test = DataFrame({'text': [], 'class': []})
for path, classification in TEST_SOURCES:
    test = test.append(build_data_frame(path, classification))
    
print(train.sample(10))
print(test.sample(10))

                                  class  \
aclImdb/train/pos/7923_8.txt   positive   
aclImdb/train/pos/4567_10.txt  positive   
aclImdb/train/pos/6347_9.txt   positive   
aclImdb/train/neg/1037_1.txt   negative   
aclImdb/train/neg/7389_2.txt   negative   
aclImdb/train/neg/11143_1.txt  negative   
aclImdb/train/pos/4680_10.txt  positive   
aclImdb/train/pos/5630_8.txt   positive   
aclImdb/train/neg/2548_1.txt   negative   
aclImdb/train/neg/9350_1.txt   negative   

                                                                            text  
aclImdb/train/pos/7923_8.txt   While visiting Romania with his CIA dad, Tony(...  
aclImdb/train/pos/4567_10.txt  My left foot is an epic outstanding film expla...  
aclImdb/train/pos/6347_9.txt   How The Grinch Stole Christmas instantly stole...  
aclImdb/train/neg/1037_1.txt   That this poor excuse for an amateur hour show...  
aclImdb/train/neg/7389_2.txt   I buy or at least watch every Seagall movie. H...  
aclImdb/train/neg/11143_1.tx

In [12]:
print('Pos: ', len(train[train['class'] == POS]))
print('Neg: ', len(train[train['class'] == NEG]))

Pos:  12500
Neg:  12500


可以看出 **Positive & Negative** 正反两类的训练数据刚好相等，因此可以不用针对分类的数据个数做 normalization.

In [13]:
# 定义分类算法的执行封装，并打印不同分类算法的耗时等信息
# 根据之前的文本处理经验，默认使用 TfidfVectorizer 进行分词标记处理。
def do_classify(tag, classifier, vectorizer=TfidfVectorizer()):
    text_clf = Pipeline([('tfidf', vectorizer),
                         (tag, classifier)])
    print(tag + " start...")
    start_time = datetime.datetime.now()
    text_clf = text_clf.fit(train['text'], train['class'])
    end_time = datetime.datetime.now()
    print("   training time: " + str(end_time - start_time))
    predicted = text_clf.predict(test['text'])
    end_time = datetime.datetime.now()
    print("   classification time: " + str(end_time - start_time))
    print("   accuracy: ", np.mean(predicted == test['class']))

## 使用 Logistic Regression 处理

#### 标准算法处理

In [14]:
do_classify("LogisticRegression", LogisticRegression(solver='liblinear'))

LogisticRegression start...
   training time: 0:00:07.582187
   classification time: 0:00:13.410926
   accuracy:  0.88312


#### 将之前的分词由单词扩展为至多由2个单词组成的词组的形式

In [16]:
do_classify("LogisticRegression", LogisticRegression(solver='liblinear'), TfidfVectorizer(ngram_range=(1, 2)))

LogisticRegression start...
   training time: 0:00:34.893012
   classification time: 0:00:50.044682
   accuracy:  0.88628


可以看出由于对分词的扩展造成了耗时的显著增加，然后准确率的提升并不明显

#### 加入对英文 Stemming 处理

In [18]:
stemmer = EnglishStemmer()
analyzer = TfidfVectorizer().build_analyzer()


def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))


stem_vectorizer = TfidfVectorizer(analyzer=stemmed_words, ngram_range=(1, 2))

do_classify("LogisticRegression", LogisticRegression(solver='liblinear'), stem_vectorizer)

LogisticRegression start...
   training time: 0:01:44.900171
   classification time: 0:03:27.072241
   accuracy:  0.8812


可以看出 Stemming 仍然未对结果有一些正面影响

#### 尝试通过调参优化结果
这里我尝试通过调整正则化项即罚函数（**Inverse of regularization strength**）进行优化，该项对模型向量进行“惩罚”，从而避免单纯最小二乘问题的过拟合问题。


In [34]:
# Choose the type of classifier.
clf = Pipeline([('tfidf', TfidfVectorizer()), ('lr', LogisticRegression(solver='liblinear'))])

# Choose some parameter combinations to try
parameters = {'lr__C': (0.01, 0.1, 1.0, 10.0, 100.0)}

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)

# Run the grid search
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(train['text'], train['class'])

# Set the clf to the best combination of parameters
clf = grid_obj.best_estimator_

In [40]:
# 打印最佳的参数
print(grid_obj.best_estimator_.get_params())
# 用最佳方案预测结果
predicted = clf.predict(test['text'])
print("   accuracy: ", np.mean(predicted == test['class']))

{'memory': None, 'steps': [('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('lr', LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))], 'tfidf': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_ra

可以看出结果仍然没有得到什么优化。

### 结论

Logistic Regression 进行情感分类的算法准确率最后稳定在 88% 左右，通过一些数据处理和参数调优仍未发现有较大的优化空间。
后续希望随着课程的深入，通过 Deep Learning 的算法优化。