## 库和数据

In [97]:
from sklearn.model_selection import  train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB     # 从sklean.naive_bayes里导入朴素贝叶斯模型
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier, VotingClassifier, BaggingClassifier
import os
from tqdm.notebook import tqdm
import pandas as pd

In [98]:
PATH = '../外部数据/ChnSentiCorp情感分析酒店评论/'
RANDOM_SEED = 42
posTxt = os.listdir(PATH+'正面')
negTxt = os.listdir(PATH+'负面')

posList = []
negList = []
for i,j in tqdm(zip(posTxt, negTxt)): # 刚好都是2000个就zip了
    posList.append(open(PATH+'正面/'+i, encoding='utf-8').read().replace('\n',''))
    negList.append(open(PATH+'负面/'+j, encoding='utf-8').read().replace('\n',''))

pos = pd.DataFrame(posList, columns=['text'])
pos['flag'] = 1
neg = pd.DataFrame(negList, columns=['text'])
neg['flag'] = 0
all_data = pd.concat([pos, neg]).reset_index(drop=True)

0it [00:00, ?it/s]

## 作业

### 数据

In [99]:
# 数据预处理：训练集和测试集分割，文本特征向量化
X_train,X_test,y_train,y_test = train_test_split(all_data.text,all_data.flag,test_size=0.25,random_state=RANDOM_SEED) # 随机采样25%的数据样本作为测试集

# tfidf文本特征向量化
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

### 原始数据|原始参数|分类
**注意**：使用交叉验证后分数与原始分数有较大的变化0.8->0.6，不知道原因

In [100]:
print('3-fold CV Accuarcy:')
# 逻辑回归
lr = LogisticRegression(random_state=RANDOM_SEED)
lr.fit(X_train,y_train)
scores = cross_val_score(lr, X_test, y_test, cv=3, scoring='accuracy')
print("   LR:\t%0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

# SVC
svc = SVC(random_state=RANDOM_SEED)
svc.fit(X_train,y_train)
scores = cross_val_score(svc, X_test, y_test, cv=3, scoring='accuracy')
print("   SVC:\t%0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

# 朴素贝叶斯
mnb = MultinomialNB()
mnb.fit(X_train,y_train)
scores = cross_val_score(mnb, X_test, y_test, cv=3, scoring='accuracy')
print("   NB:\t%0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

# KNN
knn = KNeighborsClassifier(n_neighbors=1) # 在本任务里越小越好
knn.fit(X_train,y_train)
scores = cross_val_score(knn, X_test, y_test, cv=3, scoring='accuracy')
print("   KNN:\t%0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

# 随机森林
rf = RandomForestClassifier(random_state=RANDOM_SEED)
rf.fit(X_train,y_train)
scores = cross_val_score(rf, X_test, y_test, cv=3, scoring='accuracy')
print("   RF:\t%0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

# 梯度提升树
GB = GradientBoostingClassifier(random_state=RANDOM_SEED)
GB.fit(X_train,y_train)
scores = cross_val_score(GB, X_test, y_test, cv=3, scoring='accuracy')
print("   GB:\t%0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

# AdaBoost
Ada = AdaBoostClassifier(random_state=RANDOM_SEED)
Ada.fit(X_train,y_train)
scores = cross_val_score(Ada, X_test, y_test, cv=3, scoring='accuracy')
print("   Ada:\t%0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

3-fold CV Accuarcy:
   LR:	0.630 (+/- 0.019)
   SVC:	0.630 (+/- 0.030)
   NB:	0.639 (+/- 0.010)
   KNN:	0.552 (+/- 0.020)
   RF:	0.651 (+/- 0.018)
   GB:	0.587 (+/- 0.009)
   Ada:	0.560 (+/- 0.013)


### 特征选择数据|原始参数|分类

In [34]:
select_model = SelectKBest(chi2, k=1000) # 选择k个最佳特征
X_train_k = select_model.fit_transform(X_train, y_train)
X_test_k = select_model.transform(X_test)

In [35]:
print('3-fold CV Accuracy:')
# 逻辑回归
lr = LogisticRegression(random_state=RANDOM_SEED)
lr.fit(X_train_k,y_train)
y_pred = lr.predict(X_test)
scores = cross_val_score(lr, X_test_k, y_test, cv=3, scoring='accuracy')
print("   LR:\t%0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

# SVC
svc = SVC(random_state=RANDOM_SEED)
svc.fit(X_train_k,y_train)
scores = cross_val_score(svc, X_test_k, y_test, cv=3, scoring='accuracy')
print("   SVC:\t%0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

# 朴素贝叶斯
mnb = MultinomialNB()
mnb.fit(X_train_k,y_train)
scores = cross_val_score(mnb, X_test_k, y_test, cv=3, scoring='accuracy')
print("   NB:\t%0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

# 随机森林
rf = RandomForestClassifier(random_state=RANDOM_SEED)
rf.fit(X_train_k,y_train)
scores = cross_val_score(rf, X_test_k, y_test, cv=3, scoring='accuracy')
print("   RF:\t%0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

# 梯度提升树
GB = GradientBoostingClassifier(random_state=RANDOM_SEED)
GB.fit(X_train_k,y_train)
scores = cross_val_score(GB, X_test_k, y_test, cv=3, scoring='accuracy')
print("   GB:\t%0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

# AdaBoost
Ada = AdaBoostClassifier(random_state=RANDOM_SEED)
Ada.fit(X_train_k,y_train)
scores = cross_val_score(Ada, X_test_k, y_test, cv=3, scoring='accuracy')
print("   Ada:\t%0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

3-fold Accuracy:
   LR:	0.568 (+/- 0.013)
   SVC:	0.619 (+/- 0.010)
   NB:	0.569 (+/- 0.015)
   RF:	0.636 (+/- 0.021)
   GB:	0.572 (+/- 0.012)
   Ada:	0.575 (+/- 0.008)


### 简单集成

#### 原始数据

In [75]:
# 投票
voting = VotingClassifier(
    estimators=list(zip(['LR','SVC','NB','RF','GB','Ada'],[lr, svc, mnb, rf, GB, Ada])), 
    voting='hard',
    verbose=False,
)
voting.fit(X_train, y_train)
scores = cross_val_score(voting, X_test, y_test, scoring='accuracy')
print("   Voting:\t%0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

# Boosting看GB和Ada效果不好就不做了

# stacking
stacking = StackingClassifier(
    estimators=list(zip(['LR','SVC','NB','RF','GB','Ada'],[lr, svc, mnb, rf, GB, Ada])),
    final_estimator=lr,
    cv=3,
    verbose=False,
    n_jobs=4,
)
stacking.fit(X_train, y_train)
scores = cross_val_score(stacking, X_test, y_test, scoring='accuracy')
print("   Stacking:\t%0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

# bagging 看rf效果不错，应该是bagging对本任务比较有效
## lr
bagging_lr = BaggingClassifier(
    base_estimator=lr,
    random_state=RANDOM_SEED,
    verbose=False,
    n_jobs=4,
)
bagging_lr.fit(X_train, y_train)
scores = cross_val_score(bagging_lr, X_test, y_test, scoring='accuracy')
print("   Bagging_lr:\t%0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

## SVC
bagging_svc = BaggingClassifier(
    base_estimator=svc,
    random_state=RANDOM_SEED,
    verbose=False,
    n_jobs=4,
)
bagging_svc.fit(X_train, y_train)
scores = cross_val_score(bagging_svc, X_test, y_test, scoring='accuracy')
print("   Bagging_svc:\t%0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

## NB
bagging_mnb = BaggingClassifier(
    base_estimator=mnb,
    random_state=RANDOM_SEED,
    verbose=False,
    n_jobs=4,
)
bagging_mnb.fit(X_train, y_train)
scores = cross_val_score(bagging_mnb, X_test, y_test, scoring='accuracy')
print("   Bagging_nb:\t%0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

   Voting:	0.644 (+/- 0.028)
   Stacking:	0.672 (+/- 0.011)
   Bagging_lr:	0.643 (+/- 0.029)
   Bagging_svc:	0.640 (+/- 0.027)
   Bagging_nb:	0.646 (+/- 0.031)


#### 特征选择后数据

In [74]:
# 投票
voting = VotingClassifier(
    estimators=list(zip(['LR','SVC','NB','RF','GB','Ada'],[lr, svc, mnb, rf, GB, Ada])), 
    voting='hard',
    verbose=False,
)
voting.fit(X_train_k, y_train)
scores = cross_val_score(voting, X_test_k, y_test, scoring='accuracy')
print("   Voting:\t%0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

# stacking
stacking = StackingClassifier(
    estimators=list(zip(['LR','SVC','NB','RF','GB','Ada'],[lr, svc, mnb, rf, GB, Ada])),
    final_estimator=lr,
    cv=3,
    verbose=False,
    n_jobs=4,
)
stacking.fit(X_train_k, y_train)
scores = cross_val_score(stacking, X_test_k, y_test, scoring='accuracy')
print("   Stacking:\t%0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

# bagging 看rf效果不错，应该是bagging对本任务比较有效
## lr
bagging_lr = BaggingClassifier(
    base_estimator=lr,
    random_state=RANDOM_SEED,
    verbose=False,
    n_jobs=4,
)
bagging_lr.fit(X_train_k, y_train)
scores = cross_val_score(bagging_lr, X_test_k, y_test, scoring='accuracy')
print("   Bagging_lr:\t%0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

## SVC
bagging_svc = BaggingClassifier(
    base_estimator=svc,
    random_state=RANDOM_SEED,
    verbose=False,
    n_jobs=4,
)
bagging_svc.fit(X_train_k, y_train)
scores = cross_val_score(bagging_svc, X_test_k, y_test, scoring='accuracy')
print("   Bagging_svc:\t%0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

## NB
bagging_mnb = BaggingClassifier(
    base_estimator=mnb,
    random_state=RANDOM_SEED,
    verbose=False,
    n_jobs=4,
)
bagging_mnb.fit(X_train_k, y_train)
scores = cross_val_score(bagging_mnb, X_test_k, y_test, scoring='accuracy')
print("   Bagging_nb:\t%0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

   Voting:	0.641 (+/- 0.027)
   Stacking:	0.668 (+/- 0.030)
   Bagging_lr:	0.614 (+/- 0.032)
   Bagging_svc:	0.613 (+/- 0.019)
   Bagging_nb:	0.585 (+/- 0.020)


其实挺不理解特征维度远大于数据时模型如何妥善处理的。。

### 结果报告
只看效果较好的

In [96]:
print('LR')
y_pred = lr.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['0','1']))
print('------------------------------------------------------')

print('NB')
y_pred = mnb.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['0','1']))
print('------------------------------------------------------')

print('SVC')
y_pred = svc.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['0','1']))
print('------------------------------------------------------')

print('RF')
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['0','1']))
print('------------------------------------------------------')

print('Voting')
y_pred = voting.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['0','1']))
print('------------------------------------------------------')

print('Stacking')
y_pred = stacking.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['0','1']))
print('------------------------------------------------------')

print('mnb')
y_pred = mnb.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['0','1']))
print('------------------------------------------------------')

print('Bagging_LR')
y_pred = bagging_lr.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['0','1']))
print('------------------------------------------------------')

print('Bagging_SVC')
y_pred = bagging_svc.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['0','1']))
print('------------------------------------------------------')

print('Bagging_NB')
y_pred = bagging_mnb.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['0','1']))
print('------------------------------------------------------')

LR
              precision    recall  f1-score   support

           0       0.74      0.92      0.82       476
           1       0.91      0.71      0.80       524

    accuracy                           0.81      1000
   macro avg       0.83      0.82      0.81      1000
weighted avg       0.83      0.81      0.81      1000

------------------------------------------------------
NB
              precision    recall  f1-score   support

           0       0.76      0.88      0.82       476
           1       0.88      0.75      0.81       524

    accuracy                           0.81      1000
   macro avg       0.82      0.82      0.81      1000
weighted avg       0.82      0.81      0.81      1000

------------------------------------------------------
SVC
              precision    recall  f1-score   support

           0       0.73      0.94      0.82       476
           1       0.93      0.69      0.79       524

    accuracy                           0.81      1000
   macro