# 基于机器学习的文本分类    



## 读取数据与处理

In [1]:
import pandas as pd
import numpy as np
import warnings
# Suppress every warning category for the rest of the session. Note that this
# also hides deprecation notices and other diagnostics emitted by the
# libraries used in the cells below.
warnings.filterwarnings('ignore')

In [2]:
# Load the tab-separated training file (path is relative to the notebook's
# working directory) and preview its first rows.
train_df = pd.read_csv('input/train.tsv', delimiter='\t')
train_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [3]:
# Model input is the raw phrase text; the target is the sentiment label.
X_all, y_all = train_df['Phrase'], train_df['Sentiment']
# Number of samples available before any splitting.
X_all.shape[0]

156060

In [4]:
from sklearn.model_selection import train_test_split

# First split: keep 80% of all samples for training, hold out the rest as the
# test set. The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, train_size=0.8, random_state=42
)
# Second split: carve a validation set out of the training portion, again
# keeping 80% of it for training.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, train_size=0.8, random_state=42
)
# Report the size of the full data set followed by each partition.
for subset in (X_all, X_train, X_valid, X_test):
    print(subset.shape)

(156060,)
(99878,)
(24970,)
(31212,)


In [5]:
# From here on only a train/test split is used (no validation set).
# This reassigns X_train / y_train to the full 80% training portion; the
# X_valid / y_valid variables created by the previous split are left as-is
# and no longer correspond to a held-out part of X_train.
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    train_size=0.8,
    random_state=42,
)

## 提取特征

In [6]:
# Bag-of-words term counts.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
# Learn the vocabulary on the training text only, then reuse that same
# vocabulary to encode the test text.
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)
print(X_train_counts.shape, X_test_counts.shape, sep='\n')

(124848, 15228)
(31212, 15228)


In [7]:
# TF-IDF features over single words, limited to 10000 features.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=10000)
# Fit on the training text and encode it in one step; the test text is
# encoded with the already-fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
print(X_train_tfidf.shape, X_test_tfidf.shape, sep='\n')

(124848, 10000)
(31212, 10000)


In [8]:
# TF-IDF features over word bigrams only (ngram_range=(2, 2)), limited to
# 10000 features.
vectorizer = TfidfVectorizer(
    ngram_range=(2, 2),
    max_features=10000,
).fit(X_train)
# Encode both partitions with the vectorizer fitted on the training text.
X_train_tfidf_ngram = vectorizer.transform(X_train)
X_test_tfidf_ngram = vectorizer.transform(X_test)
for encoded in (X_train_tfidf_ngram, X_test_tfidf_ngram):
    print(encoded.shape)

(124848, 10000)
(31212, 10000)


### 合并特征    
可以尝试不同的特征组合进行测试。

In [9]:
from scipy.sparse import hstack

# Concatenate the three feature matrices column-wise so each sample is
# described by its term counts, unigram TF-IDF and bigram TF-IDF values.
# The blocks are stacked in the same order for train and test so that the
# columns line up.
#
# format='csr' is requested explicitly: without it hstack may return a COO
# matrix, which supports no row indexing/slicing and would be converted again
# on every fit/predict call. Shapes and values are unaffected.
train_features = hstack(
    [X_train_counts, X_train_tfidf, X_train_tfidf_ngram],
    format='csr',
)
test_features = hstack(
    [X_test_counts, X_test_tfidf, X_test_tfidf_ngram],
    format='csr',
)
print(train_features.shape)

(124848, 35228)


## 模型    
可以尝试不同的模型和学习率等超参数。

In [10]:
# Logistic regression with the library's default hyperparameters.
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training can be
# chained into a single assignment.
clf = LogisticRegression().fit(train_features, y_train)
# Show the fitted estimator's configuration as the cell output.
clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### 评估

In [11]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Predict labels for the held-out test features.
y_pred = clf.predict(test_features)
# Print, in order: the per-class precision/recall/F1 report, the confusion
# matrix, and the overall accuracy.
for metric in (classification_report, confusion_matrix, accuracy_score):
    print(metric(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.57      0.31      0.40      1416
          1       0.57      0.43      0.49      5527
          2       0.69      0.87      0.77     15639
          3       0.60      0.49      0.54      6707
          4       0.61      0.37      0.46      1923

avg / total       0.64      0.66      0.64     31212

[[  441   699   261    15     0]
 [  274  2356  2731   158     8]
 [   56   904 13657   979    43]
 [    7   137  2865  3291   407]
 [    1    11   182  1020   709]]
0.6553248750480585
