In [1]:
import numpy as np
import pandas as pd
import sklearn

In [2]:
# Read the tab-separated training split, then pull out the phrase text
# (model input) and the sentiment label (target) as separate Series.
train_tsv = pd.read_csv('./data/train.tsv', delimiter='\t')
x_train, y_train = (train_tsv[column] for column in ('Phrase', 'Sentiment'))

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: learn the vocabulary from the training phrases and
# encode every phrase as a sparse row of token counts (default settings).
counter = CountVectorizer()
train_counts = counter.fit_transform(x_train)

In [4]:
# Display the token -> feature-column index mapping learned by the
# bag-of-words vectorizer (the whole dictionary is shown as the cell output).
counter.vocabulary_

{'series': 11837,
 'of': 9227,
 'escapades': 4577,
 'demonstrating': 3490,
 'the': 13505,
 'adage': 288,
 'that': 13503,
 'what': 14871,
 'is': 7217,
 'good': 5821,
 'for': 5323,
 'goose': 5837,
 'also': 529,
 'gander': 5595,
 'some': 12424,
 'which': 14888,
 'occasionally': 9204,
 'amuses': 602,
 'but': 1879,
 'none': 9085,
 'amounts': 593,
 'to': 13681,
 'much': 8807,
 'story': 12857,
 'this': 13556,
 'quiet': 10585,
 'introspective': 7143,
 'and': 625,
 'entertaining': 4500,
 'independent': 6840,
 'worth': 15096,
 'seeking': 11750,
 'even': 4642,
 'fans': 4942,
 'ismail': 7222,
 'merchant': 8449,
 'work': 15068,
 'suspect': 13190,
 'would': 15100,
 'have': 6210,
 'hard': 6156,
 'time': 13644,
 'sitting': 12159,
 'through': 13598,
 'one': 9272,
 'positively': 10123,
 'thrilling': 13591,
 'combination': 2573,
 'ethnography': 4621,
 'all': 506,
 'intrigue': 7133,
 'betrayal': 1348,
 'deceit': 3353,
 'murder': 8831,
 'shakespearean': 11909,
 'tragedy': 13799,
 'or': 9326,
 'juicy': 7400

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features over word bigrams and trigrams only (ngram_range=(2,3));
# max_features limits the learned vocabulary to at most 50000 n-grams.
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(2,3), max_features=50000)
train_features = tfidf.fit_transform(x_train)

In [6]:
# Display the n-gram -> feature-column index mapping learned by the TF-IDF
# vectorizer (the whole dictionary is shown as the cell output).
tfidf.vocabulary_

{'series of': 35248,
 'that what': 39413,
 'what is': 47661,
 'is good': 20716,
 'good for': 16256,
 'for the': 14807,
 'the goose': 40574,
 'is also': 20547,
 'some of': 36510,
 'of which': 29324,
 'but none': 7228,
 'none of': 27208,
 'which amounts': 47853,
 'amounts to': 1524,
 'to much': 44455,
 'much of': 26443,
 'of story': 28905,
 'is good for': 20717,
 'good for the': 16257,
 'for the goose': 14824,
 'but none of': 7230,
 'none of which': 27213,
 'of which amounts': 29325,
 'which amounts to': 47854,
 'amounts to much': 1526,
 'to much of': 44456,
 'much of story': 26451,
 'and entertaining': 2396,
 'is worth': 21063,
 'fans of': 13325,
 'would have': 49353,
 'have hard': 17263,
 'hard time': 16956,
 'sitting through': 35950,
 'through this': 43531,
 'this one': 43180,
 'have hard time': 17264,
 'sitting through this': 35952,
 'through this one': 43532,
 'combination of': 8924,
 'and all': 2007,
 'all the': 1178,
 'the intrigue': 40786,
 'betrayal deceit': 6253,
 'deceit and':

In [7]:
from scipy.sparse import hstack
# Concatenate the two sparse matrices column-wise: each phrase is described by
# its single-word counts followed by its bigram/trigram TF-IDF weights.
train_ft = hstack([train_counts, train_features])
# Sanity check: (number of phrases, count columns + TF-IDF columns).
print(train_ft.shape)

(156060, 65240)


**模型训练**

In [8]:
from sklearn.linear_model import SGDClassifier

# SGD shuffles the training samples between epochs, so without a fixed seed
# every run fits a slightly different model. Pin the seed so that repeated
# "Restart & Run All" executions produce the same predictions.
RANDOM_STATE = 42

# Linear classifier trained with stochastic gradient descent; every other
# hyper-parameter is left at the library default.
clf = SGDClassifier(random_state=RANDOM_STATE)

In [9]:
# Train the classifier on the stacked sparse features against the sentiment
# labels; fit() returns the estimator, which is what the cell displays.
clf.fit(train_ft, y_train)

SGDClassifier()

**模型预测**

In [10]:
# Read the unlabeled test split and keep the phrase text as model input.
test_tsv = pd.read_csv('./data/test.tsv', delimiter='\t')
x_test = test_tsv['Phrase']

# Re-use the vectorizers fitted on the training data (transform only, no
# refit), in the same order as for training so the columns line up.
test_counts, test_features = (
    vectorizer.transform(x_test) for vectorizer in (counter, tfidf)
)
test_ft = hstack((test_counts, test_features))

# Predicted sentiment label for every test phrase.
y_test = clf.predict(test_ft)

In [11]:
# Store the predicted labels as a new 'Sentiment' column on the test frame
# (this modifies test_tsv in place).
test_tsv['Sentiment'] = y_test

In [12]:
# Export only the phrase id and its predicted label to a CSV file in the
# working directory, omitting the DataFrame index column.
test_tsv[['PhraseId', 'Sentiment']].to_csv('sklearn_result.csv', index=False)

**测试结果**

BoW: 0.58545

Tfidf: 0.55975

BoW + Tfidf (2-gram and 3-gram): 0.59070

BoW + Tfidf (2-gram and 3-gram): 0.57690 (SGDClassifier trained with log loss instead of the default hinge loss)