In [13]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import f1_score

In [2]:
train_df = pd.read_csv('train_set.csv', sep='\t', nrows=15000)

In [3]:
train_df

Unnamed: 0,label,text
0,2,2967 6758 339 2021 1854 3731 4109 3792 4149 15...
1,11,4464 486 6352 5619 2465 4802 1452 3137 5778 54...
2,3,7346 4068 5074 3747 5681 6093 1777 2226 7354 6...
3,2,7159 948 4866 2109 5520 2490 211 3956 5520 549...
4,3,3646 3055 3055 2490 4659 6065 3370 5814 2465 5...
...,...,...
14995,5,1822 6040 5744 5310 4578 4407 6242 2313 3466 2...
14996,9,88 7400 7539 4516 6122 290 6831 465 1647 6293 ...
14997,0,2597 7160 2282 1407 4403 4516 2873 4597 7037 5...
14998,0,2400 4411 4721 3289 5787 5096 4464 6250 1324 6...


In [4]:
vectorizer = CountVectorizer(max_features=3000)
train_test = vectorizer.fit_transform(train_df['text'])

In [5]:
clf = RidgeClassifier()
clf.fit(train_test[:10000], train_df['label'].values[:10000])

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)

In [6]:
val_pred = clf.predict(train_test[10000:])
print(f1_score(train_df['label'].values[10000:], val_pred, average='macro'))

0.65441877581244


In [7]:
tfidf = TfidfVectorizer(ngram_range=(1,3), max_features=3000)
train_test = tfidf.fit_transform(train_df['text'])

In [8]:
clf = RidgeClassifier()
clf.fit(train_test[:10000], train_df['label'].values[:10000])

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)

In [9]:
val_pred = clf.predict(train_test[10000:])
print(f1_score(train_df['label'].values[10000:], val_pred, average='macro'))

0.8719098297954606


### 1. 尝试改变TF-IDF的参数，并验证精度

In [10]:
tfidf = TfidfVectorizer(ngram_range=(1,2), stop_words=["3750", "900", "648"], max_df=0.5, min_df=2, max_features=3000)
train_test = tfidf.fit_transform(train_df['text'])

In [11]:
clf = RidgeClassifier()
clf.fit(train_test[:10000], train_df['label'].values[:10000])

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)

In [12]:
val_pred = clf.predict(train_test[10000:])
print(f1_score(train_df['label'].values[10000:], val_pred, average='macro'))

0.8711316513064276


### 2. 尝试使用其他机器学习模型，完成训练和验证

In [14]:
hashing = HashingVectorizer(stop_words=["3750", "900", "648"])

In [15]:
train_test = hashing.fit_transform(train_df['text'])

In [16]:
clf = RidgeClassifier()
clf.fit(train_test[:10000], train_df['label'].values[:10000])

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)

In [17]:
val_pred = clf.predict(train_test[10000:])
print(f1_score(train_df['label'].values[10000:], val_pred, average='macro'))

0.8592303257299841
