In [1]:
import time
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier


## One-Hot + RidgeClassifier

In [None]:
def onehot_ridgeclassifier(nrows, train_num, max_features):
    start_time = time.time()

    train_df = pd.read_csv('../input/train_set.csv', sep='\t', nrows=nrows)

    # shuffle
    train_df = shuffle(train_df, random_state=666)

    vectorizer = CountVectorizer(max_features=max_features)
    train_text = vectorizer.fit_transform(train_df['text'])

    clf = RidgeClassifier(random_state=666)
    clf.fit(train_text[:train_num], train_df['label'].values[:train_num])

    train_pred = clf.predict(train_text[:train_num])
    val_pred = clf.predict(train_text[train_num:])
    print('One-Hot+RidgeClassifier Train f1_score: {}'.format(f1_score(train_df['label'].values[:train_num], train_pred, average='macro')))
    print('One-Hot+RidgeClassifier Val f1_score: {}'.format(f1_score(train_df['label'].values[train_num:], val_pred, average='macro')))
    train_time = time.time()
    print('Train time: {:.2f}s'.format(train_time - start_time))


    # 预测并保存
    test_df = pd.read_csv('../input/test_a.csv')
    test_text = vectorizer.transform(test_df['text'])

    test_pred = clf.predict(test_text)
    test_pred = pd.DataFrame(test_pred, columns=['label'])
    test_pred.to_csv('../input/test_bagofwords_ridgeclassifier.csv', index=False)
    print('Test predict saved.')
    end_time = time.time()
    print('Predict time:{:.2f}s'.format(end_time - train_time))
    
    
if __name__ == '__main__':
    # nrows = 200000
    # train_num = int(nrows * 0.7)
    # max_features = 3000

    """
    One-Hot+RidgeClassifier Train f1_score: 0.8325002267944408
    One-Hot+RidgeClassifier Val f1_score: 0.8175875672165276
    Train time: 685.49s
    Test predict saved.
    Predict time:32.44s
    """
    
    nrows = 200000
    train_num = int(nrows * 0.7)
    max_features = 4000
    
    """
    One-Hot+RidgeClassifier Train f1_score: 0.8377852607681573
    One-Hot+RidgeClassifier Val f1_score: 0.8178684044527644
    Train time: 1058.56s
    Test predict saved.
    Predict time:31.95s
    """
    
    onehot_ridgeclassifier(nrows, train_num, max_features)


## TF-IDF + RidgeClassifier

In [None]:
def tfidf_ridgeclassifier(nrows, train_num, max_features, ngram_range):
    start_time = time.time()

    train_df = pd.read_csv('../input/train_set.csv', sep='\t', nrows=nrows)

    # shuffle
    train_df = shuffle(train_df, random_state=666)

    tfidf = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
    train_text = tfidf.fit_transform(train_df['text'])

    # TF-IDF
    clf = RidgeClassifier(random_state=666)
    clf.fit(train_text[:train_num], train_df['label'].values[:train_num])

    train_pred = clf.predict(train_text[:train_num])
    val_pred = clf.predict(train_text[train_num:])
    print('Tf-Idf+RidgeClassifier Train f1_score: {}'.format(f1_score(train_df['label'].values[:train_num], train_pred, average='macro')))
    print('Tf-Idf+RidgeClassifier Val f1_score: {}'.format(f1_score(train_df['label'].values[train_num:], val_pred, average='macro')))
    train_time = time.time()
    print('Train time: {:.2f}s'.format(train_time - start_time))


    # 预测并保存
    test_df = pd.read_csv('../input/test_a.csv')
    test_text = tfidf.transform(test_df['text'])

    test_pred = clf.predict(test_text)
    test_pred = pd.DataFrame(test_pred, columns=['label'])
    test_pred.to_csv('../input/test_tfidf_ridgeclassifier.csv', index=False)
    print('Test predict saved.')
    end_time = time.time()
    print('Predict time:{:.2f}s'.format(end_time - train_time))
    
    
if __name__ == '__main__':
    # nrows = 200000
    # train_num = int(nrows * 0.7)
    # max_features = 3000
    # ngram_range = (1, 3)
    
    """
    Tf-Idf+RidgeClassifier Train f1_score: 0.903158570543211
    Tf-Idf+RidgeClassifier Val f1_score: 0.8941037520383751
    Train time: 743.38s
    Test predict saved.
    Predict time:105.46s
    """
    
    # nrows = 200000
    # train_num = int(nrows * 0.7)
    # max_features = 4000
    # ngram_range = (1, 3)
    
    """
    Tf-Idf+RidgeClassifier Train f1_score: 0.9123200043177631
    Tf-Idf+RidgeClassifier Val f1_score: 0.9017549150589862
    Train time: 800.63s
    Test predict saved.
    Predict time:110.57s
    """
    
    nrows = 200000
    train_num = int(nrows * 0.7)
    max_features = 4000
    ngram_range = (1, 2)
    
    """
    Tf-Idf+RidgeClassifier Train f1_score: 0.9138842752839392
    Tf-Idf+RidgeClassifier Val f1_score: 0.9029483134740949
    Train time: 476.29s
    Test predict saved.
    Predict time:68.93s
    """
    
    tfidf_ridgeclassifier(nrows, train_num, max_features, ngram_range)
