In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import itertools

from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, log_loss

%matplotlib inline

# Load the competition archives; the `id` column becomes the index of every frame.
train = pd.read_csv(
    "D:\\FMI2\\ML\\spooky-author-identification\\train.zip",
    index_col=['id'],
)
test = pd.read_csv(
    "D:\\FMI2\\ML\\spooky-author-identification\\test.zip",
    index_col=['id'],
)
sample_submission = pd.read_csv(
    "D:\\FMI2\\ML\\spooky-author-identification\\sample_submission.zip",
    index_col=['id'],
)

# Swap the author codes for readable names; these names are the class labels
# every model below is trained on.
train['author'] = train['author'].replace({'EAP': 'Едгар', 'HPL': 'Хауърд', 'MWS': 'Мери'})

In [2]:
# Baseline: word uni/bi-gram TF-IDF (min_df=2, max_df=0.8, case preserved) + naive Bayes.
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_df=0.8, lowercase=False)
naive_bayes = MultinomialNB(alpha=0.01)
pipeline = Pipeline(steps=[('features', tfidf), ('clf', naive_bayes)])

# 3-fold cross-validation; scores are negative log loss (closer to 0 is better).
fold_scores = cross_val_score(pipeline, train.text, train.author,
                              cv=3, n_jobs=3, scoring='neg_log_loss')
print(fold_scores)

[-0.42530307 -0.418245   -0.42500535]


In [3]:
# Fit the current pipeline on the whole training set and predict class
# probabilities for the test texts.
pipeline = pipeline.fit(train.text, train.author)
test_predictions = pipeline.predict_proba(test.text)

# predict_proba returns one column per entry of the classifier's `classes_`
# (the sorted training labels). Derive the headers from it instead of
# hard-coding an order that is only right because of how the relabelled
# author names happen to sort.
author_codes = {'Едгар': 'EAP', 'Хауърд': 'HPL', 'Мери': 'MWS'}
fitted_classes = pipeline.named_steps['clf'].classes_
# .get(label, label) keeps this working if the labels are still the raw codes.
submit_columns = [author_codes.get(label, label) for label in fitted_classes]
submit_file = pd.DataFrame(test_predictions, columns=submit_columns, index=test.index)

# Write the columns in the id,EAP,HPL,MWS order of the competition's
# submission format.
submit_file = submit_file[['EAP', 'HPL', 'MWS']]
submit_file.to_csv("D:\\FMI2\\ML\\spooky-author-identification\\submit_Tfidf_MNB_text.csv")

In [4]:
# Same as the baseline but without the min_df cut-off.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_df=0.8, lowercase=False)
naive_bayes = MultinomialNB(alpha=0.01)
pipeline = Pipeline(steps=[('features', tfidf), ('clf', naive_bayes)])

fold_scores = cross_val_score(pipeline, train.text, train.author,
                              cv=3, n_jobs=3, scoring='neg_log_loss')
print(fold_scores)

[-0.3988486  -0.39707575 -0.40529345]


In [5]:
# Fit the current pipeline on the whole training set and predict class
# probabilities for the test texts.
pipeline = pipeline.fit(train.text, train.author)
test_predictions = pipeline.predict_proba(test.text)

# predict_proba returns one column per entry of the classifier's `classes_`
# (the sorted training labels). Derive the headers from it instead of
# hard-coding an order that is only right because of how the relabelled
# author names happen to sort.
author_codes = {'Едгар': 'EAP', 'Хауърд': 'HPL', 'Мери': 'MWS'}
fitted_classes = pipeline.named_steps['clf'].classes_
# .get(label, label) keeps this working if the labels are still the raw codes.
submit_columns = [author_codes.get(label, label) for label in fitted_classes]
submit_file = pd.DataFrame(test_predictions, columns=submit_columns, index=test.index)

# Write the columns in the id,EAP,HPL,MWS order of the competition's
# submission format.
submit_file = submit_file[['EAP', 'HPL', 'MWS']]
submit_file.to_csv("D:\\FMI2\\ML\\spooky-author-identification\\submit_Tfidf_MNB_text.csv")

In [6]:
# Replace naive Bayes with an SGD-trained linear model using logistic loss.
sgd = SGDClassifier(loss='log', random_state=0)
pipeline = Pipeline(steps=[
    ('features', TfidfVectorizer(ngram_range=(1, 2), lowercase=False)),
    ('clf', sgd),
])

fold_scores = cross_val_score(pipeline, train.text, train.author,
                              cv=3, n_jobs=3, scoring='neg_log_loss')
print(fold_scores)

[-0.70509999 -0.70667797 -0.69946836]


In [7]:
# Show the classifier step to see which default hyper-parameters it carries.
pipeline.named_steps['clf']

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=0, shuffle=True, verbose=0,
       warm_start=False)

In [8]:
# SGD with the modified Huber loss, explicit alpha and n_iter=100.
sgd = SGDClassifier(loss='modified_huber', random_state=0, alpha=1e-4, n_iter=100)
pipeline = Pipeline(steps=[
    ('features', TfidfVectorizer(ngram_range=(1, 2), lowercase=False)),
    ('clf', sgd),
])

fold_scores = cross_val_score(pipeline, train.text, train.author,
                              cv=3, n_jobs=3, scoring='neg_log_loss')
print(fold_scores)

[-0.63184389 -0.63081907 -0.5722206 ]


In [9]:
# Logistic-loss SGD with a very strong regularisation term (alpha=1).
sgd = SGDClassifier(loss='log', random_state=0, alpha=1, n_iter=100)
pipeline = Pipeline(steps=[
    ('features', TfidfVectorizer(ngram_range=(1, 2), lowercase=False)),
    ('clf', sgd),
])

fold_scores = cross_val_score(pipeline, train.text, train.author,
                              cv=3, n_jobs=3, scoring='neg_log_loss')
print(fold_scores)

[-1.09820115 -1.09797693 -1.09762227]


In [10]:
def test_model(alpha=1e-3, n_iter=10):
    """Cross-validate a TF-IDF + logistic-loss SGD pipeline for one setting.

    Builds a word uni/bi-gram TF-IDF vectorizer (case preserved) followed by an
    ``SGDClassifier(loss='log', random_state=0)`` and scores it with 3-fold
    cross-validation on the module-level ``train`` frame (``train.text`` as
    input, ``train.author`` as target) using ``neg_log_loss``.

    Parameters
    ----------
    alpha : float
        Passed to ``SGDClassifier(alpha=...)``.
    n_iter : int
        Passed to ``SGDClassifier(n_iter=...)``.

    Returns
    -------
    The array of per-fold scores, so callers can collect and compare runs.
    The line ``alpha n_iter scores`` is still printed as before.
    """
    # NOTE(review): `n_iter` is the spelling accepted by the scikit-learn
    # version this notebook was run with; newer releases use `max_iter` -
    # confirm against the installed version before upgrading.
    pipeline = Pipeline([
        ('features', TfidfVectorizer(ngram_range=(1, 2), lowercase=False)),
        ('clf', SGDClassifier(loss='log', random_state=0, alpha=alpha, n_iter=n_iter))])

    scores = cross_val_score(pipeline, train.text, train.author,
                             cv=3, n_jobs=3, scoring='neg_log_loss')
    print(alpha, n_iter, scores)
    return scores

In [11]:
# Sweep the SGD alpha over eight decades with n_iter fixed at 10.
alphas = [1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9]
for reg_strength in alphas:
    test_model(n_iter=10, alpha=reg_strength)

0.01 10 [-1.07359733 -1.07307711 -1.07308271]
0.001 10 [-0.97264801 -0.9729021  -0.97046414]
0.0001 10 [-0.7063961  -0.70872145 -0.70116382]
1e-05 10 [-0.49285044 -0.49891398 -0.48704205]
1e-06 10 [-0.45575614 -0.46793044 -0.45442104]
1e-07 10 [-0.98692794 -1.03459244 -1.02393951]
1e-08 10 [-2.57569938 -2.63831548 -2.63765597]
1e-09 10 [-3.88748023 -3.96421051 -3.76013401]


In [12]:
# Re-check the three most promising alpha values with n_iter=100.
alphas = [1e-5, 1e-6, 1e-7]
for reg_strength in alphas:
    test_model(n_iter=100, alpha=reg_strength)

1e-05 100 [-0.49507354 -0.50049874 -0.48907863]
1e-06 100 [-0.44800006 -0.45960342 -0.4448511 ]
1e-07 100 [-0.50930075 -0.53008824 -0.50907018]


In [13]:
# Hold alpha at 1e-6 and vary n_iter to see whether more iterations help.
n_iters = [30, 100, 300]
for iterations in n_iters:
    test_model(n_iter=iterations, alpha=1e-6)

1e-06 30 [-0.44770492 -0.45950639 -0.44517457]
1e-06 100 [-0.44800006 -0.45960342 -0.4448511 ]
1e-06 300 [-0.44800974 -0.4596065  -0.44489795]


In [14]:
# Full alpha sweep again, this time with n_iter=100.
alphas = [1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9]
for reg_strength in alphas:
    test_model(n_iter=100, alpha=reg_strength)

0.01 100 [-1.07301127 -1.07276739 -1.07259579]
0.001 100 [-0.97276524 -0.97302624 -0.97058951]
0.0001 100 [-0.7081094  -0.71018585 -0.70275404]
1e-05 100 [-0.49507354 -0.50049874 -0.48907863]
1e-06 100 [-0.44800006 -0.45960342 -0.4448511 ]
1e-07 100 [-0.50930075 -0.53008824 -0.50907018]
1e-08 100 [-1.4098478  -1.46572732 -1.45025675]
1e-09 100 [-3.59478265 -3.70860653 -3.53668744]


In [15]:
# Naive Bayes on term frequencies only (IDF re-weighting switched off).
tf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), lowercase=False, use_idf=False)
pipeline = Pipeline(steps=[
    ('features', tf_vectorizer),
    ('clf', MultinomialNB(alpha=0.01)),
])

fold_scores = cross_val_score(pipeline, train.text, train.author,
                              cv=3, n_jobs=3, scoring='neg_log_loss')
print(fold_scores)

[-0.38631528 -0.38497049 -0.38864912]


In [16]:
# TF-IDF with smooth_idf turned off.
tfidf = TfidfVectorizer(ngram_range=(1, 2), lowercase=False, smooth_idf=False)
pipeline = Pipeline(steps=[
    ('features', tfidf),
    ('clf', MultinomialNB(alpha=0.01)),
])

fold_scores = cross_val_score(pipeline, train.text, train.author,
                              cv=3, n_jobs=3, scoring='neg_log_loss')
print(fold_scores)

[-0.40220847 -0.40079718 -0.40916863]


In [17]:
# TF-IDF with sublinear_tf turned on.
tfidf = TfidfVectorizer(ngram_range=(1, 2), lowercase=False, sublinear_tf=True)
pipeline = Pipeline(steps=[
    ('features', tfidf),
    ('clf', MultinomialNB(alpha=0.01)),
])

fold_scores = cross_val_score(pipeline, train.text, train.author,
                              cv=3, n_jobs=3, scoring='neg_log_loss')
print(fold_scores)

[-0.39913227 -0.39735237 -0.40561134]


In [18]:
# Term-frequency features with the analyzer spelled out as 'word'.
tf_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2),
                                lowercase=False, use_idf=False)
pipeline = Pipeline(steps=[
    ('features', tf_vectorizer),
    ('clf', MultinomialNB(alpha=0.01)),
])

fold_scores = cross_val_score(pipeline, train.text, train.author,
                              cv=3, n_jobs=3, scoring='neg_log_loss')
print(fold_scores)

[-0.38631528 -0.38497049 -0.38864912]


In [32]:
# Character n-grams instead of word n-grams (term frequencies only, case preserved).
char_tf = TfidfVectorizer(analyzer='char', ngram_range=(1, 5),
                          lowercase=False, use_idf=False)
pipeline = Pipeline(steps=[('features', char_tf), ('clf', MultinomialNB(alpha=0.01))])

fold_scores = cross_val_score(pipeline, train.text, train.author,
                              cv=3, n_jobs=3, scoring='neg_log_loss')
print(fold_scores)
# Scores recorded while varying the upper n-gram length:
# ngram_range(1, 5) [-0.52106595 -0.51884756 -0.51566132]
# ngram_range(1, 6) [-0.43406913 -0.43222579 -0.42966077]
# ngram_range(1, 7) [-0.39664403 -0.39699232 -0.39457353]
# ngram_range(1, 8) [-0.38826457 -0.39055307 -0.38741299]
# ngram_range(1, 9) [-0.39508701 -0.39896949 -0.39498316]
# ngram_range(1, 10)[-0.41029785 -0.41504183 -0.41069979]
# Changing the lowercase or use_idf parameters makes the result worse.

[-0.52106595 -0.51884756 -0.51566132]


In [34]:
# Back to word uni/bi-grams with term frequencies only.
tf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), lowercase=False, use_idf=False)
pipeline = Pipeline(steps=[
    ('features', tf_vectorizer),
    ('clf', MultinomialNB(alpha=0.01)),
])

fold_scores = cross_val_score(pipeline, train.text, train.author,
                              cv=3, n_jobs=3, scoring='neg_log_loss')
print(fold_scores)

[-0.38952164 -0.38793957 -0.38977422]


In [35]:
# Coarse sweep of the naive Bayes alpha.
for smoothing in (0.001, 0.003, 0.01, 0.03, 0.1):
    pipeline = Pipeline(steps=[
        ('features', TfidfVectorizer(ngram_range=(1, 2), lowercase=False, use_idf=False)),
        ('clf', MultinomialNB(alpha=smoothing)),
    ])
    fold_scores = cross_val_score(pipeline, train.text, train.author,
                                  cv=3, n_jobs=3, scoring='neg_log_loss')
    print(smoothing, fold_scores)

0.001 [-0.40837162 -0.40799027 -0.41792039]
0.003 [-0.38950424 -0.38858977 -0.3955299 ]
0.01 [-0.38631528 -0.38497049 -0.38864912]
0.03 [-0.40907822 -0.40765167 -0.40820523]
0.1 [-0.479973   -0.47911609 -0.47556484]


In [36]:
# Finer sweep of the naive Bayes alpha around 0.01.
for smoothing in (0.005, 0.007, 0.008, 0.009, 0.010, 0.011, 0.012, 0.014, 0.017, 0.020):
    pipeline = Pipeline(steps=[
        ('features', TfidfVectorizer(ngram_range=(1, 2), lowercase=False, use_idf=False)),
        ('clf', MultinomialNB(alpha=smoothing)),
    ])
    fold_scores = cross_val_score(pipeline, train.text, train.author,
                                  cv=3, n_jobs=3, scoring='neg_log_loss')
    print(smoothing, fold_scores)

0.005 [-0.38539748 -0.38427371 -0.38983334]
0.007 [-0.38479593 -0.38355383 -0.38820317]
0.008 [-0.38508742 -0.3838038  -0.38809073]
0.009 [-0.38561592 -0.38429864 -0.388265  ]
0.01 [-0.38631528 -0.38497049 -0.38864912]
0.011 [-0.38714071 -0.38577331 -0.3891907 ]
0.012 [-0.38806077 -0.38667476 -0.3898527 ]
0.014 [-0.39009993 -0.38868616 -0.39143717]
0.017 [-0.39346009 -0.39202187 -0.39422893]
0.02 [-0.3970026  -0.39555409 -0.39729895]


In [38]:
# Finest sweep of the naive Bayes alpha, in steps of 0.0005 from 0.005 to 0.008.
for smoothing in (0.0050, 0.0055, 0.0060, 0.0065, 0.0070, 0.0075, 0.0080):
    pipeline = Pipeline(steps=[
        ('features', TfidfVectorizer(ngram_range=(1, 2), lowercase=False, use_idf=False)),
        ('clf', MultinomialNB(alpha=smoothing)),
    ])
    fold_scores = cross_val_score(pipeline, train.text, train.author,
                                  cv=3, n_jobs=3, scoring='neg_log_loss')
    print(smoothing, fold_scores)

0.005 [-0.38539748 -0.38427371 -0.38983334]
0.0055 [-0.3850429  -0.3838838  -0.38918585]
0.006 [-0.38484356 -0.38365341 -0.38872016]
0.0065 [-0.38476893 -0.3835513  -0.38840141]
0.007 [-0.38479593 -0.38355383 -0.38820317]
0.0075 [-0.38490675 -0.38364277 -0.38810498]
0.008 [-0.38508742 -0.3838038  -0.38809073]


In [39]:
# Final model: word uni/bi-gram term frequencies + naive Bayes with alpha=0.0065.
pipeline = Pipeline([
    ('features', TfidfVectorizer(ngram_range=(1, 2), lowercase=False, use_idf=False)),
    ('clf', MultinomialNB(alpha=0.0065))])
pipeline = pipeline.fit(train.text, train.author)
test_predictions = pipeline.predict_proba(test.text)

# predict_proba returns one column per entry of the classifier's `classes_`
# (the sorted training labels). Derive the headers from it instead of
# hard-coding an order that is only right because of how the relabelled
# author names happen to sort.
author_codes = {'Едгар': 'EAP', 'Хауърд': 'HPL', 'Мери': 'MWS'}
fitted_classes = pipeline.named_steps['clf'].classes_
# .get(label, label) keeps this working if the labels are still the raw codes.
submit_columns = [author_codes.get(label, label) for label in fitted_classes]
submit_file = pd.DataFrame(test_predictions, columns=submit_columns, index=test.index)

# Write the columns in the id,EAP,HPL,MWS order of the competition's
# submission format.
submit_file = submit_file[['EAP', 'HPL', 'MWS']]
submit_file.to_csv("D:\\FMI2\\ML\\spooky-author-identification\\submit_Tfidf_MNB_text.csv")