In [1]:
# Load the autoreload extension and set it to mode 2 so that edited modules
# are re-imported before each cell runs.
%reload_ext autoreload
%autoreload 2
# Show matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the nb_black extension (presumably auto-formats cells with black -- confirm).
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [13]:
from pathlib import Path
import warnings

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import linear_model, metrics, model_selection
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfVectorizer,
    HashingVectorizer,
)

from nltk import corpus, tokenize, stem

<IPython.core.display.Javascript object>

In [3]:
warnings.filterwarnings("ignore")

<IPython.core.display.Javascript object>

# TF-IDF + Logistic Regression

In [22]:
# Central configuration for the notebook: input files, output directories and
# cross-validation settings, all looked up by key in later cells.
args = {
    # Competition inputs.
    "data_dir": Path("../input"),
    "train_csv": Path("../input") / "train.csv",
    "test_csv": Path("../input") / "test_x.csv",
    "sample_csv": Path("../input") / "sample_submission.csv",
    "target_col": "author",
    # Output directories for features, validation/test predictions and submissions.
    "feature_dir": Path("../build") / "feature",
    "val_dir": Path("../build") / "val",
    "test_dir": Path("../build") / "test",
    "subm_dir": Path("../build") / "subm",
    # Cross-validation and modelling settings.
    "n_folds": 5,
    "n_classes": 5,
    "seed": 42,
}

<IPython.core.display.Javascript object>

In [33]:
# Name the experiment after its algorithm and feature set; the combined
# model name is reused for every prediction and submission file below.
algo_name, feature_name = "lr", "tfidf"
model_name = f"{algo_name}_{feature_name}"

# One CSV per artefact, each inside its configured output directory.
feature_file = args["feature_dir"].joinpath(f"{feature_name}.csv")
p_val_file = args["val_dir"].joinpath(f"{model_name}.csv")
p_test_file = args["test_dir"].joinpath(f"{model_name}.csv")
subm_file = args["subm_dir"].joinpath(f"{model_name}.csv")

<IPython.core.display.Javascript object>

## Data

In [5]:
# Read the training CSV, using its first column as the row index, then show
# the frame's shape and first rows as a sanity check.
trn = pd.read_csv(args["train_csv"], index_col=0)
print(trn.shape)
trn.head()

(54879, 2)


Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"He was almost choking. There was so much, so m...",3
1,"“Your sister asked for it, I suppose?”",2
2,"She was engaged one day as she walked, in per...",1
3,"The captain was in the porch, keeping himself ...",4
4,"“Have mercy, gentlemen!” odin flung up his han...",3


<IPython.core.display.Javascript object>

In [6]:
# Read the test CSV the same way as the training CSV (first column as index)
# and show its shape and first rows.
test = pd.read_csv(args["test_csv"], index_col=0)
print(test.shape)
test.head()

(19617, 1)


Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
0,“Not at all. I think she is one of the most ch...
1,"""No,"" replied he, with sudden consciousness, ""..."
2,As the lady had stated her intention of scream...
3,“And then suddenly in the silence I heard a so...
4,His conviction remained unchanged. So far as I...


<IPython.core.display.Javascript object>

In [34]:
# Create the output directories up front so the notebook runs end to end on a
# fresh checkout; later cells write prediction and submission CSVs into them.
# parents=True also creates missing parent directories, and exist_ok=True
# makes re-running this cell a no-op.
for dir_key in ("feature_dir", "val_dir", "test_dir", "subm_dir"):
    args[dir_key].mkdir(parents=True, exist_ok=True)

<IPython.core.display.Javascript object>

## NLTK

In [8]:
# Take the training sentence stored under index 4 as the running example
# for the NLTK demonstrations that follow.
s = trn["text"][4]
s

'“Have mercy, gentlemen!” odin flung up his hands. “Don’t write that, anyway; have some shame. Here I’ve torn my heart asunder before you, and you seize the opportunity and are fingering the wounds in both halves.... Oh, my God!”'

<IPython.core.display.Javascript object>

In [10]:
# Split the example sentence into tokens with NLTK's word tokenizer and
# print them; punctuation marks come out as separate tokens.
tokens = tokenize.word_tokenize(s)
print(tokens)

['“', 'Have', 'mercy', ',', 'gentlemen', '!', '”', 'odin', 'flung', 'up', 'his', 'hands', '.', '“', 'Don', '’', 't', 'write', 'that', ',', 'anyway', ';', 'have', 'some', 'shame', '.', 'Here', 'I', '’', 've', 'torn', 'my', 'heart', 'asunder', 'before', 'you', ',', 'and', 'you', 'seize', 'the', 'opportunity', 'and', 'are', 'fingering', 'the', 'wounds', 'in', 'both', 'halves', '...', '.', 'Oh', ',', 'my', 'God', '!', '”']


<IPython.core.display.Javascript object>

In [11]:
# Lemmatise every token with WordNet (default arguments) and print the result
# to compare it with the raw tokens.
lemmatizer = stem.WordNetLemmatizer()
print(list(map(lemmatizer.lemmatize, tokens)))

['“', 'Have', 'mercy', ',', 'gentleman', '!', '”', 'odin', 'flung', 'up', 'his', 'hand', '.', '“', 'Don', '’', 't', 'write', 'that', ',', 'anyway', ';', 'have', 'some', 'shame', '.', 'Here', 'I', '’', 've', 'torn', 'my', 'heart', 'asunder', 'before', 'you', ',', 'and', 'you', 'seize', 'the', 'opportunity', 'and', 'are', 'fingering', 'the', 'wound', 'in', 'both', 'half', '...', '.', 'Oh', ',', 'my', 'God', '!', '”']


<IPython.core.display.Javascript object>

In [12]:
# Stem every token with the English Snowball stemmer and print the result
# to compare it with the lemmatised tokens.
stemmer = stem.snowball.SnowballStemmer("english")
print(list(map(stemmer.stem, tokens)))

['“', 'have', 'merci', ',', 'gentlemen', '!', '”', 'odin', 'flung', 'up', 'his', 'hand', '.', '“', 'don', '’', 't', 'write', 'that', ',', 'anyway', ';', 'have', 'some', 'shame', '.', 'here', 'i', '’', 've', 'torn', 'my', 'heart', 'asund', 'befor', 'you', ',', 'and', 'you', 'seiz', 'the', 'opportun', 'and', 'are', 'finger', 'the', 'wound', 'in', 'both', 'halv', '...', '.', 'oh', ',', 'my', 'god', '!', '”']


<IPython.core.display.Javascript object>

## Bag-of-Words features

In [14]:
# Bag-of-words counts over unigrams and bigrams, tokenised with NLTK,
# with NLTK's English stop-word list and a document-frequency floor of 100.
bow_options = {
    "tokenizer": tokenize.word_tokenize,
    "stop_words": corpus.stopwords.words("english"),
    "ngram_range": (1, 2),
    "min_df": 100,
}
veczr = CountVectorizer(**bow_options)
x_cnt = veczr.fit_transform(trn["text"])
print(x_cnt.shape)

(54879, 2683)


<IPython.core.display.Javascript object>

In [16]:
# Show the first 50 feature columns of the first document's count vector
# as a dense matrix.
x_cnt[0, :50].todense()

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])

<IPython.core.display.Javascript object>

In [20]:
# TF-IDF features over 1- to 3-grams with a document-frequency floor of 50.
# The vocabulary is fitted on the training text only and then applied to the
# test text, so both matrices share the same columns.
tfidf_options = {
    "tokenizer": tokenize.word_tokenize,
    "stop_words": corpus.stopwords.words("english"),
    "ngram_range": (1, 3),
    "min_df": 50,
}
veczr = TfidfVectorizer(**tfidf_options)
x = veczr.fit_transform(trn["text"])
x_test = veczr.transform(test["text"])
(x.shape, x_test.shape)

((54879, 5899), (19617, 5899))

<IPython.core.display.Javascript object>

In [21]:
# Show the first 50 feature columns of the first document's TF-IDF vector
# as a dense matrix.
x[0, :50].todense()

matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]])

<IPython.core.display.Javascript object>

## Logistic Regression

In [23]:
# Stratified K-fold splitter; the fold count and shuffle seed come from the
# shared configuration so the splits are reproducible.
cv_options = {
    "n_splits": args["n_folds"],
    "shuffle": True,
    "random_state": args["seed"],
}
cv = model_selection.StratifiedKFold(**cv_options)

<IPython.core.display.Javascript object>

In [24]:
# Target labels as a NumPy array. The column name is taken from the shared
# configuration rather than hard-coded, so it stays in sync with
# args["target_col"].
y = trn[args["target_col"]].values
y.shape

(54879,)

<IPython.core.display.Javascript object>

In [30]:
# Cross-validated logistic regression.
#   p      -- out-of-fold class probabilities for the training rows
#   p_test -- class probabilities for the test rows, averaged over the folds
n_splits = cv.get_n_splits()
p = np.zeros((x.shape[0], args["n_classes"]))
p_test = np.zeros((x_test.shape[0], args["n_classes"]))
for fold, (trn_, val_) in enumerate(cv.split(x, y)):
    # Fit a fresh model on this fold's training rows.
    clf = linear_model.LogisticRegression()
    x_trn, y_trn = x[trn_], y[trn_]
    x_val = x[val_]
    clf.fit(x_trn, y_trn)
    # Fill in the held-out rows with predictions from the model that did not
    # see them during fitting.
    p[val_] = clf.predict_proba(x_val)
    # Average the test predictions over the folds. The divisor must be the
    # number of folds (one model per fold), not the number of classes; the two
    # only coincide in the current configuration.
    p_test += clf.predict_proba(x_test) / n_splits

<IPython.core.display.Javascript object>

In [32]:
# Score the out-of-fold predictions: accuracy (in percent) from the most
# probable class per row, and multi-class log loss against one-hot labels.
y_pred = p.argmax(axis=1)
y_onehot = pd.get_dummies(y)
accuracy = 100 * metrics.accuracy_score(y, y_pred)
log_loss = metrics.log_loss(y_onehot, p)
print(
    f"Accuracy (CV): {accuracy:8.4f}",
    f"Log Loss (CV): {log_loss:8.4f}",
    sep="\n",
)

Accuracy (CV):  76.6687
Log Loss (CV):   0.6771


<IPython.core.display.Javascript object>

In [35]:
# Persist the out-of-fold and test probabilities as comma-separated text with
# six decimal places.
for out_file, probs in ((p_val_file, p), (p_test_file, p_test)):
    np.savetxt(out_file, probs, fmt="%.6f", delimiter=",")

<IPython.core.display.Javascript object>

## Submission

In [36]:
# Read the sample submission (first column as index) to use as a template
# with the expected index and class columns, then show its shape and first rows.
subm = pd.read_csv(args["sample_csv"], index_col=0)
print(subm.shape)
subm.head()

(19617, 5)


Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0


<IPython.core.display.Javascript object>

In [37]:
# Overwrite every placeholder column of the template with the averaged test
# probabilities. Assumes p_test rows follow the sample file's row order and
# its columns follow the class order of the template -- TODO confirm.
subm[subm.columns] = p_test
subm.head()

Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.059862,0.534297,0.318841,0.062232,0.024768
1,0.081916,0.81877,0.003099,0.027128,0.069086
2,0.719958,0.031354,0.117336,0.037272,0.094081
3,0.036079,0.003477,0.853268,0.00545,0.101727
4,0.302947,0.241325,0.146827,0.189554,0.119348


<IPython.core.display.Javascript object>

In [39]:
# Write the filled-in submission (index included) to the model-specific CSV.
subm.to_csv(subm_file)

<IPython.core.display.Javascript object>