# Packages

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json, re
from pprint import pprint
from nltk.corpus import stopwords
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
%matplotlib inline

# Data

In [4]:
# Read the train/test tables; `.iloc[:, :-1]` keeps every column except the last one.
train = pd.read_csv('train.csv').iloc[:, :-1]
test = pd.read_csv('test.csv').iloc[:, :-1]

# Sample submission file, loaded unchanged.
submission = pd.read_csv('data_info_val_sample_submission.csv')

# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding (which differs across OSes).
with open('categories.json', encoding='utf-8') as f:
    categories = json.load(f)

# Modelling

In [5]:
def count_classifier(x_train, y_train, x_test, y_test, clf):
    """Fit ``clf`` on TF-IDF features of the training text and score it on the test text.

    The token counts (``CountVectorizer``) and the TF-IDF weighting
    (``TfidfTransformer``) are both fitted on ``x_train`` only; ``x_test`` is
    just transformed with the already-fitted objects.

    Parameters
    ----------
    x_train, x_test : raw text documents for the training / evaluation split.
    y_train, y_test : labels matching ``x_train`` / ``x_test``.
    clf : estimator exposing ``fit`` and ``score``; it is fitted in place.

    Returns
    -------
    The value of ``clf.score`` on the transformed test split.
    """
    bag_of_words = CountVectorizer()
    tfidf = TfidfTransformer()

    # Training split: fit both feature steps while transforming.
    train_features = tfidf.fit_transform(bag_of_words.fit_transform(x_train))

    # Evaluation split: only apply what was fitted above.
    test_features = tfidf.transform(bag_of_words.transform(x_test))

    clf.fit(train_features, y_train)
    return clf.score(test_features, y_test)

In [6]:
# Pull the text column and the label column out of the frame as plain arrays
# so they can be indexed with the positional indices produced by the splitter.
x = train['title'].values
y = train['Category'].values

# Splitter shared by every cross-validation loop below (10 folds).
skf = StratifiedKFold(n_splits=10)

In [7]:
# Sweep the MultinomialNB `alpha` over 0.01 .. 0.05 and report the mean
# cross-validated score for each value.
for theta in (step / 100 for step in range(1, 6)):
    clf = MultinomialNB(alpha=theta)
    scores = []
    for train_indices, test_indices in skf.split(x, y):
        # Slice this fold's training and held-out portions.
        x_train, x_test = x[train_indices], x[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]
        score = count_classifier(x_train, y_train, x_test, y_test, clf)
        scores.append(score)
    # Average over all folds for this setting.
    avg = np.mean(scores)
    print('Theta: %s - Score : %s' % (theta, avg))

Theta: 0.01 - Score : 0.6733285400985177
Theta: 0.02 - Score : 0.6732820086621454
Theta: 0.03 - Score : 0.6728905012051252
Theta: 0.04 - Score : 0.6723024803774886
Theta: 0.05 - Score : 0.6714264163906931


In [8]:
# Sweep the SGDClassifier `alpha` (hinge loss, L2 penalty, fixed seed) over
# 0.01 .. 0.05 and report the mean cross-validated score for each value.
for theta in (step / 100 for step in range(1, 6)):
    clf = SGDClassifier(loss='hinge', penalty='l2', alpha=theta, random_state=2019)
    scores = []
    for train_indices, test_indices in skf.split(x, y):
        # Slice this fold's training and held-out portions.
        x_train, x_test = x[train_indices], x[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]
        score = count_classifier(x_train, y_train, x_test, y_test, clf)
        scores.append(score)
    # Average over all folds for this setting.
    avg = np.mean(scores)
    print('Theta: %s - Score : %s' % (theta, avg))



Theta: 0.01 - Score : 0.6483681577638147




Theta: 0.02 - Score : 0.6482721713382025




Theta: 0.03 - Score : 0.6481791453079404




Theta: 0.04 - Score : 0.6481281514053875




Theta: 0.05 - Score : 0.6479616400777128
