In [75]:
import pandas as pd
# Read train.csv and keep the 'text' and 'author' columns as separate Series;
# the later cells vectorize text_train and turn author_train into labels.
df = pd.read_csv('train.csv', encoding='utf-8')
text_train, author_train = df['text'], df['author']

In [76]:
# Integer class label for each author code, in the same EAP, HPL, MWS order
# that the submission header uses for its probability columns.
author_to_label = {'EAP': 0, 'HPL': 1, 'MWS': 2}
# The mapping is built once instead of once per row; an author code that is
# not in the mapping still raises KeyError.
y_train = [author_to_label[y] for y in author_train]

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit the vectorizer on the full training text and encode that same text
# as a document-by-term count matrix.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(text_train)
# Displayed as the cell output: (number of documents, number of features).
X_train_counts.shape

(19579, 25068)

In [14]:
from sklearn.feature_extraction.text import TfidfTransformer
# Fit the tf-idf transformer on the training counts and re-weight them.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# The shape is expected to match X_train_counts, as the recorded output shows.
X_train_tfidf.shape

(19579, 25068)

In [37]:
def normalize(probs):
    """Rescale numeric values so that they sum to 1.

    Parameters
    ----------
    probs : iterable of numbers
        Values to rescale. Any iterable is accepted, including one-shot
        iterators, because the input is materialised before use.

    Returns
    -------
    list of float
        Each input value divided by the total of all inputs. An empty
        input yields an empty list.

    Raises
    ------
    ZeroDivisionError
        If the input is non-empty and its values sum to zero.
    """
    # Materialise first so the total and the division see the same items.
    values = list(probs)
    # Compute the total once instead of once per element.
    total = float(sum(values))
    return [float(p) / total for p in values]

In [42]:
from sklearn.naive_bayes import MultinomialNB
# Hold out the last n_holdout rows for validation and fit Naive Bayes on
# everything before them.
n_holdout = 1958
clf_X_train, clf_y_train = X_train_tfidf[:-n_holdout], y_train[:-n_holdout]
clf_X_test, clf_y_test = X_train_tfidf[-n_holdout:], y_train[-n_holdout:]
clf = MultinomialNB().fit(clf_X_train, clf_y_train)
# Class probabilities for the held-out rows, then the held-out score.
probs = clf.predict_proba(clf_X_test)
acc = clf.score(clf_X_test, clf_y_test)
print(acc)

0.821756894791


In [51]:
def create_submission(file_name, ids, y):
    """Write a submission CSV with one row of class probabilities per id.

    The file starts with the header ``id,EAP,HPL,MWS`` followed by one line
    per (id, probabilities) pair.

    Parameters
    ----------
    file_name : str
        Path of the file to create; an existing file is overwritten.
    ids : iterable
        Row identifiers, written in the first column.
    y : iterable of indexable
        For each id, a sequence whose items 0, 1 and 2 are written to the
        EAP, HPL and MWS columns respectively.

    Notes
    -----
    ``ids`` and ``y`` are paired with ``zip``, so if their lengths differ
    the extra items of the longer one are silently dropped.
    """
    lines = ["id,EAP,HPL,MWS\n"]
    for row_id, row_probs in zip(ids, y):
        lines.append("{},{},{},{}\n".format(row_id, row_probs[0], row_probs[1], row_probs[2]))
    # All rows are formatted before the file is opened, so a formatting error
    # leaves no partial file behind; `with` closes the handle even if the
    # write itself fails.
    with open(file_name, 'w') as submission_file:
        submission_file.write("".join(lines))

In [70]:
# Load test.csv and encode its text with the vectorizer and tf-idf
# transformer that were already fitted on the training text.
submission_df = pd.read_csv('test.csv', encoding='utf-8')
submission_text_train = submission_df['text']
submission_ids = list(submission_df['id'])
# transform(), not fit_transform(): refitting here would rebuild the
# vocabulary and IDF weights from the test text, so the feature columns would
# no longer line up with the ones `clf` was trained on.
submission_X_train_counts = count_vect.transform(submission_text_train)
submission_X_train_tfidf = tfidf_transformer.transform(submission_X_train_counts)
# Keep the names this cell used to assign so any existing reference still resolves.
submission_y_train_counts = submission_X_train_counts
submission_y_train_tfidf = submission_X_train_tfidf

In [71]:
# Per-class probabilities from the Naive Bayes model for the submission features.
# NOTE(review): `submission_X_train_tfidf` must already exist in the kernel — confirm which cell assigns it.
y_submission = clf.predict_proba(submission_X_train_tfidf)

In [72]:
# Write the Naive Bayes probabilities; the file name follows the same
# 'submissionN.csv' pattern as the other submission files in this notebook.
create_submission('submission7.csv', submission_ids, y_submission)

In [59]:
# The row count should equal len(submission_ids), because create_submission zips ids with rows.
# NOTE(review): the recorded output has as many rows as the training matrix — confirm these predictions came from test.csv.
y_submission.shape

(19579, 3)

In [73]:
## Linear classifier trained with SGD (SVM-style, modified Huber loss)

In [74]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

In [101]:
# Linear model trained with stochastic gradient descent.
# NOTE(review): `n_iter` was renamed to `max_iter` in newer scikit-learn releases — confirm against the installed version.
sgd_clf = SGDClassifier(
    loss='modified_huber',
    penalty='l2',
    alpha=1e-3,
    n_iter=5,
    random_state=42,
)
# Raw text -> token counts -> tf-idf weights -> classifier, fitted as one unit.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', sgd_clf),
])
# Fit on everything except the last 1958 rows, then score on those held-out rows.
_ = text_clf_svm.fit(df['text'][:-1958], y_train[:-1958])
val = text_clf_svm.score(df['text'][-1958:], y_train[-1958:])
# Class probabilities for the submission text.
predicts = text_clf_svm.predict_proba(submission_df['text'])



In [102]:
# Held-out score and the first row of submission probabilities, separated by a space.
# Uses a print() call with one formatted string so the cell parses on Python 3
# and prints the same text on Python 2.
print("{} {}".format(val, predicts[0]))

0.805924412666 [ 0.32958867  0.09955858  0.57085275]


In [108]:
# Write the SGD pipeline's probabilities, pairing each row with its test id.
create_submission('submission8.csv', list(submission_df['id']), predicts)

In [109]:
## Let's try to tune the parameters

In [116]:
# Create the pipeline for the classifier: token counts -> tf-idf -> Naive Bayes.
# Unlike the step-by-step cells, the vectorizer and tf-idf weights here are
# fitted on the training slice only, not on the held-out rows.
text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB())
                    ])
text_clf = text_clf.fit(df['text'][:-1958], y_train[:-1958])
# Score on the last 1958 rows, which were excluded from the fit above.
val = text_clf.score(df['text'][-1958:], y_train[-1958:])
# print() call form so the cell parses on Python 3 as well as Python 2.
print(val)

0.82226762002


In [143]:
from sklearn.model_selection import GridSearchCV
# Candidate settings, keyed as '<pipeline step>__<parameter>' for the steps
# named 'vect', 'tfidf' and 'clf' in text_clf.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    clf__alpha=(1e-2, 1e-3),
    clf__fit_prior=(True, False),
)
gs_clf = GridSearchCV(text_clf, parameters)
# Search on the same training slice as the other models; the last 1958 rows
# are left out of the search.
gs_clf_result = gs_clf.fit(df['text'][:-1958], y_train[:-1958])

In [144]:
# Best score the grid search recorded across the parameter combinations it tried.
gs_clf_result.best_score_

0.846603484478747

In [145]:
# Parameter combination the grid search selected as best.
gs_clf_result.best_params_

{'clf__alpha': 0.01,
 'clf__fit_prior': True,
 'tfidf__use_idf': False,
 'vect__ngram_range': (1, 2)}

In [146]:
# Class probabilities for the submission text from the fitted grid search.
gs_predict = gs_clf.predict_proba(submission_df.text)

In [147]:
# Show only the first 12 rows rather than the whole array.
gs_predict[:12]

array([[ 0.04150345,  0.00705363,  0.95144292],
       [ 0.93971874,  0.04910328,  0.01117798],
       [ 0.04291856,  0.93743371,  0.01964773],
       [ 0.37966663,  0.60785676,  0.01247661],
       [ 0.68312338,  0.26140214,  0.05547448],
       [ 0.9671043 ,  0.0318109 ,  0.0010848 ],
       [ 0.93736656,  0.03465893,  0.02797451],
       [ 0.02498425,  0.02326506,  0.9517507 ],
       [ 0.96792296,  0.02992997,  0.00214707],
       [ 0.91727698,  0.03409329,  0.04862973],
       [ 0.0503785 ,  0.01455906,  0.93506244],
       [ 0.01120738,  0.97828749,  0.01050513]])

In [140]:
# Write the grid-search probabilities, pairing each row with its test id.
create_submission('submission10.csv', list(submission_df['id']), gs_predict)

In [137]:
import numpy as np
def tune(probs, threshold=0.92):
    """Snap a confident probability row to a one-hot vector.

    Parameters
    ----------
    probs : sequence of float
        Class probabilities for one sample; any number of classes.
    threshold : float, optional
        A class whose probability is strictly greater than this value is
        treated as certain. Defaults to 0.92.

    Returns
    -------
    numpy.ndarray or the original ``probs``
        If some entry exceeds ``threshold``, a new float array of the same
        length with 1.0 at the first such position and 0.0 elsewhere.
        Otherwise ``probs`` itself is returned unchanged.
    """
    # enumerate() replaces the Python 2-only xrange index loop, so the function
    # runs on both Python 2 and Python 3.
    for index, prob in enumerate(probs):
        if prob > threshold:
            # Size the one-hot vector from the input instead of assuming 3 classes.
            one_hot = np.zeros(len(probs))
            one_hot[index] = 1.0
            return one_hot
    return probs

In [139]:
# Apply tune() to every row of the grid-search probabilities. Assigning
# tuned_predict here keeps the cell runnable on a fresh kernel instead of
# relying on a value left over from a cell that is no longer in the notebook.
tuned_predict = [tune(p) for p in gs_predict]
tuned_predict[:12]

[array([ 0.,  0.,  1.]),
 array([ 1.,  0.,  0.]),
 array([ 0.,  1.,  0.]),
 array([ 0.37966663,  0.60785676,  0.01247661]),
 array([ 0.68312338,  0.26140214,  0.05547448]),
 array([ 1.,  0.,  0.]),
 array([ 1.,  0.,  0.]),
 array([ 0.,  0.,  1.]),
 array([ 1.,  0.,  0.]),
 array([ 0.91727698,  0.03409329,  0.04862973]),
 array([ 0.,  0.,  1.]),
 array([ 0.,  1.,  0.])]