### Libraries

In [1]:
import fasttext
import numpy as np
from sklearn.metrics import roc_auc_score
import pandas as pd

### Run fasttext training on preprocessed data

In [2]:
cls = fasttext.supervised("./train.txt", "./model")

### Try to predict labels and predict probabilities

In [3]:
texts = ['fuck you', 'we are fucking amazing guys']
labels = cls.predict(texts)
print (labels)

# Or with the probability
labels = cls.predict_proba(texts)
print (labels)

[[u'1'], [u'0']]
[[(u'1', 0.96875)], [(u'0', 0.742188)]]


### Run fasttext on validation preprocessed data tuning parameters

In [4]:
for lr_ in [0.01, 0.1, 1.]:
    for epoch_ in [10, 20, 50, 100]:
        print ('Learning rate = %f, Number of epochs = %d' % (lr_, epoch_))
        cls = fasttext.supervised("./train.txt", "./model", lr=lr_, epoch=epoch_, loss='softmax')

        labels = []
        true_labels = []
        with open('./valid.txt', 'r') as f:
            for line in f:
                true_labels.append(int(line[9]))
                labels.append(cls.predict([line[11:]]))

        labels_ = np.concatenate(labels)
        labels_ = np.concatenate(labels_)
        labels = np.asarray(labels_).astype('int')
        true_labels = np.asarray(true_labels)
        print 'ROC_AUC_SCORE:',roc_auc_score(true_labels, labels)

Learning rate = 0.010000, Number of epochs = 10
ROC_AUC_SCORE: 0.52732471668
Learning rate = 0.010000, Number of epochs = 20
ROC_AUC_SCORE: 0.666561432426
Learning rate = 0.010000, Number of epochs = 50
ROC_AUC_SCORE: 0.748745312461
Learning rate = 0.010000, Number of epochs = 100
ROC_AUC_SCORE: 0.760897467141
Learning rate = 0.100000, Number of epochs = 10
ROC_AUC_SCORE: 0.758917586451
Learning rate = 0.100000, Number of epochs = 20
ROC_AUC_SCORE: 0.770209774304
Learning rate = 0.100000, Number of epochs = 50
ROC_AUC_SCORE: 0.768212908438
Learning rate = 0.100000, Number of epochs = 100
ROC_AUC_SCORE: 0.761463516581
Learning rate = 1.000000, Number of epochs = 10
ROC_AUC_SCORE: 0.769446179886
Learning rate = 1.000000, Number of epochs = 20
ROC_AUC_SCORE: 0.760695860491
Learning rate = 1.000000, Number of epochs = 50
ROC_AUC_SCORE: 0.757625236131
Learning rate = 1.000000, Number of epochs = 100
ROC_AUC_SCORE: 0.76041892828


### Run fasttext on test preprocessed data for the best parameters

In [6]:
cls = fasttext.supervised("./train.txt", "./model", lr=0.1, epoch=20, loss='softmax')

labels = []
with open('./test.txt', 'r') as f:
    for line in f:
        labels.append(cls.predict([line[:-1]]))

labels_ = np.concatenate(labels)
labels_ = np.concatenate(labels_)
test_labels = np.asarray(labels_).astype('int')

### Write the result to the required submission form

In [7]:
filename = './test.csv'
test_data_ = pd.read_csv(filename)
test_data_.head()

Unnamed: 0,id,Date,Comment
0,1,20120603163526Z,"""like this if you are a tribe fan"""
1,2,20120531215447Z,"""you're idiot......................."""
2,3,20120823164228Z,"""I am a woman Babs, and the only ""war on women..."
3,4,20120826010752Z,"""WOW & YOU BENEFITTED SO MANY WINS THIS YEAR F..."
4,5,20120602223825Z,"""haha green me red you now loser whos winning ..."


In [8]:
test_data_ = test_data_.drop('id', axis=1)
test_data_.insert(loc=0, column='Insult', value=test_labels)
test_data_.head()

Unnamed: 0,Insult,Date,Comment
0,0,20120603163526Z,"""like this if you are a tribe fan"""
1,1,20120531215447Z,"""you're idiot......................."""
2,0,20120823164228Z,"""I am a woman Babs, and the only ""war on women..."
3,0,20120826010752Z,"""WOW & YOU BENEFITTED SO MANY WINS THIS YEAR F..."
4,1,20120602223825Z,"""haha green me red you now loser whos winning ..."


In [9]:
test_data_.to_csv('./insult_subm_v3.csv', index=False)