### Reference 
https://www.kaggle.com/mabrek/simple-fasttext-pretrained

In [1]:
import warnings
import traceback
import sys
from datetime import datetime
import json

import numpy as np
import pandas as pd
from timeit import default_timer as timer

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, log_loss, average_precision_score
from sklearn.model_selection import ParameterSampler
from scipy.stats import randint as randint
from scipy.stats import uniform as uniform
from sklearn.utils import check_random_state

import fastText

In [2]:
# Characters that clean_fasttext pads with a space on each side.
# The list is iterated in order; every entry here is a single character.
PUNCTS_FASTTEXT = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '{', '}', '©', '^', '®',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]


def clean_fasttext(x):
    """Pad every occurrence of each PUNCTS_FASTTEXT entry with spaces.

    The argument is first coerced with ``str``, so non-string values are
    processed as their string representation. Entries are applied one at a
    time, in list order, and the padded text is returned.
    """
    text = str(x)
    for mark in PUNCTS_FASTTEXT:
        # Skipping marks that do not occur yields the same string as an
        # unconditional replace.
        if mark in text:
            text = text.replace(mark, ' ' + mark + ' ')
    return text


In [3]:
def predict_fasttext_single(model, x):
    """Return the probability ``model`` assigns to ``__label__1`` for text ``x``.

    The two highest-ranked predictions are requested and searched for
    ``__label__1``.

    Parameters
    ----------
    model : object exposing ``predict(text, k)`` that returns ``(labels, probs)``
    x : str
        Text to classify.

    Returns
    -------
    The probability paired with ``__label__1``, or ``0.0`` when that label is
    not among the returned predictions.
    """
    labels, probs = model.predict(x, 2)
    for label, prob in zip(labels, probs):
        if label == '__label__1':
            return prob
    # predict() returned fewer than two labels and none was __label__1
    # (presumably fastText drops labels with negligible probability -- TODO
    # confirm). Indexing probs[1] here would raise IndexError, so report zero.
    return 0.0


def predict_fasttext(model, df):
    """Score each entry of ``df.cleaned_text`` with ``predict_fasttext_single``.

    Returns the Series produced by ``apply``, indexed like ``df``.
    """
    def _score(text):
        return predict_fasttext_single(model, text)

    cleaned = df.cleaned_text
    return cleaned.apply(_score)


# from https://www.kaggle.com/c/quora-insincere-questions-classification/discussion/76391
def scoring(y_true, y_proba, verbose=True, random_state=None):
    """Estimate an F1-maximising decision threshold with repeated stratified CV.

    For each of the 5 x 10 folds, the threshold that maximises F1 on the
    'train' part is found from the precision-recall curve and then scored
    with F1 on the 'test' part (predicting positive when
    ``y_proba >= threshold``).

    Parameters
    ----------
    y_true : array-like of 0/1 labels
    y_proba : array-like of scores for the positive class, aligned with ``y_true``
    verbose : bool
        Print the averaged threshold and score when True.
    random_state : int, RandomState or None
        Forwarded to ``RepeatedStratifiedKFold``; pass an int for repeatable
        splits. The default ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    (best_th, score) : mean of the per-fold thresholds and mean of the
    per-fold F1 scores.
    """
    from sklearn.metrics import precision_recall_curve, f1_score
    from sklearn.model_selection import RepeatedStratifiedKFold

    # The fold indices are positional. Plain arrays make the indexing below
    # positional for any input (a Series with a non-default index, a list, ...)
    # instead of depending on the caller's index labels.
    y_true = np.asarray(y_true)
    y_proba = np.asarray(y_proba)

    def threshold_search(labels, probas):
        precision, recall, thresholds = precision_recall_curve(labels, probas)
        # precision/recall have one more entry than thresholds (the final
        # precision=1, recall=0 point); give that point a threshold above any
        # probability so the arrays line up.
        thresholds = np.append(thresholds, 1.001)
        # recall (or precision) can be 0, which makes 1/x infinite and F equal
        # to 0 -- the intended value -- so only the warning is silenced.
        with np.errstate(divide='ignore'):
            F = 2 / (1 / precision + 1 / recall)
        return thresholds[np.argmax(F)]

    rkf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=random_state)

    scores = []
    ths = []
    for train_index, test_index in rkf.split(y_true, y_true):
        y_prob_train, y_prob_test = y_proba[train_index], y_proba[test_index]
        y_true_train, y_true_test = y_true[train_index], y_true[test_index]

        # determine best threshold on 'train' part
        best_threshold = threshold_search(y_true_train, y_prob_train)

        # use this threshold on 'test' part for score
        sc = f1_score(y_true_test, (y_prob_test >= best_threshold).astype(int))
        scores.append(sc)
        ths.append(best_threshold)

    best_th = np.mean(ths)
    score = np.mean(scores)

    if verbose:
        print(f'Best threshold: {np.round(best_th, 4)}, Score: {np.round(score,5)}')

    return best_th, score

In [4]:
# Scratch directory for the fastText training file written below
# (-p: no error if the directory already exists).
!mkdir -p ../tmp

In [5]:
# Shuffle the training rows once with a fixed seed so the row order, and hence
# the training file, is the same on every run.
train = pd.read_csv("../input/train.csv").sample(frac=1, random_state=3465).reset_index(drop=True)

# Newlines are flattened to spaces because the training file holds one example
# per line. regex=False pins a literal replacement regardless of the pandas
# default.
train['cleaned_text'] = train["question_text"].apply(clean_fasttext).str.replace('\n', ' ', regex=False)
# One example per line: "__label__<target> <text>".
fasttext_labeled = '__label__' + train.target.astype(str) + ' ' + train.cleaned_text

# Write UTF-8 explicitly instead of relying on the default encoding: the
# cleaned text can contain the non-ASCII characters listed in PUNCTS_FASTTEXT.
np.savetxt('../tmp/train.txt', fasttext_labeled.values, fmt='%s', encoding='utf-8')


test = pd.read_csv("../input/test.csv", index_col='qid')
test['cleaned_text'] = test["question_text"].apply(clean_fasttext).str.replace('\n', ' ', regex=False)

In [6]:
# Keyword arguments forwarded to fastText.train_supervised.
# `dim` is 300 and the pretrained vector file is the 300d wiki-news set;
# presumably the two must agree -- confirm before changing either.
parameters = dict(
    lr=0.161195,
    dim=300,
    ws=5,
    epoch=10,
    minCount=80,
    minCountLabel=0,
    minn=4,
    maxn=5,
    neg=5,
    wordNgrams=3,
    loss="hs",
    bucket=2000000,
    thread=4,
    lrUpdateRate=100,
    t=1e-4,
    pretrainedVectors='../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec',
    verbose=0,
)

In [7]:
# Fit the supervised classifier on the labelled training file, using the
# settings collected in `parameters`.
model = fastText.train_supervised(input='../tmp/train.txt', **parameters)

In [8]:
# Probability of `__label__1` for every training and test question.
train_pred = predict_fasttext(model, train)
test_pred = predict_fasttext(model, test)

In [9]:
# Choose the decision threshold from the predictions on the training set.
# NOTE(review): `train_pred` comes from the model that was fitted on these same
# rows, so both the threshold and the F1 are in-sample estimates -- confirm
# against held-out predictions before trusting them.
best_th, f1 = scoring(train.target, train_pred, verbose=False)



In [10]:
# Show the selected threshold and the mean F1 returned by `scoring`.
best_th, f1

(0.40829826772212985, 0.9345048800263016)

In [11]:
# Binarise the test probabilities. `>=` matches the comparison `scoring` uses
# when it evaluates a threshold, so a score exactly at the threshold is
# treated the same way in both places.
pred = (test_pred >= best_th).astype('int').rename('prediction')

In [12]:
# Write the submission file: the index of `pred` plus its single
# `prediction` column.
submission = pd.DataFrame(pred)
submission.to_csv('submission.csv')