In [11]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
import string
from nltk.corpus import stopwords

In [13]:
import sklearn
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [14]:
!pip install simpletransformers

In [15]:
# Load the competition's training set and test set from the Kaggle input directory.
train, test = (
    pd.read_csv("/kaggle/input/nlp-getting-started/train.csv"),
    pd.read_csv("/kaggle/input/nlp-getting-started/test.csv"),
)

In [16]:
def text_process(mess):
    """
    Clean a raw text string for bag-of-words style modelling.

    Takes in a string of text, then performs the following:
    1. Remove all punctuation (every character in ``string.punctuation``)
    2. Lower-case the remaining whitespace-separated words
    3. Remove English stopwords plus a few chat abbreviations
    4. Return the surviving words joined by single spaces

    Parameters
    ----------
    mess : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, lower-cased text as one string (not a list).
    """
    # A set gives constant-time membership tests instead of scanning a list for every word.
    STOPWORDS = set(stopwords.words('english')) | {'u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure'}

    # Drop every punctuation character in a single pass over the string.
    nopunc = mess.translate(str.maketrans('', '', string.punctuation))

    # NOTE(review): punctuation is removed before the stopword filter, so any stopword
    # that itself contains punctuation (e.g. an apostrophe) can never match here - confirm
    # whether that is intended.
    lowered = (word.lower() for word in nopunc.split())
    return ' '.join(word for word in lowered if word not in STOPWORDS)

#maybe use later (or delete)
def preprocess(col):
    """Return two parallel lists for a text column: cleaned texts and per-row hashtags.

    The first list holds ``text_process`` applied to each entry of ``col``; the
    second holds, for each entry, the lower-cased whitespace-separated tokens
    that start with ``#``.
    """
    def _hashtags(text):
        # Keep only the tokens that begin with '#', lower-cased.
        return [token.lower() for token in text.split() if token.startswith("#")]

    cleaned_texts = col.apply(text_process).to_list()
    hashtag_lists = col.apply(_hashtags).to_list()
    return cleaned_texts, hashtag_lists

In [17]:
#preprocess the text in train and test
# Each frame gets a 'clean' column holding text_process applied to its 'text' column.
for frame in (train, test):
    frame['clean'] = frame['text'].apply(text_process)

In [18]:
# Preview the first rows of the training frame, which now includes the 'clean' column.
train.head()

# Function to find better thresholds for predictions

In [40]:
from sklearn.metrics import classification_report
from tqdm import tqdm

#arguments: current model, targets of the val dataset (y_val) and predictions for the val dataset (predictions)
def precision_recall(mod, y_val, predictions):
    """Sweep decision thresholds and print the precision/recall trade-off for class 1.

    For every threshold the scores in ``predictions`` are binarised
    (``score >= threshold`` -> 1, otherwise 0) and compared with ``y_val`` using
    ``classification_report``. The function then prints:

    * the last 10 (precision, recall, threshold) tuples with non-zero precision,
    * the last 10 tuples with precision exactly 1,
    * the lowest threshold that still gives precision 1,
    * the lowest threshold with precision in (0.95, 1) and its F1,
    * the lowest threshold with precision in (0.90, 1) and its F1.

    Parameters
    ----------
    mod : object
        Label of the model; printed as a header.
    y_val : array-like
        True labels of the validation set. The report entry for label ``'1'`` is read.
    predictions : array-like of float
        One score per validation sample for the positive class.

    Returns
    -------
    None
        Results are only printed.
    """
    print(mod)

    # Accept lists / Series as well as arrays so the comparison below is always element-wise.
    predictions = np.asarray(predictions)

    # Thresholds run from 1 down to 0.9 in steps of 0.0001 (fine grid for very confident
    # predictions), then from 0.9 down towards 0 in steps of 0.001.
    thresholds = np.append(np.arange(1,0.9, -0.0001), np.arange(0.9,0, -0.001))

    # Precision and recall of class 1 for each threshold, in threshold order.
    precision_scores, recall_scores = list(), list()

    for threshold in tqdm(thresholds):
        # Binarise: score >= current threshold -> 1, otherwise 0.
        prob_preds = np.where(predictions>=threshold, 1, 0)

        # Precision / recall of the positive class for the binarised predictions.
        temp_classification_report = classification_report(y_true=y_val, y_pred=prob_preds, output_dict=True)['1']

        precision_scores.append(round(temp_classification_report['precision'], 3))
        recall_scores.append(round(temp_classification_report['recall'], 3))

    # (precision, recall, threshold) tuples, ordered from the highest to the lowest threshold.
    scored = list(zip(precision_scores, recall_scores, list(thresholds)))

    def _report(best, with_f1=True):
        # Print one operating point and, optionally, the F1 derived from its precision/recall.
        precision, recall, threshold = best
        print("Comp. questions: max Prec. {:.3f} with Rec. {:.3f} at thresh. {:.6f}".format(precision, recall, threshold))
        if with_f1:
            print("F1: {:.3f}".format(2*precision*recall/(precision + recall)))

    #res is list of tuples (precision, recall, threshold) where precision>0
    res = [item for item in scored if item[0] != 0]
    print("Last 10 tuples: ", res[-10:])

    #res_precision1 is list of tuples (precision, recall, threshold) where precision==1
    res_precision1 = [item for item in scored if item[0] == 1]
    print("Last 10 tuples for precision=1: ", res_precision1[-10:])

    # The last element of each filtered list carries the lowest threshold in that band.
    if res_precision1:
        _report(res_precision1[-1], with_f1=False)
    else:
        print("Model doesn't reach precision of 1.00")

    #res_precision095 is list of (precision, recall, threshold) tuples for precision between 0.95 and 1
    res_precision095 = [item for item in res if 0.95 < item[0] < 1]
    if res_precision095:
        _report(res_precision095[-1])
    else:
        print("Model doesn't reach precision of 0.95")

    #for precision between 0.9 and 1 (filters `res`; the original referenced an undefined name here)
    res_precision09 = [item for item in res if 0.90 < item[0] < 1]
    if res_precision09:
        _report(res_precision09[-1])
    else:
        print("Model doesn't reach precision of 0.90")

# Try DistilBERT

In [22]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs

# Build a DistilBERT classifier with default arguments so they can be inspected;
# `model` is re-created with custom `args` in a later cell.
model = ClassificationModel('distilbert', 'distilbert-base-uncased-finetuned-sst-2-english')

In [23]:
# Display the argument settings currently attached to the model.
model.args

In [24]:
# Training overrides handed to ClassificationModel via its `args` parameter.
args = dict(
    learning_rate=4e-05,
    overwrite_output_dir=True,
    num_train_epochs=10,
)

In [25]:
def _as_frame(texts, labels):
    # Pair a text split with its labels under the column names 'clean' and 'target'.
    return pd.DataFrame({'clean': texts, 'target': labels})

# Hold out 20% of the cleaned training texts for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    train['clean'],
    train['target'],
    random_state=42,
    test_size=0.2,
)
train_data, val_data = _as_frame(X_train, y_train), _as_frame(X_val, y_val)
print(train_data.head())

In [26]:
# Re-create the DistilBERT classifier, this time passing the custom `args` dict,
# and train it on the training split.
model = ClassificationModel('distilbert', 'distilbert-base-uncased-finetuned-sst-2-english', args=args)
model.train_model(train_data)
# Evaluate on the validation split; sklearn's accuracy_score is passed under the
# keyword `acc`, and the 'acc' entry of `result` is printed below.
# NOTE(review): `wrong_predictions` is not used anywhere in the visible notebook.
result, outputs, wrong_predictions = model.eval_model(val_data, acc=sklearn.metrics.accuracy_score)
print(result['acc'])

In [27]:
# Predict on the validation texts; keeps both return values of model.predict
# (presumably class labels and raw per-class scores - confirm against the simpletransformers docs).
predictions, raw_outputs = model.predict(X_val.to_list())

In [28]:
# Display the evaluation metrics returned by model.eval_model.
result

In [29]:
# Display the second value returned by model.eval_model for the validation set.
outputs

In [30]:
# `predictions` from model.predict holds hard class labels, so sweeping thresholds over it
# yields the same result at every threshold. Convert the raw model outputs (one column of
# logits per class) into P(class 1) with a numerically stable softmax and sweep over that.
distilbert_logits = np.asarray(raw_outputs, dtype=float)
distilbert_exp = np.exp(distilbert_logits - distilbert_logits.max(axis=1, keepdims=True))
distilbert_prob = distilbert_exp[:, 1] / distilbert_exp.sum(axis=1)
precision_recall("DistilBERT", y_val, distilbert_prob)

# Try Logistic Regression

In [31]:
from sklearn.linear_model import LogisticRegression


# Bag-of-words features followed by a logistic regression classifier.
model = LogisticRegression()
preprocessor = CountVectorizer()
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Preprocessing of training data, fit model
my_pipeline.fit(X_train, y_train)

# evaluate the model
# The whole pipeline is cross-validated (cross_val_score clones it), so every fold builds
# its vocabulary from its own training part only instead of from all training rows.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(my_pipeline, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# Vectorised training matrix, produced with the vectoriser already fitted inside the
# pipeline (no second fit needed).
X_train_vec = preprocessor.transform(X_train.to_list())

# Preprocessing of validation data, get predictions
predictions = my_pipeline.predict_proba(X_val.to_list())
y_pred = my_pipeline.predict(X_val.to_list())

In [32]:
# Display the class-probability array returned by my_pipeline.predict_proba for the validation set.
predictions

In [41]:
# Column 1 of the predict_proba output is used as the positive-class score.
positive_column = predictions[:, 1]
pred_prob = [score for score in positive_column]
precision_recall("Logistic Regression", y_val.to_list(), pred_prob)

# Try Gradient Boosting (scikit-learn `GradientBoostingClassifier`; labelled "LightGBM" in this notebook)

In [42]:
# gradient boosting for classification in scikit-learn
from sklearn.ensemble import GradientBoostingClassifier


# Bag-of-words features followed by scikit-learn's gradient boosting classifier.
model = GradientBoostingClassifier()
preprocessor = CountVectorizer()
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Preprocessing of training data, fit model
my_pipeline.fit(X_train, y_train)

# evaluate the model
# The whole pipeline is cross-validated (cross_val_score clones it), so every fold builds
# its vocabulary from its own training part only instead of from all training rows.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(my_pipeline, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# Vectorised training matrix, produced with the vectoriser already fitted inside the
# pipeline (no second fit needed).
X_train_vec = preprocessor.transform(X_train.to_list())

# Preprocessing of validation data, get predictions
predictions = my_pipeline.predict_proba(X_val.to_list())
y_pred = my_pipeline.predict(X_val.to_list())
# Column 1 of predict_proba is used as the positive-class score.
pred_prob = list(predictions[:,1])

In [43]:
# Threshold sweep for the gradient boosting pipeline fitted in the previous cell.
# NOTE(review): the label says "Light GBM", but the model is sklearn's GradientBoostingClassifier.
precision_recall("Light GBM", y_val, pred_prob)

# Create an ensemble based on selected thresholds for each model

In [57]:
def ensemble_preparation(mod, threshold):
    """Fit one bag-of-words pipeline and return thresholded test-set predictions.

    Parameters
    ----------
    mod : str
        ``'logistic'`` selects ``LogisticRegression``; any other value selects
        ``GradientBoostingClassifier``.
    threshold : float
        A test sample is labelled 1 when its column-1 ``predict_proba`` score is
        greater than or equal to this value, otherwise 0.

    Returns
    -------
    numpy.ndarray
        Array of 0/1 predictions, one per row of the test CSV.

    Notes
    -----
    The pipeline is fitted on the module-level ``X_train`` / ``y_train``. The test
    texts are passed through ``text_process`` first, because ``X_train`` is built
    from the cleaned ``'clean'`` column; feeding raw text would give the vectoriser
    tokens in a different form from the ones it was fitted on.
    """
    if mod == 'logistic':
        model = LogisticRegression()
    else:
        model = GradientBoostingClassifier()

    my_pipeline = Pipeline(steps=[('preprocessor', CountVectorizer()),
                                  ('model', model)
                                 ])

    # Preprocessing of training data, fit model
    my_pipeline.fit(X_train, y_train)

    # Get predictions for test set, cleaned the same way as the training texts
    test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
    X_test = test['text'].apply(text_process).to_list()
    positive_proba = my_pipeline.predict_proba(X_test)[:, 1]

    #define predictions for a model based on the chosen threshold
    return np.where(positive_proba >= threshold, 1, 0)

In [58]:
# Thresholds picked for precision 0.95: logistic regression first, then the model
# this notebook labels LightGBM.
preds_logistic, preds_lightGBM = [
    ensemble_preparation(name, cutoff)
    for name, cutoff in (("logistic", 0.802), ("lightGBM", 0.644))
]

In [61]:
#redefine predictions based on thresholds for both models
# Agreeing votes are kept as they are; on a disagreement a single positive vote wins.
# Pairs that disagree without any vote equal to 1 contribute nothing.
predictions = [
    x if x == y else 1
    for x, y in zip(preds_logistic, preds_lightGBM)
    if x == y or x == 1 or y == 1
]

# Submit results

In [None]:
# Load the submission template, overwrite its "target" column with the ensemble
# predictions and write the result to disk without the index.
sample_submission = pd.read_csv(
    "/kaggle/input/nlp-getting-started/sample_submission.csv"
).assign(target=predictions)
sample_submission.to_csv("submission.csv", index=False)