In [16]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [17]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [18]:
!pip install simpletransformers

In [19]:
# Load the competition's train and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nlp-getting-started/train.csv",
        "/kaggle/input/nlp-getting-started/test.csv",
    )
)

In [20]:
import string
from nltk.corpus import stopwords

# Informal tokens treated as stopwords in addition to NLTK's English list.
_EXTRA_STOPWORDS = ('u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure')
# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# Stopword set shared by all calls; built lazily on first use.
_STOPWORD_SET = None


def _get_stopwords():
    """Return the combined stopword set, loading NLTK's list only once."""
    global _STOPWORD_SET
    if _STOPWORD_SET is None:
        _STOPWORD_SET = frozenset(stopwords.words('english')) | frozenset(_EXTRA_STOPWORDS)
    return _STOPWORD_SET


def text_process(mess):
    """
    Clean a single string of text.

    Steps performed:
    1. Remove every character found in ``string.punctuation``.
    2. Split on whitespace and lowercase each word.
    3. Drop words that are English stopwords (NLTK list plus a few
       informal extras such as 'u', 'ur', 'im').

    Parameters
    ----------
    mess : str
        Raw text to clean.

    Returns
    -------
    str
        The remaining lowercase words joined by single spaces
        (an empty string when nothing remains).
    """
    stop_set = _get_stopwords()

    # str.translate strips all punctuation in one pass.
    no_punct = mess.translate(_PUNCTUATION_TABLE)

    # Lowercase once per word, then filter against the set (O(1) membership).
    lowered = (word.lower() for word in no_punct.split())
    return ' '.join(word for word in lowered if word not in stop_set)

def preprocess(col):
    """
    Build the cleaned text and the hashtag list for every entry of a text column.

    Parameters
    ----------
    col : pandas.Series
        Series of raw text strings.

    Returns
    -------
    tuple of (list, list)
        First element: result of ``text_process`` for each entry.
        Second element: for each entry, the lowercased whitespace-separated
        tokens that start with "#".
    """
    def _hashtags(text):
        # Tokens are taken from the raw text, so the leading "#" is kept.
        return [token.lower() for token in text.split() if token.startswith("#")]

    cleaned = col.apply(text_process)
    tags = col.apply(_hashtags)
    return cleaned.to_list(), tags.to_list()

In [21]:
# Attach the cleaned text and the extracted hashtags to both data splits.
for frame in (train, test):
    frame['clean'], frame['hashtags'] = preprocess(frame.text)

In [22]:
# Preview the first rows of the training frame, including the new 'clean' and 'hashtags' columns.
train.head()

In [23]:
# Inspect the type of the 'clean' column (exploratory check; not needed by later cells).
type(train.clean)

In [24]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs

# Build a DistilBERT classifier from the 'distilbert-base-uncased-finetuned-sst-2-english'
# weights without passing any custom args.
# NOTE(review): `model` is re-created with custom args before training further down,
# so this instance appears to be used only for inspecting `model.args` — confirm.
model = ClassificationModel('distilbert', 'distilbert-base-uncased-finetuned-sst-2-english')

In [None]:
# Display the argument object of the model created without custom args.
model.args

In [25]:
# Training options passed to ClassificationModel via its `args` parameter.
args = dict(
    learning_rate=4e-05,
    overwrite_output_dir=True,
    num_train_epochs=10,
)

In [26]:
def _to_frame(texts, labels):
    """Pair a text Series with its label Series as a two-column DataFrame."""
    return pd.DataFrame({'clean': texts, 'target': labels})


# Hold out 20% of the cleaned training tweets for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    train['clean'],
    train['target'],
    test_size=0.2,
    random_state=42,
)
train_data = _to_frame(X_train, y_train)
val_data = _to_frame(X_val, y_val)
print(train_data.head())

In [27]:
# Re-create the classifier with the custom training args, fit it on the training
# split, then score it on the validation split with accuracy as an extra metric.
model = ClassificationModel(
    'distilbert',
    'distilbert-base-uncased-finetuned-sst-2-english',
    args=args,
)
model.train_model(train_data)

eval_output = model.eval_model(val_data, acc=sklearn.metrics.accuracy_score)
result, outputs, wrong_predictions = eval_output
print(result['acc'])

In [32]:
# Run the trained model on the validation texts (passed as a plain Python list).
# NOTE(review): presumably `predictions` holds hard class labels and `raw_outputs`
# the per-class raw scores — confirm against the simpletransformers documentation.
predictions, raw_outputs = model.predict(X_val.to_list())

In [None]:
# Display the metrics object returned by eval_model (it is indexed with 'acc' above).
result

In [15]:
# Display the second value returned by eval_model for the validation data.
# NOTE(review): this dumps the whole object; consider showing only a slice.
outputs

In [34]:
from sklearn.metrics import classification_report

In [35]:
from tqdm import tqdm

# Candidate decision thresholds, scanned from strict to lenient:
# steps of 0.0001 from 1 down to 0.9, then steps of 0.001 from 0.9 down to 0.
thresholds = np.append(np.arange(1,0.9, -0.0001), np.arange(0.9,0, -0.001))

# Thresholding needs probabilities, not the hard labels stored in `predictions`.
# Turn the raw model outputs into P(class 1) with a numerically stable softmax.
# NOTE(review): assumes raw_outputs is (n_samples, 2) per-class scores — confirm.
logits = np.asarray(raw_outputs, dtype=float)
exp_logits = np.exp(logits - logits.max(axis=1, keepdims=True))
positive_probs = exp_logits[:, 1] / exp_logits.sum(axis=1)

precision_scores, recall_scores = list(), list()
for threshold in tqdm(thresholds):
    prob_preds = np.where(positive_probs >= threshold, 1, 0)
    # Score the thresholded predictions (the original scored the fixed hard labels,
    # so every threshold produced the same numbers).
    # zero_division=0 keeps precision at 0 without warnings when nothing is predicted positive.
    temp_classification_report = classification_report(
        y_true=y_val, y_pred=prob_preds, output_dict=True, zero_division=0
    )['1']
    precision_scores.append(round(temp_classification_report['precision'], 3))
    recall_scores.append(round(temp_classification_report['recall'], 3))

# (precision, recall, threshold) triples in the same strict-to-lenient order.
sweep = list(zip(precision_scores, recall_scores, thresholds))
report_line = "Comp. questions: max Prec. {:.3f} with Rec. {:.3f} at thresh. {:.6f}"

# The last matching entry is the most lenient threshold that still qualifies.
perfect = [item for item in sweep if item[0] == 1]
if perfect:
    print(report_line.format(*perfect[-1]))
else:
    print("Model doesn't reach precision of 1.00")

for floor in (0.95, 0.90):
    band = [item for item in sweep if floor < item[0] < 1]
    if band:
        band_precision, band_recall, band_threshold = band[-1]
        print(report_line.format(band_precision, band_recall, band_threshold))
        print("F1: {:.3f}".format(2*band_precision*band_recall/(band_precision + band_recall)))
    else:
        print("Model doesn't reach precision of {:.2f}".format(floor))

# Try Logistic Regression

In [64]:
# logistic regression on bag-of-words features in scikit-learn
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Bag-of-words features feeding a logistic regression classifier.
model = LogisticRegression()
preprocessor = CountVectorizer()
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Evaluate with repeated stratified 10-fold cross-validation.
# The whole pipeline is cross-validated so the vectorizer is fit on each
# training fold only; vectorizing the full training set first would leak
# vocabulary from the held-out folds into the features.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(my_pipeline, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# Fit the final pipeline on the full training split
# (cross_val_score works on clones, so this fit is still required).
my_pipeline.fit(X_train, y_train)

# Kept for backward compatibility: training features from the fitted vectorizer.
X_train_vec = preprocessor.transform(X_train.to_list())

# Validation predictions: class probabilities and hard labels.
X_val_texts = X_val.to_list()
predictions = my_pipeline.predict_proba(X_val_texts)
y_pred = my_pipeline.predict(X_val_texts)

In [65]:
# Take column 1 of the predict_proba output as a Python list.
# NOTE(review): presumably the probability of label 1 — confirm the column order via model.classes_.
pred_prob = list(predictions[:,1])

In [57]:
# Display the class labels known to the fitted classifier.
model.classes_

In [66]:
# Candidate decision thresholds, scanned from strict to lenient:
# steps of 0.0001 from 1 down to 0.9, then steps of 0.001 from 0.9 down to 0.
thresholds = np.append(np.arange(1,0.9, -0.0001), np.arange(0.9,0, -0.001))

# pred_prob is a plain list; convert once so the comparison below is vectorized.
positive_probs = np.asarray(pred_prob, dtype=float)

precision_scores, recall_scores = list(), list()
for threshold in tqdm(thresholds):
    prob_preds = np.where(positive_probs >= threshold, 1, 0)
    # Score the thresholded predictions (the original scored the fixed y_pred,
    # so every threshold produced the same numbers).
    # zero_division=0 keeps precision at 0 without warnings when nothing is predicted positive.
    temp_classification_report = classification_report(
        y_true=y_val, y_pred=prob_preds, output_dict=True, zero_division=0
    )['1']
    precision_scores.append(round(temp_classification_report['precision'], 3))
    recall_scores.append(round(temp_classification_report['recall'], 3))

# (precision, recall, threshold) triples in the same strict-to-lenient order.
sweep = list(zip(precision_scores, recall_scores, thresholds))
report_line = "Comp. questions: max Prec. {:.3f} with Rec. {:.3f} at thresh. {:.6f}"

# The last matching entry is the most lenient threshold that still qualifies.
perfect = [item for item in sweep if item[0] == 1]
if perfect:
    print(report_line.format(*perfect[-1]))
else:
    print("Model doesn't reach precision of 1.00")

for floor in (0.95, 0.90):
    band = [item for item in sweep if floor < item[0] < 1]
    if band:
        band_precision, band_recall, band_threshold = band[-1]
        print(report_line.format(band_precision, band_recall, band_threshold))
        print("F1: {:.3f}".format(2*band_precision*band_recall/(band_precision + band_recall)))
    else:
        print("Model doesn't reach precision of {:.2f}".format(floor))

# Try Gradient Boosting (scikit-learn GradientBoostingClassifier)

In [67]:
# gradient boosting for classification in scikit-learn
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline

# Bag-of-words features feeding a gradient boosting classifier.
# random_state is fixed so repeated runs give the same model.
model = GradientBoostingClassifier(random_state=42)
preprocessor = CountVectorizer()
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Evaluate with repeated stratified 10-fold cross-validation.
# The whole pipeline is cross-validated so the vectorizer is fit on each
# training fold only; vectorizing the full training set first would leak
# vocabulary from the held-out folds into the features.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(my_pipeline, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# Fit the final pipeline on the full training split
# (cross_val_score works on clones, so this fit is still required).
my_pipeline.fit(X_train, y_train)

# Kept for backward compatibility: training features from the fitted vectorizer.
X_train_vec = preprocessor.transform(X_train.to_list())

# Validation predictions: class probabilities, hard labels, and column 1 as a list.
X_val_texts = X_val.to_list()
predictions = my_pipeline.predict_proba(X_val_texts)
y_pred = my_pipeline.predict(X_val_texts)
pred_prob = list(predictions[:,1])

In [68]:
# Candidate decision thresholds, scanned from strict to lenient:
# steps of 0.0001 from 1 down to 0.9, then steps of 0.001 from 0.9 down to 0.
thresholds = np.append(np.arange(1,0.9, -0.0001), np.arange(0.9,0, -0.001))

# pred_prob is a plain list; convert once so the comparison below is vectorized.
positive_probs = np.asarray(pred_prob, dtype=float)

precision_scores, recall_scores = list(), list()
for threshold in tqdm(thresholds):
    prob_preds = np.where(positive_probs >= threshold, 1, 0)
    # Score the thresholded predictions (the original scored the fixed y_pred,
    # so every threshold produced the same numbers).
    # zero_division=0 keeps precision at 0 without warnings when nothing is predicted positive.
    temp_classification_report = classification_report(
        y_true=y_val, y_pred=prob_preds, output_dict=True, zero_division=0
    )['1']
    precision_scores.append(round(temp_classification_report['precision'], 3))
    recall_scores.append(round(temp_classification_report['recall'], 3))

# (precision, recall, threshold) triples in the same strict-to-lenient order.
sweep = list(zip(precision_scores, recall_scores, thresholds))
report_line = "Comp. questions: max Prec. {:.3f} with Rec. {:.3f} at thresh. {:.6f}"

# The last matching entry is the most lenient threshold that still qualifies.
perfect = [item for item in sweep if item[0] == 1]
if perfect:
    print(report_line.format(*perfect[-1]))
else:
    print("Model doesn't reach precision of 1.00")

for floor in (0.95, 0.90):
    band = [item for item in sweep if floor < item[0] < 1]
    if band:
        band_precision, band_recall, band_threshold = band[-1]
        print(report_line.format(band_precision, band_recall, band_threshold))
        print("F1: {:.3f}".format(2*band_precision*band_recall/(band_precision + band_recall)))
    else:
        print("Model doesn't reach precision of {:.2f}".format(floor))