In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from nltk.corpus import stopwords
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/clickbait-project/validation.csv
/kaggle/input/clickbait-project/train.csv
/kaggle/input/clickbait-project/test.csv


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from xgboost import XGBClassifier


In [4]:
stopwords = set(stopwords.words('english'))
#https://catriscode.com/2021/05/01/tweets-cleaning-with-python/
def clean_data(tweet):
    if type(tweet) == np.float:
        return ""
    temp = tweet.lower()
    temp = re.sub("'", "", temp) # to avoid removing contractions in english
    temp = re.sub("@[A-Za-z0-9_]+","", temp)
    temp = re.sub("#[A-Za-z0-9_]+","", temp)
    temp = re.sub(r'http\S+', '', temp)
    temp = re.sub('[()!?]', ' ', temp)
    temp = re.sub('\[.*?\]',' ', temp)
    temp = re.sub("[^a-z0-9]"," ", temp)
    temp = temp.split()
    temp = [w for w in temp if not w in stopwords]
    temp = " ".join(word for word in temp)
    return temp

In [5]:
#https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
def deEmojify(text):
#     print(text)
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

In [6]:
train_data = pd.read_csv('/kaggle/input/clickbait-project/train.csv')
val_data = pd.read_csv('/kaggle/input/clickbait-project/validation.csv')
test_data = pd.read_csv('/kaggle/input/clickbait-project/test.csv')
train_data = train_data.dropna()
val_data = val_data.dropna()
test_data = test_data.dropna()

In [7]:
train_data.head()

Unnamed: 0,postText,truthClass
0,UK’s response to modern slavery leaving victim...,no-clickbait
1,this is good,clickbait
2,"The ""forgotten"" Trump roast: Relive his brutal...",no-clickbait
3,Meet the happiest #dog in the world!,clickbait
4,Tokyo's subway is shut down amid fears over an...,no-clickbait


In [8]:
train_x = train_data['postText'].tolist()
train_y = train_data['truthClass'].tolist()
val_x = val_data['postText'].tolist()
val_y = val_data['truthClass'].tolist()
test_x = test_data['postText'].tolist()
test_y = test_data['truthClass'].tolist()

In [9]:
train_x = list(map(clean_data, list(map(deEmojify, train_x))))
val_x = list(map(clean_data, list(map(deEmojify, val_x))))
test_x = list(map(clean_data, list(map(deEmojify, test_x))))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  after removing the cwd from sys.path.


In [10]:
# Create feature vectors
vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(train_x)
test_vectors = vectorizer.transform(test_x)
val_vectors = vectorizer.transform(val_x)

In [11]:
classifier_linear = svm.SVC(kernel='rbf')
classifier_linear.fit(train_vectors, train_y)


SVC()

In [12]:
pred_test = classifier_linear.predict(test_vectors)
pred_val = classifier_linear.predict(val_vectors)

In [13]:
report_test = classification_report(test_y, pred_test, output_dict=True)
report_val = classification_report(val_y, pred_val, output_dict=True)

In [14]:
report_test

{'clickbait': {'precision': 0.7538699690402477,
  'recall': 0.32772543741588156,
  'f1-score': 0.45684803001876173,
  'support': 4458},
 'no-clickbait': {'precision': 0.8234462444771723,
  'recall': 0.9670010377032169,
  'f1-score': 0.889468660515431,
  'support': 14455},
 'accuracy': 0.8163168191191245,
 'macro avg': {'precision': 0.78865810675871,
  'recall': 0.6473632375595493,
  'f1-score': 0.6731583452670964,
  'support': 18913},
 'weighted avg': {'precision': 0.8070463589012293,
  'recall': 0.8163168191191245,
  'f1-score': 0.7874952680999416,
  'support': 18913}}

In [15]:
report_val

{'clickbait': {'precision': 0.7560975609756098,
  'recall': 0.16272965879265092,
  'f1-score': 0.2678185745140389,
  'support': 762},
 'no-clickbait': {'precision': 0.7220043572984749,
  'recall': 0.9764289923394225,
  'f1-score': 0.8301603206412825,
  'support': 1697},
 'accuracy': 0.7242781618544124,
 'macro avg': {'precision': 0.7390509591370423,
  'recall': 0.5695793255660367,
  'f1-score': 0.5489894475776607,
  'support': 2459},
 'weighted avg': {'precision': 0.7325692296864279,
  'recall': 0.7242781618544124,
  'f1-score': 0.6559006986205588,
  'support': 2459}}

In [16]:
def label_map(item):
    if item=='no-clickbait':
        return 1
    return 0

In [17]:
new_pred_val = list(map(label_map, pred_val))
new_pred_test = list(map(label_map, pred_test))
new_test_y = list(map(label_map, test_y))
new_train_y = list(map(label_map, train_y))
new_val_y = list(map(label_map, val_y))

In [27]:
f1_score(new_test_y, new_pred_test), f1_score(new_val_y, new_pred_val)

(0.889468660515431, 0.8301603206412825)

In [20]:
xgb_clf = XGBClassifier(n_estimators=150, max_depth=3, objective='binary:logistic')

In [21]:
xgb_clf.fit(train_vectors, new_train_y)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=150,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [22]:
test_pred_xgb = xgb_clf.predict(test_vectors)
val_pred_xgb = xgb_clf.predict(val_vectors)

In [23]:
classifiaction_report_test_xgb = classification_report(new_test_y, test_pred_xgb, target_names = ['non-clickbait', 'clickbait'], output_dict=True)
classifiaction_report_val_xgb = classification_report(new_val_y, val_pred_xgb, target_names = ['non-clickbait', 'clickbait'], output_dict=True)

In [24]:
classifiaction_report_test_xgb

{'non-clickbait': {'precision': 0.7043795620437956,
  'recall': 0.1731718259309107,
  'f1-score': 0.2779978393950306,
  'support': 4458},
 'clickbait': {'precision': 0.7931189313576921,
  'recall': 0.9775856105153926,
  'f1-score': 0.8757436787307883,
  'support': 14455},
 'accuracy': 0.7879765240839635,
 'macro avg': {'precision': 0.7487492467007438,
  'recall': 0.5753787182231517,
  'f1-score': 0.5768707590629094,
  'support': 18913},
 'weighted avg': {'precision': 0.7722020959322519,
  'recall': 0.7879765240839635,
  'f1-score': 0.7348484769246862,
  'support': 18913}}

In [25]:
classifiaction_report_val_xgb

{'non-clickbait': {'precision': 0.7345132743362832,
  'recall': 0.1089238845144357,
  'f1-score': 0.18971428571428572,
  'support': 762},
 'clickbait': {'precision': 0.7105711849957375,
  'recall': 0.9823217442545669,
  'f1-score': 0.8246351719020529,
  'support': 1697},
 'accuracy': 0.7116714111427409,
 'macro avg': {'precision': 0.7225422296660103,
  'recall': 0.5456228143845013,
  'f1-score': 0.5071747288081694,
  'support': 2459},
 'weighted avg': {'precision': 0.7179904091020798,
  'recall': 0.7116714111427409,
  'f1-score': 0.6278845760195485,
  'support': 2459}}

In [26]:
f1_score(new_test_y, test_pred_xgb), f1_score(new_val_y, val_pred_xgb)

(0.8757436787307883, 0.8246351719020529)