In [1]:
import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
print(f'Shape of train : {train.shape}')
print(f'Shape of test : {test.shape}')

Shape of train : (11982, 2)
Shape of test : (2996, 2)


In [3]:
#Label target values
label = {
    'Not Clickbait' : 0,
    'Clickbait' : 1
}

train['Label'] = train['Label'].replace(label)
test['Label'] = test['Label'].replace(label)

Text Preprocessing

In [4]:
import string

def preprocess(text):
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    return text

train['Headline'] = train['Headline'].apply(lambda x : preprocess(x))
test['Headline'] = test['Headline'].apply(lambda x : preprocess(x))

In [5]:
text = pd.concat([train['Headline'], test['Headline']]).reset_index(drop=True)
text.shape

(14978,)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectorizer.fit(text)
X_train = vectorizer.transform(train['Headline'])
X_test = vectorizer.transform(test['Headline'])

In [7]:
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score

models = {
    'svm' : SVC(C=100, gamma=1, probability=True),
    'bnb' : BernoulliNB(binarize=0.01),
}

def trainModel(model, X_train, y_train, X_test, y_test):
    modelScores = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        modelScores[name] = accuracy_score(y_test, pred)
    return modelScores
modelScores = trainModel(model=models, X_train=X_train, y_train=train['Label'], X_test=X_test, y_test=test['Label'])

scores = pd.DataFrame(modelScores.items(), columns=['model', 'score'])
scores

Unnamed: 0,model,score
0,svm,0.768024
1,bnb,0.757677
