In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import re
import requests
import json

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
# Download the NLTK resources this notebook relies on: the 'punkt' tokenizer
# models (needed by word_tokenize) and the 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load the raw dataset: one headline per row plus its 0/1 clickbait label.
df = pd.read_csv("clickbait_data.csv")

In [None]:
# Check column dtypes and non-null counts of the raw data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000 entries, 0 to 31999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   headline   32000 non-null  object
 1   clickbait  32000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 500.1+ KB


In [None]:
# Class balance of the raw data (rows per label).
df["clickbait"].value_counts()

0    16001
1    15999
Name: clickbait, dtype: int64

In [None]:
# Eyeball ten random rows (no seed is set, so the selection changes on every run).
df.sample(10)

Unnamed: 0,headline,clickbait
17084,'Criminal in uniform': Senior London policeman...,0
19137,Johnson ousts Livingstone in London mayoral el...,0
28384,Serbia to apply to join the EU,0
19451,Venezuelan economy shrank by 5.8% in fourth qu...,0
25708,Obama Poll Sees Doubt on Budget and Health Care,0
7092,36 Crazy Gifts That Any Miyazaki Lover Will Go...,1
21426,'Guantanamo'-style detention facility under co...,0
13102,This New Adele Song Is Going To Slay Your Enti...,1
6319,"One Woman, Three Tim Burton Characters",1
14555,18 Confessions That Prove You're Not Alone In ...,1


In [None]:
# Download an additional English stop-word list (stopwords-iso) that is used
# together with NLTK's stop words when cleaning the headlines.
response = requests.get(
    'https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/stopwords-en.json',
    timeout=30,  # do not hang the notebook forever if the host is unreachable
)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
response.raise_for_status()
corpus_stopword_external = json.loads(response.text)


# Combined stop-word set, built on first use and then reused for every headline.
_STOPWORDS_CACHE = None


def _get_stopwords():
    """Return the combined NLTK + external stop-word set, building it only once."""
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        # stopwords.words() re-reads the corpus on every call; evaluating it once
        # (instead of once per token) and using a set makes the lookup cheap.
        # NOTE(review): without a language argument NLTK returns the stop words of
        # every language in the corpus, not only English. That behaviour is kept
        # here unchanged -- confirm whether stopwords.words('english') was intended.
        _STOPWORDS_CACHE = frozenset(stopwords.words()) | frozenset(corpus_stopword_external)
    return _STOPWORDS_CACHE


def clean_text(text):
    """Normalize a headline into a space-separated string of content words.

    Processing steps, in order:
      1. lowercase the text;
      2. drop every @mention and #hashtag;
      3. replace every non-alphabetic character with a space;
      4. collapse runs of whitespace into a single space;
      5. tokenize with NLTK's word_tokenize;
      6. remove tokens found in the NLTK or the external stop-word list.

    Stemming (PorterStemmer) is not applied.

    Args:
        text: The raw headline as a string.

    Returns:
        The remaining tokens joined by single spaces. The result is an empty
        string when every token is a stop word.
    """
    # Lowercase everything.
    text = text.lower()
    # Remove every word that starts with '#' or '@'.
    text = re.sub(r'(@\w+|#\w+)', '', text)
    # Replace all non-alphabetic characters (digits, punctuation, newlines, ...)
    # with a space.
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Collapse the whitespace runs created above. The previous pattern was
    # 's{2,}' (missing backslash), which replaced runs of the letter "s" --
    # e.g. inside "chess" -- instead of whitespace.
    text = re.sub(r'\s+', ' ', text)
    # Split the text into tokens.
    token_text = word_tokenize(text)

    stop_words = _get_stopwords()
    token_text_without_sw = [word for word in token_text if word not in stop_words]

    return ' '.join(token_text_without_sw)

In [None]:
# Derive the cleaned column by running every headline through clean_text.
df['clean_text'] = df['headline'].map(clean_text)

In [None]:
# Preview the cleaned headlines.
df['clean_text']

0                                                    bings
1                                     female friend belong
2                   star wars force awakens trailer chills
3              vine york celebrity brother fucking perfect
4        couple stunning photo shoot baby learning inop...
                               ...                        
31995                female hearts flutter iraq throw shoe
31996         british liberal democrat patsy calton cancer
31997    drone smartphone app heart attack victims remo...
31998    netanyahu urges pope benedict israel denounce ...
31999             makers prepare stake bigger claim phones
Name: clean_text, Length: 32000, dtype: object

In [None]:
# Persist the cleaned data. index=False keeps the RangeIndex out of the file, so
# reloading it no longer yields a spurious "Unnamed: 0" column.
df.to_csv("clickbait_data_clean.csv", index=False)

In [None]:
# Reload the cleaned data from disk.
# NOTE(review): after this round trip clean_text contains missing values (see the
# non-null count in the info() output below) -- presumably headlines that were
# reduced to an empty string, which read_csv parses as NaN. Confirm, and handle
# them (e.g. keep_default_na=False or fillna("")) before vectorizing the column.
df_clean = pd.read_csv("clickbait_data_clean.csv")

In [None]:
# Check dtypes and non-null counts of the reloaded, cleaned data.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000 entries, 0 to 31999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  32000 non-null  int64 
 1   headline    32000 non-null  object
 2   clickbait   32000 non-null  int64 
 3   clean_text  31924 non-null  object
dtypes: int64(2), object(2)
memory usage: 1000.1+ KB


In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/10/10 split: first hold out 20% of the rows, then cut that
# hold-out in half to obtain the validation and test sets. Stratifying on the
# label keeps the class ratio in every subset; the fixed seed makes the split
# reproducible.
df_train, df_temp = train_test_split(
    df_clean,
    test_size=0.2,
    stratify=df_clean['clickbait'],
    random_state=42,
)
df_valid, df_test = train_test_split(
    df_temp,
    test_size=0.5,
    stratify=df_temp['clickbait'],
    random_state=42,
)

In [None]:
# Class balance of the full cleaned dataset, for comparison with the splits.
df_clean["clickbait"].value_counts()

0    16001
1    15999
Name: clickbait, dtype: int64

In [None]:
# Size, dtypes and non-null counts of the training split.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25600 entries, 1557 to 7491
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  25600 non-null  int64 
 1   headline    25600 non-null  object
 2   clickbait   25600 non-null  int64 
 3   clean_text  25545 non-null  object
dtypes: int64(2), object(2)
memory usage: 1000.0+ KB


In [None]:
# Size, dtypes and non-null counts of the validation split.
df_valid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3200 entries, 11123 to 26341
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  3200 non-null   int64 
 1   headline    3200 non-null   object
 2   clickbait   3200 non-null   int64 
 3   clean_text  3184 non-null   object
dtypes: int64(2), object(2)
memory usage: 125.0+ KB


In [None]:
# Size, dtypes and non-null counts of the test split.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3200 entries, 3756 to 18064
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  3200 non-null   int64 
 1   headline    3200 non-null   object
 2   clickbait   3200 non-null   int64 
 3   clean_text  3195 non-null   object
dtypes: int64(2), object(2)
memory usage: 125.0+ KB


In [None]:
# Class balance of the training split.
df_train["clickbait"].value_counts()

0    12801
1    12799
Name: clickbait, dtype: int64

In [None]:
# Class balance of the validation split.
df_valid["clickbait"].value_counts()

1    1600
0    1600
Name: clickbait, dtype: int64

In [None]:
# Class balance of the test split.
df_test["clickbait"].value_counts()

1    1600
0    1600
Name: clickbait, dtype: int64

In [None]:
# Write each split to its own CSV file. to_csv is called with its defaults, so
# the row index is written as the leading, unnamed column.
# NOTE(review): pass index=False if that index is not needed downstream.
for split_frame, csv_path in (
    (df_train, "df_train.csv"),
    (df_valid, "df_valid.csv"),
    (df_test, "df_test.csv"),
):
    split_frame.to_csv(csv_path)