# visualize sarcasm headlines dataset

In [None]:
%matplotlib inline
import pandas as pd

# The raw dataset is stored as JSON Lines (one JSON record per line), hence lines=True.
dataset_path = './Sarcasm_Headlines_Dataset.json'
df = pd.read_json(dataset_path, lines=True)
# Optional CSV export of the raw frame:
#df.to_csv('./headlines.csv', index=False)

In [None]:
# Print a summary of the frame: columns, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Count rows that repeat an earlier row (all columns are compared).
df.duplicated().sum()

In [None]:
# Preview the first five rows.
df.head()

In [None]:
# Draw one histogram per numeric column.
df.hist()

In [None]:
# Class balance as a pie chart with percentage labels. value_counts() sorts
# descending, so the first (most frequent) slice is the one pulled out.
label_counts = df['is_sarcastic'].value_counts()
label_counts.plot.pie(autopct='%.2f%%', explode=[0.05, 0])

# 1 : sarcastic
# 0 : not sarcastic

In [None]:
# Optional: discard the link column before further processing.
#df = df.drop(['article_link'], axis=1)
# Remove repeated headline texts in place, keeping the first occurrence of each.
df.drop_duplicates(subset='headline', keep='first', inplace=True)

# pre-process dataset [clean, stem words, remove stopwords, etc.]

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from unidecode import unidecode
import collections
import re
import os
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Keep the untouched headlines for the "before cleaning" word-frequency plot.
origin = df['headline']

# Patterns are written as raw strings: in a plain literal '\s' is an invalid
# escape sequence that newer Python versions warn about.
non_letter_pattern = re.compile(r'[^a-zA-Z\s]')
whitespace_pattern = re.compile(r'\s+')

# Lower-case each headline, drop every character that is not an ASCII letter or
# whitespace, then collapse whitespace runs to single spaces and trim the ends.
df['headline'] = df['headline'].apply(
    lambda x: whitespace_pattern.sub(' ', non_letter_pattern.sub('', x.lower())).strip()
)

In [None]:
# Fetch the NLTK data used by the next cell: the English stop-word list and the
# tokenizer models behind word_tokenize. Recent NLTK releases load 'punkt_tab'
# instead of 'punkt', so both are requested to work across versions.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')


def _stem_headline(text):
    """Tokenize *text*, skip English stop words, and join the stemmed tokens with spaces."""
    stemmed_tokens = []
    for token in word_tokenize(text.lower()):
        if token in stop_words:
            continue
        stemmed_tokens.append(stemmer.stem(token))
    return ' '.join(stemmed_tokens)


df['headline'] = df['headline'].apply(_stem_headline)
# Display the processed column as the cell output.
df['headline']

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Count whitespace-separated tokens over the raw headlines and chart the ten
# most frequent ones.
words = [token for text in origin for token in text.split()]
word_count = collections.Counter(words)
top_words = dict(word_count.most_common(10))

bar_positions = range(len(top_words))
plt.figure(figsize=(10, 8))
plt.bar(bar_positions, list(top_words.values()), align='center')
plt.xticks(bar_positions, list(top_words.keys()))
plt.grid(alpha=0.5)
plt.title('ten most used words before cleaning', fontsize=18)
plt.xlabel('words')
plt.ylabel('count')
plt.show()

In [None]:
# Same chart as for the raw text, now computed over the processed headlines.
words = [token for text in df['headline'] for token in text.split()]
word_count = collections.Counter(words)
top_words = dict(word_count.most_common(10))

bar_positions = range(len(top_words))
plt.figure(figsize=(10, 8))
plt.bar(bar_positions, list(top_words.values()), align='center')
plt.xticks(bar_positions, list(top_words.keys()))
plt.grid(alpha=0.5)
plt.title('ten most used words after cleaning', fontsize=18)
plt.xlabel('words')
plt.ylabel('count')
plt.show()

In [None]:
# Persist the processed frame as JSON Lines (one record per line), the same
# layout the raw dataset was read with; load it back with
# pd.read_json(..., lines=True). The 'records' orient never writes the index.
# The previous index=False is not supported with the default 'columns' orient:
# depending on the pandas version it is rejected or ignored.
#df.to_csv('./headlines.csv', index=False)
df.to_json('./headlines.json', orient='records', lines=True)

# split data

In [None]:
from fast_ml.model_development import train_valid_test_split

# 80/10/10 split into train / validation / test with 'is_sarcastic' as the target.
# NOTE(review): no seed is passed, so the split presumably differs between runs — confirm.
(
    X_train, y_train,
    X_valid, y_valid,
    X_test, y_test,
) = train_valid_test_split(
    df,
    target='is_sarcastic',
    train_size=0.8,
    valid_size=0.1,
    test_size=0.1,
)

In [None]:
# Build the training frame from the headline text and its label.
# NOTE(review): X_train is presumably a frame holding every non-target column
# (e.g. article_link unless it was dropped earlier), so the headline column is
# selected explicitly; assigning a multi-column frame to one column fails.
df_train = pd.DataFrame().assign(headline=X_train['headline'], is_sarcastic=y_train)
df_train.head()

In [None]:
# Build the test frame from the headline text and its label.
# NOTE(review): X_test is presumably a frame holding every non-target column
# (e.g. article_link unless it was dropped earlier), so the headline column is
# selected explicitly; assigning a multi-column frame to one column fails.
df_test = pd.DataFrame().assign(headline=X_test['headline'], is_sarcastic=y_test)
df_test.head()

In [None]:
# Build the validation frame from the headline text and its label.
# NOTE(review): X_valid is presumably a frame holding every non-target column
# (e.g. article_link unless it was dropped earlier), so the headline column is
# selected explicitly; assigning a multi-column frame to one column fails.
df_valid = pd.DataFrame().assign(headline=X_valid['headline'], is_sarcastic=y_valid)
df_valid.head()

In [None]:
# Write each split to its own CSV file, leaving out the index column.
for csv_path, split_frame in (
    ('./headlines_train.csv', df_train),
    ('./headlines_test.csv', df_test),
    ('./headlines_valid.csv', df_valid),
):
    split_frame.to_csv(csv_path, index=False)