In [189]:
# Importing required libraries
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from contractions import fix

# Reading train and test csv files
df_train = pd.read_csv("/content/drive/MyDrive/Train_Dataset.csv")
df_test = pd.read_csv('/content/drive/MyDrive/Test_Dataset.csv')
print(df_train.head())

                                            headline  is_sarcastic
0  supreme court votes 7-2 to legalize all worldl...             1
1  hungover man horrified to learn he made dozens...             1
2  emily's list founder: women are the 'problem s...             0
3      send your kids back to school with confidence             0
4          watch: experts talk pesticides and health             0


In [190]:
# Function for removing punctuation in headlines
def remove_punctuations(x:str):
    lst = list(x)
    lst = [i for i in lst if i not in string.punctuation]
    final = ''.join(lst)
    return final

In [191]:
# Removing punctuation marks
df_train['headline'] = df_train['headline'].apply(lambda x: remove_punctuations(x))
df_test['headline'] = df_test['headline'].apply(lambda x: remove_punctuations(x))

# Expanding abbreviated words
df_train['headline'] = df_train['headline'].apply(lambda x: fix(x))
df_test['headline'] = df_test['headline'].apply(lambda x: fix(x))

# x, y separation of inputs and outputs of train dataframe
df_train = df_train[["headline", "is_sarcastic"]]
x = np.array(df_train["headline"])
y = np.array(df_train["is_sarcastic"])

# Converting text collection to number of token matrices using cv.fit_transform
cv = CountVectorizer()
X = cv.fit_transform(x)

# Brute force approach to find the size that gives the maximum score
score = 0
should_size = 0
for i in range(1, 100, 1):
  a = i * 0.01
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=a, random_state=1)
  model = BernoulliNB()
  model.fit(X_train, y_train)
  new_score = model.score(X_test, y_test)
  if ( new_score > score):
    score = new_score
    should_size = a

print(should_size, 'is the size that gives the max score', score)

0.13 is the size that gives the max score 0.8988705473501303


In [192]:
# Separation of 13% to 87% of the data set as train and test
# Random_state has been added for debug processes if necessary
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=should_size, random_state=1)

# Creating and fitting a probabilistic Bernoulli Naive Bayes model
model = BernoulliNB()
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

# Read each line in the test data frame, 
# estimate the model and write the result to the res_list. 
res_list = []
for w in df_test['headline']:
  data = cv.transform([w]).toarray()
  res_list.append(model.predict(data))

# Creating data frame using result list and then exporting to csv file
res_df = pd.DataFrame(res_list, columns=["prediction"])
res_df.to_csv('submission.csv', index=False)

0.8988705473501303
