<a href="https://colab.research.google.com/github/dohyun1411/Quora-Insincere-Questions-Classification/blob/main/prj_quora.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###**Mount your Google drive**

In [1]:
from google.colab import drive
import os, re, io

drive.mount('/gdrive')
root = '/gdrive/My Drive/Colab Notebooks/Project_Quora'
os.environ['KAGGLE_CONFIG_DIR'] = "/gdrive/My Drive/Colab Notebooks/Project_Quora"
%cd $root

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/My Drive/Colab Notebooks/Project_Quora


In [2]:
#download from kaggle and unzip
def download_file():
  !kaggle competitions download -c quora-insincere-questions-classification
  !unzip \*.zip
#download_file()

###**Install and Import libraries**

In [3]:
from platform import python_version
print('python', python_version())

python 3.7.10


In [4]:
import operator
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import BinaryAccuracy
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Bidirectional, LSTM, Embedding, Dropout, Activation, Conv1D, GRU, GlobalMaxPool1D
from tensorflow.python.client import device_lib
from wordcloud import WordCloud
from gensim.models import KeyedVectors
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
def get_GPU():  
  local_device_protos = device_lib.list_local_devices()
  return [x.name for x in local_device_protos if x.device_type == 'GPU']
print(get_GPU())
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

['/device:GPU:0']
Num GPUs Available:  1


###**Pre-processing**

In [6]:
train = pd.read_csv(root + "/train.csv")
test = pd.read_csv(root + "/test.csv")
print("Train shape: {} and Test shape: {}".format(train.shape, test.shape))
percentage_insincere = round((train["target"].values == 1).sum() / (train.shape[0]) * 100, 2)
print("Percentage of insincere questions in the train dataset: {}% ".format(percentage_insincere))

Train shape: (1306122, 3) and Test shape: (375806, 2)
Percentage of insincere questions in the train dataset: 6.19% 


In [7]:
#Reduce sample data to save time while keeping the same ratio
def reduce_data(sample_size, train, test):
  train_sample_insincere = train.loc[train['target'] == 1].sample(int((sample_size / 100) * percentage_insincere))
  train_sample_sincere = train.loc[train['target'] == 0].sample(int((sample_size / 100) * (100 - percentage_insincere)))
  train = pd.concat([train_sample_insincere, train_sample_sincere], ignore_index = True)
  train = shuffle(train)
  test = test.sample(int(sample_size / 4))
  return train, test

sample_size = 300000
#train, test = reduce_data(sample_size, train, test)
print("Train shape: {} and Test shape: {}".format(train.shape, test.shape))
percentage_insincere = round((train["target"].values == 1).sum() / (train.shape[0]) * 100, 2)
print("Percentage of insincere questions in the train dataset: {}% ".format(percentage_insincere))

Train shape: (1306122, 3) and Test shape: (375806, 2)
Percentage of insincere questions in the train dataset: 6.19% 


In [8]:
contractions = {
  "aren't": "are not", "can't": "cannot", "couldn't": "could not", "could've": "could have", "didn't": "did not", "doesn't": "does not",
  "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
  "he'd": "he would", "he'll": "he will", "he's": "he is", "i'd": "I would",
  "i'd": "I had", "i'll": "I will", "i'm": "I am", "isn't": "is not", "would've": "would have",
  "it's": "it is", "it'll": "it will", "i've": "I have", "let's": "let us",
  "mightn't": "might not", "mayn't": "may not", "might've": "might have", "needn't": "need not",
  "mustn't": "must not", "shan't": "shall not", "she'd": "she would", "she'll": "she will",
  "she's": "she is", "shouldn't": "should not", "should've": "should have", "that's": "that is",
  "there's": "there is", "they'd": "they would", "they'll": "they will", "they're": "they are",
  "they've": "they have", "we'd": "we would", "we're": "we are", "weren't": "were not",
  "we've": "we have", "what'll": "what will", "what're": "what are", "what's": "what is",
  "what've": "what have", "where's": "where is", "who'd": "who would", "who'll": "who will",
  "who're": "who are", "who's": "who is", "who've": "who have", "who'll": "who will",
  "won't": "will not", "wouldn't": "would not", "you'd": "you would", "you'll": "you will",
  "you're": "you are", "you've": "you have", "wasn't": "was not", "we'll": " will",
  "didn't": "did not", "y'all": "you all", "y'all'd": "you all would", "y'all're": "you all are"
}

def data_cleaning(text):
  text = text.lower()
  text = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', '', text) # clean url
  text = re.sub(r'#(\w+)', '', text)   # clean hashtags
  text = re.sub(r'@(\w+)', '', text)   # clean @s
  text = re.sub(r'<[^>]+>', '', text)  # clean tags
  text = re.sub(r'\d+', '', text)      # clean digits
  text = re.sub(r'’', '\'', text)      # replace ’ with '
  text = re.sub(r's\'', '', text)      # clean s'
  text = re.sub(r'[£₹$€₩]', ' ', text) # clean currency symbols
  text = re.sub(r'[δ∫βωδσ∈∆≡απθ+*-=°^×√÷]', ' ', text) # clean math symbols
  text = re.sub(r'[/(),!@"“”?.%_&#:;><{}~\[\]|…]', ' ', text)   # clean punctuation
  text = [contractions[word] if word in contractions else word for word in text.split()]  # change contractions to full forms
  text = [WordNetLemmatizer().lemmatize(word) for word in text] # lemmatize
  text = " ".join(text)
  text = re.sub(r'\'s', '', text)      # clean 's
  text = re.sub(r'\'', '', text)       # clean '
  return text

In [9]:
train['cleaned_question_text'] = train['question_text'].apply(data_cleaning)
test['cleaned_question_text'] = test['question_text'].apply(data_cleaning)
total_sentences = pd.concat([train['cleaned_question_text'], test['cleaned_question_text']], ignore_index = True)
#train.loc[train['target'] == 1].sample(5)

In [10]:
def cloud(text, title, size = (10, 7)):
  words_list = text.unique().tolist()
  words = ' '.join(words_list)
  wordcloud = WordCloud(width = 800, height = 400, collocations = False).generate(words)
    
  # Output Visualization
  fig = plt.figure(figsize = size, dpi = 80, facecolor='k',edgecolor='k')
  plt.imshow(wordcloud,interpolation = 'bilinear')
  plt.axis('off')
  plt.title(title, fontsize = 25,color = 'w')
  plt.tight_layout(pad = 0)
  plt.show()
    
#cloud(train[train['target'] == 0]['question_text_cleaned'], 'Cleaned Sincere questions')
#cloud(train[train['target'] == 1]['question_text_cleaned'], 'Cleaned Insincere questions')

Reference: https://www.kaggle.com/theoviel/improve-your-score-with-some-text-preprocessing

In [11]:
def load_embeddings(file):
  def get_coefs(word, *arr): 
    return word, np.asarray(arr, dtype='float32')
    
  if file == "./GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin" :
    embeddings_index = KeyedVectors.load_word2vec_format(file, binary = True)
  elif file == "../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec" :
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file) if len(o) > 100)
  else:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file, encoding='latin'))
        
  return embeddings_index

paragram = "./paragram_300_sl999/paragram_300_sl999.txt"
glove = "./glove.840B.300d/glove.840B.300d.txt"
wiki_news = "./wiki-news-300d-1M/wiki-news-300d-1M.vec"
google_news = "./GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin"
embedding_choice = paragram

def pick_embedding(embedding):
  print("Extracting " + embedding + " embedding")
  return load_embeddings(embedding)

embedding_method = pick_embedding(embedding_choice)

Extracting ./paragram_300_sl999/paragram_300_sl999.txt embedding


In [12]:
def vocabulary_builder(corpus):
  vocabulary = {}
  for text in corpus:
    for word in text.split():
      try:
        vocabulary[word] += 1
      except KeyError:
        vocabulary[word] = 1
  return vocabulary

def check_coverage(vocab, embeddings_index):
  known_words = {}
  unknown_words = {}
  nb_known_words = 0
  nb_unknown_words = 0
  for word in vocab.keys():
    try:
      known_words[word] = embeddings_index[word]
      nb_known_words += vocab[word]
    except:
      unknown_words[word] = vocab[word]
      nb_unknown_words += vocab[word]
      pass

  print('Found embeddings for {:.2%} of vocabulary set'.format(len(known_words) / len(vocab)))
  print('Found embeddings for {:.2%} of all text'.format(nb_known_words / (nb_known_words + nb_unknown_words)))
  unknown_words = sorted(unknown_words.items(), key = operator.itemgetter(1))[::-1]
  return unknown_words

def get_word_index(vocabulary):
  word_index = dict((w, i + 1) for i, w in enumerate(vocabulary.keys()))
  return word_index

def oov_check():
  print(embedding_choice)
  check_coverage(vocabulary_set, embedding_method)
  
vocabulary_set = vocabulary_builder(total_sentences)
vocabulary_size = len(vocabulary_set) + 1
word_index = get_word_index(vocabulary_set)
oov_check()
max_length = 250
#max_length = max([len(x) for x in total_sentences]) ~700

./paragram_300_sl999/paragram_300_sl999.txt
Found embeddings for 69.46% of vocabulary set
Found embeddings for 99.46% of all text


In [13]:
train_set, val_set = train_test_split(train, test_size = 0.2, random_state = 11)
train_set_Y, val_set_Y = np.array(train_set['target']), np.array(val_set['target'])

def use_encode():
  def fit_one_hot(word_index, corpus):
    sent = []
    for text in corpus:
      my_list = []
      for word in text.split():
        try:
          my_list.append(word_index[word])
        except KeyError:
          my_list.append(0)
      sent.append(my_list)
    return sent
  
  training_sequence, validation_sequence, testing_sequence = train_set['cleaned_question_text'], val_set['cleaned_question_text'], test['cleaned_question_text']
  encode_train_set, encode_val_set, encode_test_set = fit_one_hot(word_index, training_sequence), fit_one_hot(word_index, validation_sequence), fit_one_hot(word_index, testing_sequence)

  x = pad_sequences(encode_train_set, maxlen = max_length, padding = 'post', truncating = 'post')
  y = pad_sequences(encode_val_set, maxlen = max_length, padding = 'post', truncating = 'post')
  z = pad_sequences(encode_test_set, maxlen = max_length, padding = 'post', truncating = 'post')
  
  return x, y, z

def use_tokenize():
  tokenizer = Tokenizer(num_words = vocabulary_size, oov_token = "<OOV>")
  tokenizer.fit_on_texts(total_sentences)

  training_sequences = tokenizer.texts_to_sequences(train_set['cleaned_question_text'])
  validation_sequences = tokenizer.texts_to_sequences(val_set['cleaned_question_text'])
  testing_sequences = tokenizer.texts_to_sequences(test['cleaned_question_text'])

  z = pad_sequences(training_sequences, maxlen = max_length, padding = 'post', truncating = 'post')
  y = pad_sequences(validation_sequences, maxlen = max_length, padding = 'post', truncating = 'post')
  z = pad_sequences(testing_sequences, maxlen = max_length, padding = 'post', truncating = 'post')

  return x, y, z

training_padded, validation_padded, testing_padded = use_encode()
#training_padded, validation_padded, testing_padded = use_tokenize()

In [14]:
def embedding_matrices(embed_type):
  count = 0
  embedding_matrix = np.zeros((vocabulary_size, 300))
  for word,i in word_index.items():
    try:
      vec = embed_type[word]
      embedding_matrix[i] = vec
    except KeyError:
      count += 1
      continue
  return embedding_matrix, count

embedding_matrix, count = embedding_matrices(embedding_method)
print("Number of Out Of Vocabulary - OOVs: ", count)

Number of Out Of Vocabulary - OOVs:  60359


In [15]:
BATCH_SIZE = 2048
EPOCHS = 10
inputs = Input(shape = (max_length,))
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience = 2)

class MyModel(tf.keras.Model):
  def __init__(self):
    super(MyModel, self).__init__()
    self.embed = Embedding(vocabulary_size, 300, weights = [embedding_matrix], trainable = False)
    self.rnn = Sequential([
        Bidirectional(LSTM(128, return_sequences = True)),
        Bidirectional(LSTM(128, return_sequences = True)),
        Conv1D(128, 3, activation = "relu"),
        GlobalMaxPool1D(),
        Dense(64, activation = "relu"),
        Dropout(0.5),
        Dense(32, activation = "relu"),
        Dropout(0.2),
        Dense(1, activation = "sigmoid")
    ])

  def call(self, inputs):
    x = self.embed(inputs)
    x = self.rnn(x)
    return x

model = MyModel()
model.compile(loss = 'binary_crossentropy', optimizer = 'Adam', metrics = ['binary_accuracy'])
model.fit(x = training_padded, y = train_set_Y, batch_size = BATCH_SIZE, epochs = EPOCHS, callbacks = callback, validation_data = (validation_padded, val_set_Y))
predicted = model.predict(testing_padded, batch_size = 512)
test['prediction'] = predicted

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [16]:
test["#"] = [0 if x < 0.5 else 1 for x in test["prediction"]]
percentage_insincere = round((test["#"].values == 1).sum() / (test.shape[0]) * 100, 2)
print("Percentage of insincere questions in the test dataset: {}% ".format(percentage_insincere))

Percentage of insincere questions in the test dataset: 4.27% 


In [17]:
submission = test.drop(["prediction", "cleaned_question_text"], axis=1)
submission.to_csv(root + "/submission.csv")

In [18]:
#check
subm = pd.read_csv(root + "/submission.csv")
subm[["question_text", "#"]].loc[subm["#"] == 1].sample(10)


Unnamed: 0,question_text,#
266920,Why do Indians stalk girls on Quora?,1
11570,Should I name my girl Pussy?,1
84321,Are there any liberals who actually call thems...,1
280696,Is it true all Quora moderators make minimum w...,1
363465,Why aren’t my African American sisters more up...,1
355675,Do Muslim refugees have a higher percentage of...,1
112695,Is China really bad as Chinese people think?,1
218631,People don't care about people lying or morals...,1
312581,Why are Liberals a bunch of cowards?,1
278769,"Given the religious bigotry, misogyny, scorn o...",1
