# Objective
We want to build a model that accurately identifies questions with the same intent, in order to reduce duplicate answers and confusion among users. Accurately predicting duplicate questions and removing them will allow users to find high-quality answers to their questions, resulting in an improved experience for writers, seekers and readers.

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
# Ask TensorFlow for the name of the GPU device it can see. Anything other than
# '/device:GPU:0' is treated as "no GPU" and aborts the notebook before any training starts.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


# Prep Data

In [None]:
import numpy as np
import pandas as pd
import nltk
# Download NLTK's 'popular' collection. Later cells call word_tokenize and stopwords.words,
# whose data files are presumably part of this collection (the captured log is truncated — confirm).
nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

True

In [None]:
# Read the Quora question-pair training CSV into a DataFrame.
# NOTE(review): the path sits under '/content/drive', so a mounted Drive is presumably required — confirm.
path = '/content/drive/My Drive/Data/quora_train.csv'
train_data = pd.read_csv(path)

In [None]:
# Select the two question-text columns by position (3 -> question1, 4 -> question2 in the
# frame previewed below) and display the first rows as a sanity check.
question_1 = train_data.iloc[:, 3]
question_2 = train_data.iloc[:, 4]
train_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [None]:
# create vocab list for validity purpose

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

from functools import lru_cache

@lru_cache(maxsize = None)
def _english_stop_words():
  """Return NLTK's English stop-word list as a frozenset, loading it only once."""
  return frozenset(stopwords.words('english'))

def clean_question(questions):
  """Tokenise one question string and keep only the tokens worth counting.

  A token is kept when it is purely alphabetic, is not an English stop word
  (the comparison is case-sensitive, so capitalised forms such as 'What' are
  kept) and is longer than one character.

  Args:
    questions: a single question as a string.

  Returns:
    list of the surviving tokens, in their original order and case.
  """
  # The stop-word set is cached: this function runs once per question, and
  # rebuilding the set from the corpus on every call is pure repeated work.
  stop_words = _english_stop_words()
  return [t for t in word_tokenize(questions)
          if t.isalpha() and t not in stop_words and len(t) > 1]

def add_to_vocab(questions, vocab):
  """Clean one question with `clean_question` and add its tokens to `vocab` in place.

  Args:
    questions: a single question as a string.
    vocab: a Counter-like object exposing `update`; it is mutated.
  """
  vocab.update(clean_question(questions))

def process_question(questions, vocab):
  """Feed every question in `questions` into the `vocab` counter.

  Each item is converted with `str` first, so non-string entries (e.g. NaN
  from missing cells) are counted through their string form.
  """
  for text in map(str, questions):
    add_to_vocab(text, vocab)

def save_list(tokens, filepath):
  """Write `tokens` to `filepath`, one token per line (no trailing newline).

  Args:
    tokens: iterable of strings.
    filepath: destination path; an existing file is overwritten.
  """
  # `with` guarantees the handle is closed (and the data flushed) even if write() raises.
  with open(filepath, 'w') as file:
    file.write('\n'.join(tokens))

# Pool both question columns into one list and count every cleaned token across all of them.
question_1 = train_data.iloc[:, 3]
question_2 = train_data.iloc[:, 4]
questions = list(question_1) + list(question_2)
vocab = Counter()
process_question(questions, vocab)
print(len(vocab))

# remove low-occurrence words
# NOTE(review): the comparison is strict, so tokens seen exactly `min_occurence` (2) times are
# dropped too and only tokens seen 3+ times survive — confirm this is the intended threshold.
min_occurence = 2
tokens = [k for k,c in vocab.items() if c > min_occurence]

# save vocab (save_list writes one token per line)
path = '/content/drive/My Drive/Data/'
file_name = 'quora_question_vocab.txt'
save_list(tokens, path+file_name)

# Train Embedding Layer

In [None]:
# create some functions to use

def load_doc(filepath):
  """Return the entire text content of the file at `filepath`.

  Raises:
    OSError: if the file cannot be opened or read.
  """
  # Context manager closes the handle even when read() fails.
  with open(filepath, 'r') as file:
    return file.read()

def clean_question(question, vocab):
  """Reduce one question to its in-vocabulary words.

  Args:
    question: the question text.
    vocab: container of allowed tokens (membership is case-sensitive).

  Returns:
    the alphabetic tokens found in `vocab`, joined by single spaces.
  """
  kept = (tok for tok in word_tokenize(question) if tok.isalpha() and tok in vocab)
  return ' '.join(kept)

def process_question(question,vocab):
  """Clean every entry of `question` and return the cleaned strings as a list.

  Entries are passed through `str` before cleaning, so non-string values
  (such as NaN) are handled via their string form.
  """
  return [clean_question(str(raw), vocab) for raw in question]

In [None]:
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# load vocab: read the saved vocabulary file back and turn it into a set for fast membership tests
vocab_filename = 'quora_question_vocab.txt'
path = '/content/drive/My Drive/Data/'
vocab = load_doc(path+vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

# load training dataset: each question becomes a space-joined string of its in-vocabulary tokens
q1_cleaned = process_question(question_1, vocab)
q2_cleaned = process_question(question_2, vocab)

# train/test split: positional (unshuffled) — the first 60% of rows train, the remaining 40% validate
VALIDATION_SPLIT = 0.4
idx_split = round(len(q1_cleaned) * (1-VALIDATION_SPLIT))
q1_split_train = q1_cleaned[:idx_split]
q2_split_train = q2_cleaned[:idx_split]
train_questions = q1_split_train + q2_split_train
q1_split_test = q1_cleaned[idx_split:]
q2_split_test = q2_cleaned[idx_split:]

# create tokenizer: fitted on the training questions only; max_length is the longest
# training question measured in whitespace-separated words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_questions)
max_length = max([len(s.split()) for s in train_questions])

# encode docs as sequences of integer word ids
encoded_doc_q1 = tokenizer.texts_to_sequences(q1_split_train)
encoded_doc_q2 = tokenizer.texts_to_sequences(q2_split_train)
# pad sequences at the end so every row has exactly max_length entries
q1_train = pad_sequences(encoded_doc_q1, maxlen = max_length, padding = 'post')
q2_train = pad_sequences(encoded_doc_q2, maxlen = max_length, padding = 'post')

# test dataset: encoded with the tokenizer and max_length derived from the training rows
encoded_q1_test = tokenizer.texts_to_sequences(q1_split_test)
encoded_q2_test = tokenizer.texts_to_sequences(q2_split_test)
q1_test = pad_sequences(encoded_q1_test, maxlen = max_length, padding = 'post')
q2_test = pad_sequences(encoded_q2_test, maxlen = max_length, padding = 'post')

# define target variable, split at the same row index as the questions
target = np.array(train_data['is_duplicate'])
target_train = target[:idx_split]
target_test = target[idx_split:]

# Build Model with Neural Network

## With Embedding and CNN

In [None]:
from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, Embedding, Conv1D, MaxPooling1D, Input, dot, Concatenate
import keras.backend as K

# Reset Keras' global state so layers from earlier runs of this cell do not pile up.
K.clear_session()

# define vocab size: one slot per tokenizer word plus one extra for index 0,
# which is the value pad_sequences fills with
vocab_size = len(tokenizer.word_index) + 1

# Question-1 tower: trainable 150-d embedding followed by two Conv1D + MaxPooling1D stages.
q1_input = Input(shape=(max_length,))
q1_embedding = Embedding(vocab_size, 150, input_length = max_length)(q1_input)
x = Conv1D(filters = 128, kernel_size = 5, activation = 'relu')(q1_embedding)
x = MaxPooling1D(pool_size = 4)(x)
x = Conv1D(filters = 128, kernel_size = 5, activation = 'relu')(x)
x = MaxPooling1D(pool_size = 4)(x)
q1_vector = Flatten()(x)

# Question-2 tower: same architecture, but with its own (unshared) layers and weights.
q2_input = Input(shape = (max_length,))
q2_embedding = Embedding(vocab_size, 150, input_length = max_length)(q2_input)
x = Conv1D(filters = 128, kernel_size = 5, activation = 'relu')(q2_embedding)
x = MaxPooling1D(pool_size = 4)(x)
x = Conv1D(filters = 128, kernel_size = 5, activation = 'relu')(x)
x = MaxPooling1D(pool_size = 4)(x)
q2_vector = Flatten()(x)

# connect to model: concatenate both question vectors and classify with a small dense head
prod = Concatenate(axis = 1)([q1_vector, q2_vector])
# prod = dot([q1_vector, q2_vector], axes = 1) # alternative: similarity of the two vectors
x = Dense(128, activation = 'relu')(prod)
x = Dense(1, activation = 'sigmoid')(x)

model = Model(inputs = [q1_input, q2_input], outputs = x)
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
# NOTE(review): in the recorded run val_loss rises from epoch 2 onward while training loss keeps
# falling — consider early stopping or fewer epochs.
model.fit([q1_train, q2_train], target_train, epochs = 10, verbose = 2, batch_size = 64,
          validation_data = ([q1_test, q2_test], target_test))
# summary() prints the table itself and returns None, so wrapping it in print() adds a stray "None".
model.summary()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 242574 samples, validate on 161716 samples
Epoch 1/10
 - 60s - loss: 0.5125 - accuracy: 0.7488 - val_loss: 0.4793 - val_accuracy: 0.7699
Epoch 2/10
 - 53s - loss: 0.3734 - accuracy: 0.8291 - val_loss: 0.4811 - val_accuracy: 0.7775
Epoch 3/10
 - 53s - loss: 0.2290 - accuracy: 0.9015 - val_loss: 0.5667 - val_accuracy: 0.7705
Epoch 4/10
 - 53s - loss: 0.1430 - accuracy: 0.9412 - val_loss: 0.7044 - val_accuracy: 0.7640
Epoch 5/10
 - 53s - loss: 0.1011 - accuracy: 0.9600 - val_loss: 0.8792 - val_accuracy: 0.7622
Epoch 6/10
 - 53s - loss: 0.0772 - accuracy: 0.9704 - val_loss: 0.9648 - val_accuracy: 0.7640
Epoch 7/10
 - 53s - loss: 0.0604 - accuracy: 0.9766 - val_loss: 1.0670 - val_accuracy: 0.7674
Epoch 8/10
 - 52s - loss: 0.0503 - accuracy: 0.9810 - val_loss: 1.1707 - val_accuracy: 0.7687
Epoch 9/10
 - 52s - loss: 0.0431 - accuracy: 0.9839 - val_loss: 1.1986 - val_accuracy: 0.7697
Epoch 10/10
 - 52s - loss: 0.0377 - accuracy: 0.9860 - val_loss: 1.3349 - val_accuracy: 0.7589
Model: 

## With word2vec Embedding

### train and save embedding

In [None]:
# define some functions to create and save
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

def load_doc(filepath):
  """Read the file at `filepath` and return its whole content as one string.

  Raises:
    OSError: if the file cannot be opened or read.
  """
  # `with` releases the handle even if read() raises.
  with open(filepath, 'r') as file:
    return file.read()

def clean_question(question, vocab):
  """Tokenise one question and return its in-vocabulary words as a list.

  Args:
    question: the question; converted with `str` before tokenising.
    vocab: container of allowed tokens (membership is case-sensitive).

  Returns:
    list of alphabetic tokens that are members of `vocab`, in original order.
  """
  return [w for w in word_tokenize(str(question)) if w.isalpha() and w in vocab]

def process_question(question,vocab):
  """Apply `clean_question` to every entry of `question` (stringified first).

  Returns:
    list with one cleaned result per input entry, in input order.
  """
  return [clean_question(str(item), vocab) for item in question]

# load vocab: the saved token list becomes a set for membership tests
vocab_filename = 'quora_question_vocab.txt'
path = '/content/drive/My Drive/Data/'
vocab = load_doc(path+vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

# process training set: every question becomes a list of in-vocabulary tokens.
# All rows of train_data are used here; no train/validation split is applied in this cell.
question_1 = train_data.iloc[:, 3]
question_2 = train_data.iloc[:, 4]
q1_cleaned = process_question(question_1, vocab)
q2_cleaned = process_question(question_2, vocab)
sentences = q1_cleaned + q2_cleaned
print(f''' Total Training Sentences: {len(sentences)}''')

# train word2vec model (100-d vectors). This rebinds the global name `model`.
# NOTE(review): `size=` and `wv.vocab` look like the gensim 3.x spellings; newer gensim
# presumably needs `vector_size=` / `wv.key_to_index` — confirm against the installed version.
model = Word2Vec(sentences, size = 100, window = 10, workers = 15, min_count = 1)
words = list(model.wv.vocab)
print(f'''Total number of words: {len(words)}''')

# save model in the plain-text word2vec format (binary = False)
path = '/content/drive/My Drive/Data/'
filename = 'quora_w2v_embedding.txt'
model.wv.save_word2vec_format(path+filename, binary = False)

 Total Training Sentences: 808580
Total number of words: 44818


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


### build model 

In [None]:
# make some functions for model building
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding

def load_doc(filepath):
  """Return the text stored in the file at `filepath` as a single string.

  Raises:
    OSError: if the file cannot be opened or read.
  """
  # Using a context manager so the handle is closed on every exit path.
  with open(filepath, 'r') as file:
    return file.read()

def clean_question(question, vocab):
  """Return the in-vocabulary words of one question as a space-joined string.

  Args:
    question: the question; converted with `str` before tokenising.
    vocab: container of allowed tokens (membership is case-sensitive).
  """
  words = word_tokenize(str(question))
  return ' '.join(w for w in words if w.isalpha() and w in vocab)

def process_question(question,vocab):
  """Clean each entry of `question` (after `str` conversion) with `clean_question`.

  Returns:
    list of cleaned results, one per entry, in input order.
  """
  return [clean_question(str(entry), vocab) for entry in question]

def load_embedding(filepath):
  """Load a text-format word2vec file into a `{word: vector}` dictionary.

  The first line of the file (the header) is skipped. Every following
  non-empty line is split on whitespace: the first field is the word, the
  remaining fields are parsed as a float32 vector.

  Args:
    filepath: path of the embedding text file.

  Returns:
    dict mapping each word to a 1-D float32 numpy array.
  """
  embeddings = {}
  # Stream the file line by line instead of materialising it with readlines(),
  # and let `with` close the handle even if parsing fails.
  with open(filepath, 'r') as file:
    next(file, None)  # skip the header line; tolerate an empty file
    for line in file:
      parts = line.split()
      if not parts:
        continue  # a blank line would otherwise raise IndexError on parts[0]
      embeddings[parts[0]] = np.asarray(parts[1:], dtype = 'float32')
  return embeddings

def get_weight_matrix(embedding, vocab, dim = None):
  """Build the weight matrix for an Embedding layer from pretrained vectors.

  Args:
    embedding: dict mapping word -> vector (as returned by `load_embedding`).
    vocab: dict mapping word -> integer row index (e.g. `tokenizer.word_index`).
    dim: vector length. When None it is taken from the vectors in `embedding`,
      falling back to 100 (the previously hard-coded size) if `embedding` is empty.

  Returns:
    numpy array of shape (len(vocab) + 1, dim). Row i holds the vector of the
    word whose index is i; rows for words missing from `embedding`, and any
    index that no word maps to, stay all-zero.
  """
  if dim is None:
    dim = len(next(iter(embedding.values()))) if embedding else 100
  vocab_size = len(vocab) + 1
  weight_matrix = np.zeros((vocab_size, dim))
  for word, index in vocab.items():
    vector = embedding.get(word)
    # Unknown words used to assign None into the matrix (a NaN row or an error);
    # leaving the zero row in place keeps the matrix usable as layer weights.
    if vector is not None:
      weight_matrix[index] = vector
  return weight_matrix

Using TensorFlow backend.


In [None]:
# Spot-check the cleaning step on the first 20 entries of question_1; the cell's last
# expression is displayed as the output below.
temp = question_1[:20]
process_question(temp, vocab)

['What step step guide invest share market india',
 'What story Kohinoor Diamond',
 'How increase speed internet connection using VPN',
 'Why mentally lonely How solve',
 'Which one dissolve water quikly sugar salt methane carbon di oxide',
 'Astrology Capricorn Sun Cap moon cap rising say',
 'Should buy',
 'How good geologist',
 'When use instead',
 'Motorola company Can hack Charter',
 'Method find separation slits using',
 'How read find YouTube comments',
 'What make Physics easy learn',
 'What first sexual experience like',
 'What laws change status student visa green card US compare immigration laws Canada',
 'What would Trump presidency mean current international master students visa',
 'What manipulation mean',
 'Why girls want friends guy reject',
 'Why many Quora users posting questions readily answered Google',
 'Which best digital marketing institution banglore']

In [None]:
# load vocab: the saved token list becomes a set for membership tests
vocab_filename = 'quora_question_vocab.txt'
path = '/content/drive/My Drive/Data/'
vocab = load_doc(path+vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

# load training data: each question becomes a space-joined string of in-vocabulary tokens
question_1 = train_data.iloc[:, 3]
question_2 = train_data.iloc[:, 4]
q1_cleaned = process_question(question_1, vocab)
q2_cleaned = process_question(question_2, vocab)

# train test split: positional (unshuffled) — first 60% of rows train, last 40% validate
VALIDATION_SPLIT = 0.4
idx_split = round(len(q1_cleaned) * (1-VALIDATION_SPLIT))
q1_split_train = q1_cleaned[:idx_split]
q2_split_train = q2_cleaned[:idx_split]
sentences = q1_split_train + q2_split_train
q1_split_test = q1_cleaned[idx_split:]
q2_split_test = q2_cleaned[idx_split:]

# y_train holds the labels of ALL rows; target_train / target_test are the two slices
y_train = np.array(train_data['is_duplicate'])
target_train = y_train[:idx_split]
target_test = y_train[idx_split:]

# tokenize and sequence encode
# lower = False: the vocabulary file and the word2vec embedding keep the original case
# ('What', 'Kohinoor', ...). Tokenizer lower-cases by default, which would make
# word_index keys such as 'what' miss the pretrained vectors stored under 'What'.
tokenizer = Tokenizer(lower = False)
tokenizer.fit_on_texts(sentences)
encoded_q1 = tokenizer.texts_to_sequences(q1_split_train)
encoded_q2 = tokenizer.texts_to_sequences(q2_split_train)

# pad sequence: max_length is the longest training question in whitespace-separated words
max_length = max([len(s.split()) for s in sentences])
q1_train = pad_sequences(encoded_q1, maxlen = max_length, padding = 'post')
q2_train = pad_sequences(encoded_q2, maxlen = max_length, padding = 'post')

# do the same for test set, reusing the tokenizer and max_length fitted on the training rows
encoded_q1_test = tokenizer.texts_to_sequences(q1_split_test)
encoded_q2_test = tokenizer.texts_to_sequences(q2_split_test)
q1_test = pad_sequences(encoded_q1_test, maxlen = max_length, padding = 'post')
q2_test = pad_sequences(encoded_q2_test, maxlen = max_length, padding = 'post')

# define vocab size: one slot per tokenizer word plus one for the padding index 0
vocab_size = len(tokenizer.word_index) + 1

# load and define embedding layer: rows follow tokenizer.word_index, weights are frozen
embedding_name = 'quora_w2v_embedding.txt'
raw_embedding = load_embedding(path+embedding_name)
embedding_vector = get_weight_matrix(raw_embedding, tokenizer.word_index)
embedding_layer = Embedding(vocab_size, 100, input_length = max_length, weights = [embedding_vector],
                            trainable =  False)

In [None]:
# Sanity check: the four padded input arrays should share the same second dimension (max_length).
print(q1_train.shape, q2_train.shape, q1_test.shape, q2_test.shape)

(242574, 113) (242574, 113) (161716, 113) (161716, 113)


In [None]:
# Sanity check: label counts should match the row counts of the train / test inputs.
print(target_train.shape, target_test.shape)

(242574,) (161716,)


In [None]:
from keras.models import Model
from keras.layers import Dense, Flatten, Conv1D, MaxPooling1D, Input, dot, Concatenate
import keras.backend as K

# build model
K.clear_session()

# The embedding is frozen, so a NaN row in its weight matrix (a word with no pretrained
# vector) would flow straight into the loss. Fail early with a clear message instead.
assert not np.isnan(embedding_vector).any(), 'embedding matrix contains NaN rows (words missing from the word2vec file)'

# Question-1 tower: shared pretrained embedding -> Conv1D -> MaxPooling1D -> flat vector.
q1_input = Input(shape = (max_length,))
q1_embedding = embedding_layer(q1_input)
x = Conv1D(filters = 128, kernel_size = 5, activation = 'relu')(q1_embedding)
x = MaxPooling1D(pool_size = 4)(x)
q1_vector = Flatten()(x)

# Question-2 tower: same embedding layer, separate convolution weights.
q2_input = Input(shape = (max_length,))
q2_embedding = embedding_layer(q2_input)
x = Conv1D(filters = 128, kernel_size = 5, activation = 'relu')(q2_embedding)
x = MaxPooling1D(pool_size = 4)(x)
q2_vector = Flatten()(x)

# Merge the two tower outputs. Concatenating q1_input / q2_input here would feed raw token ids
# to the dense head and leave the embedding and convolution layers disconnected from the output.
merged = Concatenate(axis = 1)([q1_vector, q2_vector])
prod = Dense(128, activation = 'relu')(merged)
prod = Dense(1, activation = 'sigmoid')(prod)

model = Model(inputs = [q1_input, q2_input], outputs = prod)
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.fit([q1_train, q2_train], target_train, epochs = 10, verbose = 2, batch_size = 64,
          validation_data = ([q1_test, q2_test], target_test))

Train on 242574 samples, validate on 161716 samples
Epoch 1/10
 - 13s - loss: 18.1667 - accuracy: 0.5966 - val_loss: 5.2069 - val_accuracy: 0.6137
Epoch 2/10
 - 13s - loss: 2.9318 - accuracy: 0.6099 - val_loss: 1.4218 - val_accuracy: 0.6355
Epoch 3/10
 - 13s - loss: 0.9006 - accuracy: 0.6178 - val_loss: 0.7179 - val_accuracy: 0.6724
Epoch 4/10
 - 13s - loss: 0.6385 - accuracy: 0.6507 - val_loss: 0.6249 - val_accuracy: 0.6696
Epoch 5/10
 - 13s - loss: 0.6210 - accuracy: 0.6572 - val_loss: 0.6192 - val_accuracy: 0.6613
Epoch 6/10
 - 13s - loss: 0.6202 - accuracy: 0.6546 - val_loss: 0.6178 - val_accuracy: 0.6650
Epoch 7/10
 - 13s - loss: 0.6200 - accuracy: 0.6529 - val_loss: 0.6230 - val_accuracy: 0.6358
Epoch 8/10
 - 13s - loss: 0.6177 - accuracy: 0.6528 - val_loss: 0.6188 - val_accuracy: 0.6666
Epoch 9/10
 - 13s - loss: 0.6187 - accuracy: 0.6577 - val_loss: 0.6178 - val_accuracy: 0.6612
Epoch 10/10
 - 14s - loss: 0.6179 - accuracy: 0.6568 - val_loss: 0.6224 - val_accuracy: 0.6621


<keras.callbacks.callbacks.History at 0x7f4002b88eb8>

# Build Model With Gradient Boosting


In [None]:
# clean questions
from nltk.tokenize import word_tokenize

def load_doc(filepath):
  """Return everything in the text file at `filepath` as one string.

  Raises:
    OSError: if the file cannot be opened or read.
  """
  # The context manager closes the file even when read() raises.
  with open(filepath, 'r') as file:
    return file.read()

def clean_question(question, vocab):
  """Keep only the alphabetic, in-vocabulary tokens of `question`.

  Args:
    question: the question text (must already be a string).
    vocab: container of allowed tokens (membership is case-sensitive).

  Returns:
    the surviving tokens joined by single spaces.
  """
  survivors = []
  for token in word_tokenize(question):
    if token.isalpha() and token in vocab:
      survivors.append(token)
  return ' '.join(survivors)

def process_question(question, vocab):
  """Stringify and clean every entry of `question`.

  Returns:
    list of cleaned strings, one per entry, in input order.
  """
  return [clean_question(str(q), vocab) for q in question]

In [None]:
# load vocab: the saved token list becomes a set for membership tests
vocab_filename = 'quora_question_vocab.txt'
path = '/content/drive/My Drive/Data/'
vocab = load_doc(path+vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

# process question: relies on question_1 / question_2 defined by earlier cells;
# each question becomes a space-joined string of its in-vocabulary tokens
q1_cleaned = process_question(question_1, vocab)
q2_cleaned = process_question(question_2, vocab)

In [None]:
# Preview the first ten cleaned questions.
q1_cleaned[:10]

['What step step guide invest share market india',
 'What story Kohinoor Diamond',
 'How increase speed internet connection using VPN',
 'Why mentally lonely How solve',
 'Which one dissolve water quikly sugar salt methane carbon di oxide',
 'Astrology Capricorn Sun Cap moon cap rising say',
 'Should buy',
 'How good geologist',
 'When use instead',
 'Motorola company Can hack Charter']

## Bag of words + XGB

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import scipy as sp

# Fit one bag-of-words vocabulary on both question columns so q1 and q2 share column indices.
c_vect = CountVectorizer()
c_vect.fit(q1_cleaned + q2_cleaned)
train_q1_trans = c_vect.transform(q1_cleaned)
train_q2_trans = c_vect.transform(q2_cleaned)

# One row per question pair: q1 counts placed side by side with q2 counts.
X = sp.sparse.hstack((train_q1_trans, train_q2_trans))
y = train_data['is_duplicate'].values

# Fixed seed so the 60/40 split, and every score computed from it, is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.4, random_state = 42)

In [None]:
import hyperopt
from hyperopt import fmin, STATUS_OK, Trials, hp, tpe

# Hyperparameter tuning
def objective(space):
  """Hyperopt objective: fit an XGBClassifier with the sampled parameters and score it.

  Reads the notebook globals X_train, y_train, X_test and y_test.

  Args:
    space: dict of sampled hyper-parameters.

  Returns:
    dict with 'loss' (negative F1 on X_test, since hyperopt minimises) and 'status'.
  """
  tuned = ('learning_rate', 'colsample_bytree', 'max_depth', 'gamma', 'subsample', 'n_estimators')
  sampled = {name: space[name] for name in tuned}
  classifier = XGBClassifier(objective = 'binary:logistic', **sampled)
  classifier.fit(X_train, y_train)
  f1 = f1_score(y_test, classifier.predict(X_test))
  return {'loss': -f1, 'status': STATUS_OK}

# define search space
def optimize(evals, trials, algo, random_state = 42):
  """Run a hyperopt search over the XGBoost parameters and return the best setting.

  Args:
    evals: maximum number of objective evaluations.
    trials: hyperopt `Trials` object that records every evaluation.
    algo: hyperopt search algorithm (e.g. `tpe.suggest`).
    random_state: kept for interface compatibility; currently unused.
      NOTE(review): wiring it to fmin's `rstate` needs a RandomState or a Generator
      depending on the installed hyperopt version — confirm before enabling.

  Returns:
    dict of the best hyper-parameters with their actual values.
  """
  space = {
      'learning_rate': hp.quniform('learning_rate',0.01, 0.1, 0.01),
      'colsample_bytree': hp.quniform('colsample_bytree',0.5, 1, 0.1),
      'max_depth': hp.choice('max_depth', np.arange(30, 70, 10, dtype = int)),
      'gamma': hp.uniform('gamma',0,3),
      'subsample': hp.quniform('subsample',0.3, 1, 0.1),
      'n_estimators': hp.choice('n_estimators', np.arange(80, 150, 10, dtype = int)),
      'objective' : 'binary:logistic'
  }
  # Honour the caller's `algo` instead of always using tpe.suggest.
  best = fmin(fn = objective, space = space, algo = algo, max_evals = evals, trials = trials)
  # fmin reports hp.choice parameters as the INDEX of the chosen option (e.g. max_depth: 3),
  # not the option itself; space_eval maps the result back onto real parameter values.
  return hyperopt.space_eval(space, best)


# Run the search for 10 evaluations, recording every trial, and print what optimize() returns.
trials = Trials()
best_params = optimize(evals = 10,
                       trials = trials,
                        algo = tpe.suggest)
print(best_params)

100%|██████████| 10/10 [1:08:34<00:00, 411.41s/it, best loss: -0.6155718501840752]
{'colsample_bytree': 1.0, 'gamma': 2.4475221224301578, 'learning_rate': 0.06, 'max_depth': 3, 'n_estimators': 6, 'subsample': 0.6000000000000001}


In [None]:
# Final bag-of-words model: colsample_bytree, gamma, learning_rate and subsample come from the
# search result, while max_depth and n_estimators are hard-coded rather than read from best_params.
model = XGBClassifier(colsample_bytree = best_params['colsample_bytree'], gamma = best_params['gamma'], learning_rate = best_params['learning_rate'], 
                       max_depth = 50, n_estimators = 80, subsample = best_params['subsample'])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Accuracy on the training and held-out rows, then a per-class report on the held-out rows.
print('training score:', accuracy_score(y_train, model.predict(X_train)))
print('test score:', accuracy_score(y_test, model.predict(X_test)))
print(classification_report(y_test, y_pred))

training score: 0.7802649913016234
test score: 0.7525971456132974
              precision    recall  f1-score   support

           0       0.74      0.93      0.83    101973
           1       0.79      0.45      0.58     59743

    accuracy                           0.75    161716
   macro avg       0.77      0.69      0.70    161716
weighted avg       0.76      0.75      0.73    161716



## TF-IDF + XGB

In [None]:
# Fit one TF-IDF vocabulary on both question columns so q1 and q2 share column indices.
t_vect = TfidfVectorizer()
t_vect.fit(q1_cleaned+q2_cleaned)
q1_trans = t_vect.transform(q1_cleaned)
q2_trans = t_vect.transform(q2_cleaned)

# One row per question pair: q1 weights placed side by side with q2 weights.
X = sp.sparse.hstack((q1_trans, q2_trans))
y = train_data['is_duplicate'].values

# Fixed seed so the 60/40 split, and every score computed from it, is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.4, random_state = 42)

In [None]:
# Hyperparameter tuning

def objective(space):
  """Hyperopt objective: train an XGBClassifier on the sampled parameters and score it.

  Reads the notebook globals X_train, y_train, X_test and y_test.

  Args:
    space: dict of sampled hyper-parameters.

  Returns:
    dict with 'loss' (negative F1 on X_test, since hyperopt minimises) and 'status'.
  """
  wanted = ('learning_rate', 'colsample_bytree', 'max_depth', 'gamma', 'subsample', 'n_estimators')
  settings = {key: space[key] for key in wanted}
  booster = XGBClassifier(objective = 'binary:logistic', **settings)
  booster.fit(X_train, y_train)
  f1 = f1_score(y_test, booster.predict(X_test))
  return {'loss': -f1, 'status': STATUS_OK}

# define search space
def optimize(evals, trials, algo, random_state = 42):
  """Search the XGBoost hyper-parameter space with hyperopt and return the best setting.

  Args:
    evals: maximum number of objective evaluations.
    trials: hyperopt `Trials` object that records every evaluation.
    algo: hyperopt search algorithm (e.g. `tpe.suggest`).
    random_state: kept for interface compatibility; currently unused.
      NOTE(review): wiring it to fmin's `rstate` needs a RandomState or a Generator
      depending on the installed hyperopt version — confirm before enabling.

  Returns:
    dict of the best hyper-parameters with their actual values.
  """
  space = {
      'learning_rate': hp.quniform('learning_rate',0.01, 0.1, 0.01),
      'colsample_bytree': hp.quniform('colsample_bytree',0.5, 1, 0.1),
      'max_depth': hp.choice('max_depth', np.arange(30, 70, 10, dtype = int)),
      'gamma': hp.uniform('gamma',0,3),
      'subsample': hp.quniform('subsample',0.3, 1, 0.1),
      'n_estimators': hp.choice('n_estimators', np.arange(80, 150, 10, dtype = int)),
      'objective' : 'binary:logistic'
  }
  # Use the algorithm the caller passed in rather than a hard-coded tpe.suggest.
  best = fmin(fn = objective, space = space, algo = algo, max_evals = evals, trials = trials)
  # For hp.choice parameters fmin returns the index of the selected option (e.g. n_estimators: 2),
  # so translate the result back into real values before handing it to the caller.
  return hyperopt.space_eval(space, best)


# Run the search for 10 evaluations on the TF-IDF features with a fresh Trials record,
# then print what optimize() returns.
trials = Trials()
best_params = optimize(evals = 10,
                       trials = trials,
                        algo = tpe.suggest)
print(best_params)

100%|██████████| 10/10 [1:29:22<00:00, 536.22s/it, best loss: -0.6453779209209896]
{'colsample_bytree': 0.8, 'gamma': 1.2937410154165065, 'learning_rate': 0.07, 'max_depth': 1, 'n_estimators': 2, 'subsample': 0.9}


In [None]:
# model
# model: every hyper-parameter below is hard-coded; nothing is read from best_params.
# NOTE(review): both learning_rate=0.1 and eta=0.3 are passed. `eta` is presumably XGBoost's
# alias for the learning rate, so the two values conflict — confirm which one is intended.
model = XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, 
                      reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Unlike the bag-of-words cell, the train/test scores printed here are F1, not accuracy.
print('training score:', f1_score(y_train, model.predict(X_train)))
print('test score:', f1_score(y_test, model.predict(X_test)))
print(classification_report(y_test, y_pred))


training score: 0.7339071712688809
test score: 0.6433254074452006
              precision    recall  f1-score   support

           0       0.78      0.90      0.83    101941
           1       0.76      0.56      0.64     59775

    accuracy                           0.77    161716
   macro avg       0.77      0.73      0.74    161716
weighted avg       0.77      0.77      0.76    161716

