# Train Models
<div style="color:red; font-size:14px;">!! Don't define functions here, import them from utils.py</div>

This notebook contains the code needed to train and store models to disk.

Remember that if you use a function with a random state you have to fix it to a number so that the results are reproducible.

## Imports

In [None]:
import pandas as pd
import sklearn
from sklearn import *
import os
import pickle

from utils import *

In [None]:
# Resolve the dataset folder under the current user's home directory.
# os.path.expanduser('~') is used instead of os.environ['HOME'] because HOME is
# not guaranteed to be defined (e.g. on Windows), in which case the lookup
# raises KeyError before any data is loaded.
home_dir = os.path.expanduser('~')
path_folder_quora = os.path.join(home_dir, 'Datasets', 'QuoraQuestionPairs')

In [None]:
# Display the resolved dataset folder so a wrong location is spotted before loading.
path_folder_quora

In [None]:
# Load the raw Quora question-pair CSV files from the dataset folder.
train_df = pd.read_csv(os.path.join(path_folder_quora, 'quora_train_data.csv'))
test_df = pd.read_csv(os.path.join(path_folder_quora, 'quora_test_data.csv'))

# Hold out 5% of the labelled data as an internal test split, then 5% of the
# remainder as a validation split. Both splits use a fixed random_state so the
# partition is reproducible.
A_df, te_df = sklearn.model_selection.train_test_split(train_df,
                                                       test_size=0.05,
                                                       random_state=123)
tr_df, va_df = sklearn.model_selection.train_test_split(A_df,
                                                        test_size=0.05,
                                                        random_state=123)

# Separate the target column ('is_duplicate') from the features of each split.
y_tr = tr_df['is_duplicate'].values
X_tr_df = tr_df.drop(['is_duplicate'], axis=1)

y_va = va_df['is_duplicate'].values
X_va_df = va_df.drop(['is_duplicate'], axis=1)

y_te = te_df['is_duplicate'].values
X_te_df = te_df.drop(['is_duplicate'], axis=1)

# Report the shape of every split. Each label array is printed under its own
# name (previously y_tr.shape was printed three times under different labels).
print('X_tr_df.shape=', X_tr_df.shape)
print('y_tr.shape=', y_tr.shape)
print('X_va.shape=', X_va_df.shape)
print('y_va.shape=', y_va.shape)
print('X_te.shape=', X_te_df.shape)
print('y_te.shape=', y_te.shape)

## Explore data

In [None]:
# Preview the first five rows of the raw training data.
train_df.head(n=5)

## Create question database

In [None]:
# Bring both sides of every pair to a common (qid, question) layout.
qid1 = train_df[['qid1', 'question1']].rename(columns={'qid1': 'qid', 'question1': 'question'})
qid2 = train_df[['qid2', 'question2']].rename(columns={'qid2': 'qid', 'question2': 'question'})

# Stack both sides, keep the first occurrence of each qid, order by qid and
# renumber the rows from zero.
qid_df = (
    pd.concat([qid1, qid2])
    .drop_duplicates(subset=['qid'])
    .sort_values(by=['qid'])
    .reset_index(drop=True)
)

In [None]:
# Preview the first five entries of the question database.
qid_df.head(n=5)

## Simple Solution

In [None]:
# Corpus = every question1 of the training split followed by every question2.
all_q1, all_q2 = (list(X_tr_df[column]) for column in ("question1", "question2"))
all_questions = [*all_q1, *all_q2]

# Number of documents in the corpus (shown as the cell output).
len(all_questions)

In [None]:
# Cast lists as strings
# cast_list_as_strings is imported from utils.py; presumably it turns every
# corpus entry (e.g. missing questions read as NaN) into a str — confirm in utils.
all_questions = cast_list_as_strings(all_questions)

In [None]:
# Fit a unigram CountVectorizer on the training corpus only.
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1, 1)).fit(all_questions)

# Build the feature matrix of each split with the fitted vectorizer.
X_tr_q1q2, X_va_q1q2, X_te_q1q2 = (
    get_features_from_df(split_frame, count_vectorizer)
    for split_frame in (X_tr_df, X_va_df, X_te_df)
)

# Show feature-matrix shapes next to the shapes of the frames they come from.
(
    X_tr_q1q2.shape,
    tr_df.shape,
    X_va_q1q2.shape,
    va_df.shape,
    te_df.shape,
    X_te_q1q2.shape,
)

### Train model

In [None]:
# Logistic regression baseline; the seed is fixed for reproducibility.
lr_params = {"solver": "liblinear", "random_state": 123}
lr_model = sklearn.linear_model.LogisticRegression(**lr_params)

# Fit on the training features; the fitted estimator is the cell output.
lr_model.fit(X=X_tr_q1q2, y=y_tr)

### Save model

In [None]:
# Persist the simple-solution model together with the feature matrices, labels
# and question database needed to evaluate it later.
# The artifacts are (re)written on every run: the former "only save when the
# folder is missing" guard silently skipped saving after a re-train, leaving
# stale files on disk, and never completed a save that had been interrupted.
simple_solution_dir = os.path.join("model_artifacts", "simple_solution")
os.makedirs(simple_solution_dir, exist_ok=True)

# File name -> object to pickle.
simple_solution_artifacts = {
    'lr_model.pkl': lr_model,
    'X_tr_q1q2.pkl': X_tr_q1q2,
    'y_tr.pkl': y_tr,
    'X_va_q1q2.pkl': X_va_q1q2,
    'y_va.pkl': y_va,
    'X_te_q1q2.pkl': X_te_q1q2,
    'y_te.pkl': y_te,
    'qid_df.pkl': qid_df,
}
for artifact_name, artifact in simple_solution_artifacts.items():
    with open(os.path.join(simple_solution_dir, artifact_name), 'wb') as file:
        pickle.dump(artifact, file)

## Improvement proposals

In [None]:
# Work on an independent copy of the question database so that the
# preprocessing below leaves the original qid_df untouched.
qid_df_preprocess = qid_df.copy(deep=True)

<div class="alert" style="padding: 20px;background-color: #2cbc84; color: white; margin-bottom: 15px;">
Baseline solution
</div>

#### Text preprocessing
- Contractions and abbreviations: normalize_text
- Remove punctuation: remove_punctuation
- Spellchecker
- Remove stopwords: remove_stopwords
- Remove accents: remove_accents
- Special tokens: special_tokens
- Normalize spaces: normalize_spaces

##### Cast list as strings

In [None]:
# Pull the question column out as a plain list, cast it with the utils helper
# and write the result back into the same column.
raw_questions = list(qid_df_preprocess["question"])
qid_df_preprocess['question'] = cast_list_as_strings(raw_questions)

In [None]:
# Preview the question database after the string cast.
qid_df_preprocess.head(n=5)

##### Text Normalization

In [None]:
# Dictionary of common contractions and their expanded form
# NOTE(review): several keys are capitalised ("I'd", "I'll", "I'm", "I've");
# whether they ever match depends on how normalize_text handles case — confirm in utils.
contractions_dict = {
    "ain't": "are not",
    "can't": "cannot",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "I'd": "I would",
    "I'll": "I will",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it's": "it is",
    "let's": "let us",
    "might've": "might have",
    "must've": "must have",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have"
}

# Dictionary of common abbreviations and their full form
abbreviations_dict = {
    "aka": "also known as",
    "asap": "as soon as possible",
    "btw": "by the way",
    "etc": "et cetera",
    "e.g.": "for example",
    "i.e.": "that is",
    "lol": "laugh out loud",
    "omg": "oh my god",
    "thx": "thanks",
    "wtf": "what the fuck"
}

# Apply normalize_text (from utils.py) to every question, passing both lookup
# tables; the question column is overwritten with the result.
qid_df_preprocess['question'] = qid_df_preprocess['question'].apply(lambda x: normalize_text(x, contractions_dict, abbreviations_dict))

In [None]:
# Preview the questions after text normalization.
qid_df_preprocess.head(n=5)

##### Remove punctuation

In [None]:
# Run remove_punctuation (from utils.py) over every question and overwrite the column.
qid_df_preprocess['question'] = qid_df_preprocess['question'].apply(remove_punctuation)

In [None]:
# Preview the questions after punctuation removal.
qid_df_preprocess.head(n=5)

##### Remove stopwords

In [None]:
# Hand-picked stop-word set handed to remove_stopwords below.
stop_words = {
    'the', 'and', 'to', 'in', 'of', 'that', 'is', 'it', 'for',
    'on', 'this', 'you', 'be', 'are', 'or', 'from', 'at', 'by', 'we',
    'an', 'not', 'have', 'has', 'but', 'as', 'if', 'so', 'they', 'their',
    'was', 'were', 'some', 'there', 'these', 'those', 'than', 'then', 'been', 'also',
    'much', 'many', 'other',
}

# Overwrite every question with the output of remove_stopwords (from utils.py).
qid_df_preprocess['question'] = qid_df_preprocess['question'].apply(
    lambda question: remove_stopwords(question, stop_words)
)

In [None]:
# Preview the questions after stop-word removal.
qid_df_preprocess.head(n=5)

##### Remove accents

In [None]:
# Run remove_accents (from utils.py) over every question and overwrite the column.
qid_df_preprocess['question'] = qid_df_preprocess['question'].apply(remove_accents)

In [None]:
# Preview the questions after accent removal.
qid_df_preprocess.head(n=5)

##### Special tokens

In [None]:
from collections import Counter, defaultdict

# Token frequencies over the corpus held in all_questions, stored in a
# defaultdict so that unseen words count as 0.
# NOTE(review): all_questions was built from the un-preprocessed training
# questions, while the counts are applied to preprocessed text — confirm that
# tokenize_text yields comparable tokens.
token_stream = (word for sentence in all_questions for word in tokenize_text(sentence))
word_counts = defaultdict(int, Counter(token_stream))

# Words that appear exactly once in the corpus.
word_counts_one = {token: freq for token, freq in word_counts.items() if freq == 1}

# Overwrite every question with the output of special_tokens (from utils.py).
qid_df_preprocess['question'] = qid_df_preprocess['question'].apply(
    lambda question: special_tokens(question, word_counts_one)
)

In [None]:
# Preview the questions after the special-token step.
qid_df_preprocess.head(n=5)

##### Normalize spaces

In [None]:
# Run normalize_spaces (from utils.py) over every question and overwrite the column.
qid_df_preprocess['question'] = qid_df_preprocess['question'].apply(normalize_spaces)

In [None]:
# Preview the fully preprocessed questions.
qid_df_preprocess.head(n=5)

##### Save preprocessed question

In [None]:
# Make sure the artifact folder exists, then pickle the preprocessed question database.
os.makedirs("model_artifacts", exist_ok=True)
with open('model_artifacts/qid_df.pkl', 'wb') as file:
    pickle.dump(qid_df_preprocess, file)

#### Feature engineering

We start by replacing the questions in our training, validation and test sets with their preprocessed versions.

In [None]:
# For every split: look up the preprocessed text of both questions by qid,
# overwrite question1/question2 with it, and drop the pairs in which either
# question is empty. The label array is filtered with the same mask so that
# features and labels stay aligned.
# Each filtered frame is materialised with .copy(): a bare boolean-mask slice
# makes the column assignments in the feature cells raise SettingWithCopyWarning.

# ----------------------- TRAINING SET -----------------------
X_tr_df = (
    X_tr_df
    .merge(qid_df_preprocess, left_on='qid1', right_on='qid', how='left')
    .drop(columns=['qid'])
    .merge(qid_df_preprocess, left_on='qid2', right_on='qid', how='left', suffixes=['_1', '_2'])
    .drop(columns=['qid'])
)

# replace the values in the question1 and question2 columns
X_tr_df['question1'] = X_tr_df['question_1']
X_tr_df['question2'] = X_tr_df['question_2']

# drop the additional question_1 and question_2 columns
X_tr_df = X_tr_df.drop(columns=['question_1', 'question_2'])

# remove empty questions
mask = (X_tr_df['question1'].str.len() > 0) & (X_tr_df['question2'].str.len() > 0)
X_tr_df = X_tr_df[mask].copy()
y_tr = y_tr[mask.to_numpy()]

# ----------------------- VALIDATION SET -----------------------
X_va_df = (
    X_va_df
    .merge(qid_df_preprocess, left_on='qid1', right_on='qid', how='left')
    .drop(columns=['qid'])
    .merge(qid_df_preprocess, left_on='qid2', right_on='qid', how='left', suffixes=['_1', '_2'])
    .drop(columns=['qid'])
)

# replace the values in the question1 and question2 columns
X_va_df['question1'] = X_va_df['question_1']
X_va_df['question2'] = X_va_df['question_2']

# drop the additional question_1 and question_2 columns
X_va_df = X_va_df.drop(columns=['question_1', 'question_2'])

# remove empty questions
mask = (X_va_df['question1'].str.len() > 0) & (X_va_df['question2'].str.len() > 0)
X_va_df = X_va_df[mask].copy()
y_va = y_va[mask.to_numpy()]

# ----------------------- TEST SET -----------------------
X_te_df = (
    X_te_df
    .merge(qid_df_preprocess, left_on='qid1', right_on='qid', how='left')
    .drop(columns=['qid'])
    .merge(qid_df_preprocess, left_on='qid2', right_on='qid', how='left', suffixes=['_1', '_2'])
    .drop(columns=['qid'])
)

# replace the values in the question1 and question2 columns
X_te_df['question1'] = X_te_df['question_1']
X_te_df['question2'] = X_te_df['question_2']

# drop the additional question_1 and question_2 columns
X_te_df = X_te_df.drop(columns=['question_1', 'question_2'])

# remove empty questions
mask = (X_te_df['question1'].str.len() > 0) & (X_te_df['question2'].str.len() > 0)
X_te_df = X_te_df[mask].copy()
y_te = y_te[mask.to_numpy()]

In [None]:
# Preview the training split with the preprocessed questions in place.
X_tr_df.head(n=5)

##### Features

In [None]:
# Add 'unique_words_count' to every split: num_of_unique_words applied row by
# row to the two questions of the pair.
for split_df in (X_tr_df, X_va_df, X_te_df):
    split_df['unique_words_count'] = split_df.apply(
        lambda row: num_of_unique_words(row['question1'], row['question2']), axis=1
    )

In [None]:
# Preview the training split after adding 'unique_words_count'.
X_tr_df.head(n=5)

In [None]:
# Add 'diff_word_count' to every split: difference_word_count applied row by
# row to the two questions of the pair.
for split_df in (X_tr_df, X_va_df, X_te_df):
    split_df['diff_word_count'] = split_df.apply(
        lambda row: difference_word_count(row['question1'], row['question2']), axis=1
    )

In [None]:
# Preview the training split after adding 'diff_word_count'.
X_tr_df.head(n=5)

In [None]:
# Add 'common_word_count' to every split: common_words_count receives the whole row.
for split_df in (X_tr_df, X_va_df, X_te_df):
    split_df['common_word_count'] = split_df.apply(
        lambda row: common_words_count(row), axis=1
    )

In [None]:
# Preview the training split after adding 'common_word_count'.
X_tr_df.head(n=5)

In [None]:
# Add 'common_word_ratio' to every split: common_words_ratio receives the whole row.
for split_df in (X_tr_df, X_va_df, X_te_df):
    split_df['common_word_ratio'] = split_df.apply(
        lambda row: common_words_ratio(row), axis=1
    )

In [None]:
# Preview the training split after adding 'common_word_ratio'.
X_tr_df.head(n=5)

In [None]:
# Add 'first_word_same' to every split: first_word_equal receives the whole row.
for split_df in (X_tr_df, X_va_df, X_te_df):
    split_df['first_word_same'] = split_df.apply(
        lambda row: first_word_equal(row), axis=1
    )

In [None]:
# Preview the training split after adding 'first_word_same'.
X_tr_df.head(n=5)

In [None]:
# Add 'last_word_same' to every split: last_word_equal receives the whole row.
for split_df in (X_tr_df, X_va_df, X_te_df):
    split_df['last_word_same'] = split_df.apply(
        lambda row: last_word_equal(row), axis=1
    )

In [None]:
# Preview the training split after adding 'last_word_same'.
X_tr_df.head(n=5)

In [None]:
# Add 'fuzz_ratio' to every split: the fuzz_ratio helper receives the whole row.
for split_df in (X_tr_df, X_va_df, X_te_df):
    split_df['fuzz_ratio'] = split_df.apply(
        lambda row: fuzz_ratio(row), axis=1
    )

In [None]:
# Preview the training split after adding 'fuzz_ratio'.
X_tr_df.head(n=5)

In [None]:
# Add 'diff_char_count' to every split: absolute difference between the
# num_of_characters values of the two questions.
for split_df in (X_tr_df, X_va_df, X_te_df):
    split_df['diff_char_count'] = split_df.apply(
        lambda row: abs(num_of_characters(row['question1']) - num_of_characters(row['question2'])),
        axis=1,
    )

In [None]:
# Preview the training split after adding 'diff_char_count'.
X_tr_df.head(n=5)

In [None]:
# Add 'unique_words_ratio' to every split: total_unique_words_ratio applied row
# by row to the two questions of the pair.
for split_df in (X_tr_df, X_va_df, X_te_df):
    split_df['unique_words_ratio'] = split_df.apply(
        lambda row: total_unique_words_ratio(row['question1'], row['question2']), axis=1
    )

In [None]:
# Preview the training split after adding 'unique_words_ratio'.
X_tr_df.head(n=5)

In [None]:
# Add 'rare_word_count' to every split: absolute difference between the
# rare_word_count values of the two questions, each computed with the corpus
# word_counts and the value 5 as third argument.
for split_df in (X_tr_df, X_va_df, X_te_df):
    split_df['rare_word_count'] = split_df.apply(
        lambda row: abs(
            rare_word_count(row['question1'], word_counts, 5)
            - rare_word_count(row['question2'], word_counts, 5)
        ),
        axis=1,
    )

In [None]:
# Preview the training split after adding 'rare_word_count'.
X_tr_df.head(n=5)

In [None]:
# Add 'diff_syllable_count' to every split: absolute difference between the
# count_sentence_syllables values of the two questions.
for split_df in (X_tr_df, X_va_df, X_te_df):
    split_df['diff_syllable_count'] = split_df.apply(
        lambda row: abs(
            count_sentence_syllables(row['question1'])
            - count_sentence_syllables(row['question2'])
        ),
        axis=1,
    )

In [None]:
# Preview the training split after adding 'diff_syllable_count'.
X_tr_df.head(n=5)

In [None]:
# Add 'diff_flesch_reading_easy' to every split: absolute difference between
# the Flesch_Reading_Ease values of the two questions.
for split_df in (X_tr_df, X_va_df, X_te_df):
    split_df['diff_flesch_reading_easy'] = split_df.apply(
        lambda row: abs(
            Flesch_Reading_Ease(row['question1'])
            - Flesch_Reading_Ease(row['question2'])
        ),
        axis=1,
    )

In [None]:
# Preview the training split after adding 'diff_flesch_reading_easy'.
X_tr_df.head(n=5)

In [None]:
# Add 'diff_flesch_grade_level' to every split: absolute difference between
# the Flesch_Grade_Level values of the two questions.
for split_df in (X_tr_df, X_va_df, X_te_df):
    split_df['diff_flesch_grade_level'] = split_df.apply(
        lambda row: abs(
            Flesch_Grade_Level(row['question1'])
            - Flesch_Grade_Level(row['question2'])
        ),
        axis=1,
    )

In [None]:
# Preview the training split after adding 'diff_flesch_grade_level'.
X_tr_df.head(n=5)

##### Create question embeddings

In [None]:
import fasttext.util

In [None]:
# Fetch the pre-trained English fastText model; if_exists='ignore' is passed,
# presumably so that an already-downloaded file is not fetched again — confirm
# against the fasttext.util documentation.
fasttext.util.download_model('en', if_exists='ignore')  # English
# Load the binary model file from the current working directory.
ft_model = fasttext.load_model('cc.en.300.bin')

In [None]:
# (progress banner, split frame) pairs, processed in order: train, validation, test.
embedding_jobs = [
    ('===== Computing fasttext embeddings for training set =====', X_tr_df),
    ('===== Computing fasttext embeddings for validation set =====', X_va_df),
    ('===== Computing fasttext embeddings for test set =====', X_te_df),
]

embedded_splits = []
for banner, split_frame in embedding_jobs:
    print(banner)
    embedded_splits.append(get_fasttext_embeddings_and_features(split_frame, ft_model))

X_tr_q1q2, X_va_q1q2, X_te_q1q2 = embedded_splits

In [None]:
# The baseline model is trained without the raw question text, so drop both
# text columns from every split.
text_columns = ['question1', 'question2']
X_tr_q1q2_base = X_tr_q1q2.drop(columns=text_columns)
X_va_q1q2_base = X_va_q1q2.drop(columns=text_columns)
X_te_q1q2_base = X_te_q1q2.drop(columns=text_columns)

#### Train Model

In [None]:
import time
import xgboost as xgb

# Train the baseline XGBoost classifier (fixed seed) on every column except the
# identifiers, and report the wall-clock training time.
start_time = time.time()

xgb_model = xgb.XGBClassifier(random_state=123)
id_columns = ['id', 'qid1', 'qid2']
xgb_model.fit(X_tr_q1q2_base.drop(columns=id_columns), y_tr)

end_time = time.time()

elapsed_seconds = end_time - start_time
print("Training time:", elapsed_seconds, "seconds")

#### Save Model

In [None]:
# Persist the baseline XGBoost model together with the feature frames and
# labels of every split.
# The artifacts are (re)written on every run: the former "only save when the
# folder is missing" guard silently skipped saving after a re-train, leaving
# stale files on disk, and never completed a save that had been interrupted.
baseline_dir = os.path.join("model_artifacts", "improved_solution_baseline")
os.makedirs(baseline_dir, exist_ok=True)

# File name -> object to pickle.
baseline_artifacts = {
    'xgb_model.pkl': xgb_model,
    'X_tr_q1q2.pkl': X_tr_q1q2_base,
    'y_tr.pkl': y_tr,
    'X_va_q1q2.pkl': X_va_q1q2_base,
    'y_va.pkl': y_va,
    'X_te_q1q2.pkl': X_te_q1q2_base,
    'y_te.pkl': y_te,
}
for artifact_name, artifact in baseline_artifacts.items():
    with open(os.path.join(baseline_dir, artifact_name), 'wb') as file:
        pickle.dump(artifact, file)

<div class="alert" style="padding: 10px;background-color: #6bd0a9; color: white; margin-bottom: 15px; font-size:17px">
Baseline w/ CountVectorizer features
</div>

##### Add count_vectorizer

In [None]:
# Build corpus combining all questions in a list
all_q1 = list(X_tr_q1q2["question1"])
all_q2 = list(X_tr_q1q2["question2"])
all_questions = all_q1 + all_q2

In [None]:
# Unigram CountVectorizer fitted on the corpus built in the previous cell;
# the fitted vectorizer is the cell output.
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1, 1))
count_vectorizer.fit(raw_documents=all_questions)

In [None]:
# Build the CountVectorizer-based features of every split with the fitted
# vectorizer (order: train, validation, test).
X_tr_q1q2_cv, X_va_q1q2_cv, X_te_q1q2_cv = (
    get_countvectorizer_features(split_frame, count_vectorizer)
    for split_frame in (X_tr_q1q2, X_va_q1q2, X_te_q1q2)
)

##### Train Model

In [None]:
import time
import xgboost as xgb

# Train XGBoost (fixed seed) on the CountVectorizer feature matrix and report
# the wall-clock training time.
start_time = time.time()

xgb_model_cv = xgb.XGBClassifier(random_state=123)
# Columns 0-2 are skipped — presumably the id/qid1/qid2 identifiers; verify
# against get_countvectorizer_features.
train_matrix_cv = X_tr_q1q2_cv[:, 3:]
xgb_model_cv.fit(train_matrix_cv, y_tr)

end_time = time.time()

elapsed_seconds = end_time - start_time
print("Training time:", elapsed_seconds, "seconds")

##### Save Model

In [None]:
# Persist the CountVectorizer-variant XGBoost model together with the feature
# matrices and labels of every split.
# The artifacts are (re)written on every run: the former "only save when the
# folder is missing" guard silently skipped saving after a re-train, leaving
# stale files on disk, and never completed a save that had been interrupted.
baseline_cv_dir = os.path.join("model_artifacts", "improved_solution_baseline_cv")
os.makedirs(baseline_cv_dir, exist_ok=True)

# File name -> object to pickle.
baseline_cv_artifacts = {
    'xgb_model.pkl': xgb_model_cv,
    'X_tr_q1q2.pkl': X_tr_q1q2_cv,
    'y_tr.pkl': y_tr,
    'X_va_q1q2.pkl': X_va_q1q2_cv,
    'y_va.pkl': y_va,
    'X_te_q1q2.pkl': X_te_q1q2_cv,
    'y_te.pkl': y_te,
}
for artifact_name, artifact in baseline_cv_artifacts.items():
    with open(os.path.join(baseline_cv_dir, artifact_name), 'wb') as file:
        pickle.dump(artifact, file)

<div class="alert" style="padding: 10px;background-color: #6bd0a9; color: white; margin-bottom: 15px; font-size:17px">
Baseline w/ TF-IDF features
</div>

##### Add TF-IDF features

In [None]:
# Build the Cython extension described by cython_utils/setup.py and place the
# build output in ./cython_utils, presumably so the next cell can import it.
!python cython_utils/setup.py build_ext --build-lib=./cython_utils

In [None]:
from cython_utils import tf_idf

In [None]:
# Create sklearn tfidf vectorizer
# NOTE(review): this rebinds the name `tf_idf`, which the previous cell imported
# from cython_utils, so the compiled module is no longer reachable under that
# name — confirm which implementation get_tfidf_features is meant to receive.
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf = TfidfVectorizer()
# Fitted on the whole preprocessed question database, which is built from all
# of train_df and therefore also covers the validation/test questions, unlike
# the CountVectorizer above, which is fitted on training questions only.
tf_idf.fit(qid_df_preprocess['question'])

In [None]:
# Build the TF-IDF-based features of every split with the fitted vectorizer
# (order: train, validation, test).
X_tr_q1q2_tf, X_va_q1q2_tf, X_te_q1q2_tf = (
    get_tfidf_features(split_frame, tf_idf)
    for split_frame in (X_tr_q1q2, X_va_q1q2, X_te_q1q2)
)

##### Train Model

In [None]:
import time
import xgboost as xgb

# Train XGBoost (fixed seed) on the TF-IDF feature matrix and report the
# wall-clock training time.
start_time = time.time()

xgb_model_tf = xgb.XGBClassifier(random_state=123)
# Columns 0-2 are skipped — presumably the id/qid1/qid2 identifiers; verify
# against get_tfidf_features.
train_matrix_tf = X_tr_q1q2_tf[:, 3:]
xgb_model_tf.fit(train_matrix_tf, y_tr)

end_time = time.time()

elapsed_seconds = end_time - start_time
print("Training time:", elapsed_seconds, "seconds")

##### Save Model

In [None]:
# Persist the TF-IDF-variant XGBoost model together with the feature matrices
# and labels of every split.
# The artifacts are (re)written on every run: the former "only save when the
# folder is missing" guard silently skipped saving after a re-train, leaving
# stale files on disk, and never completed a save that had been interrupted.
baseline_tf_dir = os.path.join("model_artifacts", "improved_solution_baseline_tf")
os.makedirs(baseline_tf_dir, exist_ok=True)

# File name -> object to pickle.
baseline_tf_artifacts = {
    'xgb_model.pkl': xgb_model_tf,
    'X_tr_q1q2.pkl': X_tr_q1q2_tf,
    'y_tr.pkl': y_tr,
    'X_va_q1q2.pkl': X_va_q1q2_tf,
    'y_va.pkl': y_va,
    'X_te_q1q2.pkl': X_te_q1q2_tf,
    'y_te.pkl': y_te,
}
for artifact_name, artifact in baseline_tf_artifacts.items():
    with open(os.path.join(baseline_tf_dir, artifact_name), 'wb') as file:
        pickle.dump(artifact, file)

<div class="alert" style="padding: 20px;background-color: #2cbc84; color: white; margin-bottom: 15px;">
Improved solution with feature selection
</div>

#### Find most important features

In [None]:
# Retrieve the feature importance scores
importance_scores = xgb_model.get_booster().get_score(importance_type='weight')

# Sort the feature importance scores in descending order
sorted_scores = sorted(importance_scores.items(), key=lambda x: x[1], reverse=True)
print(sorted_scores)

#### New dataset only with the most important features

In [None]:
# Select the top n features
n = 300  # set the number of top features you want to select
top_features = [(x[0][0:]) for x in sorted_scores[:n]]
print(top_features)

In [None]:
# Keep the identifier columns next to the selected features: the training
# cells drop them again before fitting. Each one is appended only when it is
# missing, so re-running this cell does not duplicate columns in the frames.
for id_column in ('qid1', 'qid2', 'id'):
    if id_column not in top_features:
        top_features.append(id_column)

# Select only the top features in the training data
X_tr_q1q2_top = X_tr_q1q2[top_features]
# Select only the top features in the validation data
X_va_q1q2_top = X_va_q1q2[top_features]
# Select only the top features in the test data
X_te_q1q2_top = X_te_q1q2[top_features]

<div class="alert" style="padding: 10px;background-color: #6bd0a9; color: white; margin-bottom: 15px; font-size:17px">
w/ XGBoost classifier
</div>

In [None]:
import time
import xgboost as xgb

# Train XGBoost (fixed seed) on the selected top features, without the
# identifier columns, and report the wall-clock training time.
start_time = time.time()

xgb_model_improve = xgb.XGBClassifier(random_state=123)
id_columns = ['id', 'qid1', 'qid2']
xgb_model_improve.fit(X_tr_q1q2_top.drop(columns=id_columns), y_tr)

end_time = time.time()

elapsed_seconds = end_time - start_time
print("Training time:", elapsed_seconds, "seconds")

In [None]:
# Persist the top-feature XGBoost model together with the feature frames and
# labels of every split.
# The artifacts are (re)written on every run: the former "only save when the
# folder is missing" guard silently skipped saving after a re-train, leaving
# stale files on disk, and never completed a save that had been interrupted.
# This also guarantees the folder exists for the later cells, which write
# further model files into it unconditionally.
topfeatures_dir = os.path.join("model_artifacts", "improved_solution_topfeatures")
os.makedirs(topfeatures_dir, exist_ok=True)

# File name -> object to pickle.
topfeatures_artifacts = {
    'xgb_model.pkl': xgb_model_improve,
    'X_tr_q1q2.pkl': X_tr_q1q2_top,
    'y_tr.pkl': y_tr,
    'X_va_q1q2.pkl': X_va_q1q2_top,
    'y_va.pkl': y_va,
    'X_te_q1q2.pkl': X_te_q1q2_top,
    'y_te.pkl': y_te,
}
for artifact_name, artifact in topfeatures_artifacts.items():
    with open(os.path.join(topfeatures_dir, artifact_name), 'wb') as file:
        pickle.dump(artifact, file)

<div class="alert" style="padding: 10px;background-color: #6bd0a9; color: white; margin-bottom: 15px; font-size:17px">
w/ Random Forest classifier
</div>

In [None]:
import time
from sklearn.ensemble import RandomForestClassifier

# Train a depth-limited random forest (fixed seed) on the selected top
# features, without the identifier columns, and report the training time.
start_time = time.time()

rf_model = RandomForestClassifier(max_depth=5, random_state=123)
id_columns = ['id', 'qid1', 'qid2']
rf_model.fit(X_tr_q1q2_top.drop(columns=id_columns), y_tr)

end_time = time.time()

elapsed_seconds = end_time - start_time
print("Training time:", elapsed_seconds, "seconds")

In [None]:
# Pickle the random forest next to the other top-feature artifacts.
rf_model_path = 'model_artifacts/improved_solution_topfeatures/rf_model.pkl'
with open(rf_model_path, 'wb') as file:
    pickle.dump(rf_model, file)

<div class="alert" style="padding: 10px;background-color: #6bd0a9; color: white; margin-bottom: 15px; font-size:17px">
w/ Histogram-Based Gradient Boosting classifier
</div>

In [None]:
import time
from sklearn.ensemble import HistGradientBoostingClassifier

# Train a histogram-based gradient boosting classifier (fixed seed) on the
# selected top features, without the identifier columns, and report the
# training time.
start_time = time.time()

hbgd_model = HistGradientBoostingClassifier(max_depth=20, max_iter=500, random_state=123)
id_columns = ['id', 'qid1', 'qid2']
hbgd_model.fit(X_tr_q1q2_top.drop(columns=id_columns), y_tr)

end_time = time.time()

elapsed_seconds = end_time - start_time
print("Training time:", elapsed_seconds, "seconds")

In [None]:
# Pickle the histogram-based gradient boosting model next to the other top-feature artifacts.
hbgd_model_path = 'model_artifacts/improved_solution_topfeatures/hbgd_model.pkl'
with open(hbgd_model_path, 'wb') as file:
    pickle.dump(hbgd_model, file)

<div class="alert" style="padding: 10px;background-color: #6bd0a9; color: white; margin-bottom: 15px; font-size:17px">
w/ Ensembling
XGBoost + HistGradientBoostingClassifier
</div>

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of a fresh XGBoost and a fresh histogram-based gradient
# boosting classifier, trained on the selected top features without the
# identifier columns. time, xgb and HistGradientBoostingClassifier are imported
# by earlier cells.
start_time = time.time()

clf1 = xgb.XGBClassifier(random_state=123)
clf3 = HistGradientBoostingClassifier(max_depth=20, max_iter=500, random_state=123)

voting_members = [('xgb', clf1), ('hgbc', clf3)]
eclf1 = VotingClassifier(estimators=voting_members, voting='soft').fit(
    X_tr_q1q2_top.drop(columns=['id', 'qid1', 'qid2']), y_tr
)

end_time = time.time()
elapsed_seconds = end_time - start_time
print("Training time:", elapsed_seconds, "seconds")

In [None]:
# Pickle the voting ensemble next to the other top-feature artifacts.
eclf1_path = 'model_artifacts/improved_solution_topfeatures/eclf1.pkl'
with open(eclf1_path, 'wb') as file:
    pickle.dump(eclf1, file)