In [1]:
%matplotlib inline

In [2]:
import os
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from bs4 import BeautifulSoup

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVR, LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, Normalizer

In [5]:
from numpy.random import seed
# Seed NumPy's global random generator so code that draws from it (for example
# the np.random.uniform fallback in get_bowv) starts from a fixed state.
seed(1)

In [6]:
# Base directory for the input files; later cells build the paths to train.csv,
# test.csv and the GloVe vectors relative to it.
data_dir = './'

In [7]:
# from google.colab import drive
# drive.mount('/content/gdrive')

# data_dir = '/content/gdrive/My Drive/ML colab datasets/crowdflower-search-relevance'

# !cp -v '{data_dir}/kappa_intuition.py' ./


# !mkdir -p /root/.kaggle
# !cp -v '{data_dir}/kaggle.json' /root/.kaggle/

In [8]:
from kappa_intuition import quadratic_weighted_kappa

In [9]:
!ls '{data_dir}'

Makefile                   submission.csv
[34m__pycache__[m[m                test.csv
glove.6B.100d.txt          tfidf_voting.ipynb
kaggle.json                train.csv
kappa_intuition.py         [34mvenv[m[m
requirements.txt           word_lstm_multi_head.ipynb
sampleSubmission.csv


In [10]:
# from google.colab import files
# uploaded = files.upload()

In [11]:
# Location for a pickled pipeline. os.path.join is used instead of plain string
# concatenation so the path stays valid when data_dir has no trailing slash.
pipeline_path = os.path.join(data_dir, 'pipeline.pkl')

In [12]:
# Read the training CSV, replace missing cells with empty strings, and shuffle
# the rows (shuffle draws from the global RNG because no random_state is given).
train_file = f'{data_dir}/train.csv'
df = shuffle(pd.read_csv(train_file).fillna(''))
df.head()

Unnamed: 0,id,query,product_title,product_description,median_relevance,relevance_variance
9427,30313,digital camera,FUJIFILM Black FinePix S8630 16 MP 36x Optical...,The FinePix S8600 bridge camera features a pow...,4,0.4
2378,7576,baseball cleats,Easton Women's Redline Low Softball Cleat,Easton incorporates leading technology and des...,3,0.98
8293,26717,girls halloween costumes,Dress Up America Girls' 'Japanese Girl' Costume,This Japanese Girl costume from Dress Up Ameri...,4,0.471
9042,29066,car jump starter,Stanley 500 AMP/1000 PEAK AMP Battery Jump Sta...,Don't get stranded. This 500 instant/1000 peak...,3,0.471
1024,3315,micro usb to hdmi,Insten Micro USB to HDMI MHL Adapter + AC Char...,,4,0.0


In [13]:
import nltk
# Requests the NLTK 'book' collection. The output recorded for this cell shows
# the download failing with an SSL certificate verification error.
# NOTE(review): word_tokenize is only referenced from a commented-out line in
# preprocess, so this download may not be needed at all - confirm.
nltk.download('book')
from nltk import word_tokenize
from nltk.stem import PorterStemmer

[nltk_data] Error loading book: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed
[nltk_data]     (_ssl.c:852)>


In [15]:
# Porter stemmer instance; in this notebook it is only referenced by the
# commented-out stemming line inside preprocess.
stemmer = PorterStemmer()

In [16]:
def loadGloveModel(glove_file):
    """Load word vectors from a GloVe-style text file.

    Each non-blank line is expected to hold a token followed by its
    whitespace-separated float components.

    Parameters
    ----------
    glove_file : str
        Path to the text file.

    Returns
    -------
    dict
        Maps each token to a 1-D numpy float array.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    print("Loading Glove Model")
    model = {}
    # Decode explicitly as UTF-8: the vocabulary contains non-ASCII tokens and
    # the platform default encoding is not guaranteed to handle them.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank line (e.g. a trailing newline): nothing to index.
                continue
            word = splitLine[0]
            embedding = np.array([float(val) for val in splitLine[1:]])
            model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

In [17]:
glove_file = f'{data_dir}/glove.6B.100d.txt'
# Reuse the vectors if they are already in the kernel; parsing the file again
# is only needed when glove_model is missing or is not dict-like.
# Only those two conditions are caught, so real failures (and Ctrl-C) are no
# longer silently turned into a reload.
try:
    glove_model.keys()
except (NameError, AttributeError):
    glove_model = loadGloveModel(glove_file)

Loading Glove Model
Done. 400000  words loaded!


In [18]:
def preprocess(text):
    """Normalise one raw text field.

    Literal backslash-n sequences are removed, literal backslash-t sequences
    become a single space, and the result is lower-cased and stripped of
    leading/trailing whitespace.
    """
    without_escapes = text.replace('\\n', '').replace('\\t', ' ')
    # Disabled steps kept for reference:
    # soup = BeautifulSoup(text)
    # text = soup.get_text()
    # text = ' '.join([stemmer.stem(w) for w in word_tokenize(text)])
    return without_escapes.lower().strip()

In [19]:
def get_num_words(text):
    """Return the number of whitespace-separated words in `text`.

    The text is run through preprocess first, so the count reflects the
    cleaned string.
    """
    text = preprocess(text)
    # len(text) would count characters; split on whitespace to count words.
    return len(text.split())

In [20]:
def get_lengths(df):
    """Apply get_num_words to every item yielded by iterating `df`.

    Returns the results as a list, in iteration order.
    """
    return [get_num_words(item) for item in df]

In [21]:
def get_data(df):
    """Split a frame into parallel lists of queries, titles and labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'query', 'product_title' and
        'median_relevance'.

    Returns
    -------
    tuple of (list, list, list)
        Queries, product titles and median relevance values, in row order.
    """
    # The 'id' column is deliberately not read: nothing here uses it, and
    # reading it would make the column mandatory for no benefit.
    queries = list(df['query'])
    titles = list(df['product_title'])
    outputs = list(df['median_relevance'])
    return queries, titles, outputs

In [22]:
# Clean both text fields and pool them into one corpus; the vectorizers
# defined below are fitted on this pooled list.
queries, titles, outputs = get_data(df)
queries = [preprocess(q) for q in queries]
titles = [preprocess(t) for t in titles]
all_text = queries + titles
len(all_text)

20316

In [23]:
# Sanity check: the three parallel lists should all have the same length.
len(queries), len(titles), len(outputs)

(10158, 10158, 10158)

In [24]:
# Word-level TF-IDF over unigrams and bigrams, fitted on the pooled
# query + title corpus.
tfidf_w = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1,2),
    lowercase=True,
    stop_words='english',
    min_df=3,
    max_features=10000,
)
tfidf_w.fit(all_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=10000,
                min_df=3, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [25]:
# CountVectorizer?

In [26]:
# Plain term counts (default n-gram range), fitted on the same pooled corpus.
count_vec = CountVectorizer(
    analyzer='word',
    lowercase=True,
    stop_words='english',
    min_df=3,
    max_features=10000,
)
count_vec.fit(all_text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=10000, min_df=3,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [44]:
# Hold out 20% of the rows for validation; random_state pins the split.
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=1)

In [45]:
def process_df(df):
    """Build a new frame holding the preprocessed query and title text.

    Only the 'query' and 'product_title' columns of `df` are read. The result
    is a fresh DataFrame with those two columns and a default integer index.
    """
    cols = ['query', 'product_title']
    cleaned = {
        name: [preprocess(value) for value in df[name].tolist()]
        for name in cols
    }
    return pd.DataFrame(data=cleaned, columns=cols)

In [46]:
# Cleaned text frames for the training and validation splits.
X_train, X_valid = (process_df(part) for part in (train_df, valid_df))

In [47]:
# X_train = train_df[['query', 'product_title']]
# X_valid = valid_df[['query', 'product_title']]

In [48]:
# Target arrays: the median relevance rating for each row of the two splits.
label_col = 'median_relevance'
y_train = train_df[label_col].values
y_valid = valid_df[label_col].values

In [49]:
def batch_cosine_sim(a, b):
    """Row-wise cosine similarity between two equally shaped 2-D arrays.

    Row i of the result is the dot product of a[i] and b[i] divided by the
    product of their L2 norms. 1e-6 is added to each norm so all-zero rows
    give 0 instead of a division by zero.

    Raises AssertionError when the shapes differ or the inputs are not 2-D.
    """
    assert a.shape == b.shape
    assert len(a.shape) == 2
    dots = np.sum(np.multiply(a, b), axis=1)
    scale = (np.linalg.norm(a, axis=1) + 1e-6) * (np.linalg.norm(b, axis=1) + 1e-6)
    return dots / scale

In [50]:
# ax = plt.scatter(x=dist1[:200], y=dist2[:200], c=y_train[:200])
# # plt.legend(labels=[1, 2, 3, 4])
# plt.show()

In [51]:
def get_bowv(text, embeddings=None):
    """Average the word vectors of the whitespace-separated tokens in `text`.

    Parameters
    ----------
    text : str
        Text to embed; it is split on whitespace.
    embeddings : mapping of str to 1-D numpy array, optional
        Word-vector lookup. Defaults to the module-level glove_model. It must
        contain the key 'the', which is used to determine the vector shape.

    Returns
    -------
    numpy.ndarray
        Mean of the token vectors. When `text` has no tokens an all-zero
        vector is returned, so callers that stack the results always get
        rows of equal shape.
    """
    if embeddings is None:
        embeddings = glove_model
    shape = embeddings['the'].shape
    tokens = text.split()
    if not tokens:
        # The mean of an empty array is a NaN scalar, which would break
        # stacking the per-row results into a 2-D array.
        return np.zeros(shape)
    vecs = []
    for w in tokens:
        if w in embeddings:
            v = embeddings[w]
        else:
            # Unknown tokens get a fresh small random vector on every call,
            # drawn from NumPy's global RNG.
            v = np.random.uniform(-0.05, 0.05, shape)
        vecs.append(v)
    return np.mean(np.array(vecs), axis=0)

In [52]:
def add_comparision_feats(df):
    """Return a copy of `df` with four query-vs-title comparison columns.

    Added columns:
      tfidf_sim      - cosine similarity of the tfidf_w encodings
      count_sim      - cosine similarity of the count_vec encodings
      glove_sim      - cosine similarity of the averaged word vectors
      char_len_ratio - len(query) / (len(title) + 1)

    `df` must provide 'query' and 'product_title' columns; it is not modified.
    """
    query_text = df['query'].values
    title_text = df['product_title'].values

    def _encoded_sim(vectorizer):
        # Encode both columns with the same fitted vectorizer and compare
        # them row by row.
        q_mat = vectorizer.transform(query_text).toarray()
        t_mat = vectorizer.transform(title_text).toarray()
        return batch_cosine_sim(q_mat, t_mat)

    tfidf_sim = _encoded_sim(tfidf_w)
    count_sim = _encoded_sim(count_vec)

    # Queries are embedded before titles; get_bowv may draw from the global
    # RNG, so this order is kept as is.
    glove_query = np.array([get_bowv(q) for q in query_text])
    glove_title = np.array([get_bowv(t) for t in title_text])
    glove_sim = batch_cosine_sim(glove_query, glove_title)

    query_chars = np.array([len(q) for q in query_text])
    # +1 keeps the denominator non-zero for empty titles.
    title_chars = np.array([len(t) for t in title_text]) + 1

    enriched = df.copy()
    enriched['tfidf_sim'] = tfidf_sim
    enriched['count_sim'] = count_sim
    enriched['glove_sim'] = glove_sim
    enriched['char_len_ratio'] = query_chars/title_chars
    return enriched

In [53]:
# Add the similarity / length-ratio features to both splits.
# get_bowv draws random vectors for out-of-vocabulary words from the global
# RNG, so the order of these two calls affects the exact glove_sim values.
X_train_2 = add_comparision_feats(X_train)
X_valid_2 = add_comparision_feats(X_valid)

In [54]:
# X_train_2.drop('glove_sim', axis='columns', inplace=True)
# X_valid_2.drop('glove_sim', axis='columns', inplace=True)
# Preview the engineered columns on the validation split.
X_valid_2.head()

Unnamed: 0,query,product_title,tfidf_sim,count_sim,glove_sim,char_len_ratio
0,double stroller,foundations lx6 6-passenger stroller,0.208738,0.408248,0.550574,0.405405
1,levis 505,levi's vintage clothing customized 505 jeans -...,0.178228,0.288675,0.166997,0.157895
2,brett favre ny titans jersey blue,ny jets childs reebok jersey #4 brett favre gr...,0.347981,0.492366,0.840454,0.471429
3,tv,vizio e320i-b2 32-inch 720p 60hz full-array sm...,0.0,0.0,0.559279,0.025316
4,green bay packers,green bay packers helmet clock,0.790676,0.774596,0.902818,0.548387


In [55]:
# Encode the two text columns with TF-IDF; every other column (the numeric
# comparison features) is passed through unchanged.
text_encoders = [
    ('query_tfidf', tfidf_w, 'query'),
    # ('query_count', count_vec, 'query'),
    ('title_tfidf', tfidf_w, 'product_title'),
    # ('title_count', count_vec, 'product_title'),
]
col_trans = ColumnTransformer(text_encoders, remainder='passthrough')

In [56]:
# Keyword sets shared by estimators that are configured identically.
lr_params = dict(n_jobs=-1, multi_class='ovr', solver='lbfgs', class_weight='balanced')
forest_params = dict(n_estimators=25, n_jobs=-1, max_features='log2', max_depth=None,
                     min_samples_leaf=1, class_weight='balanced')
bagging_params = dict(max_samples=0.5, max_features=0.5, n_estimators=25, n_jobs=-1)

# Linear models
lin_svc = LinearSVC(class_weight='balanced')
rid = RidgeClassifier(class_weight='balanced')
lr = LogisticRegression(**lr_params)

# Nearest-neighbour models with 3, 5 and 7 neighbours
knn1 = KNeighborsClassifier(3)
knn2 = KNeighborsClassifier(5)
knn3 = KNeighborsClassifier(7)

# Tree ensembles and boosting
ada = AdaBoostClassifier(n_estimators=25)
rf = RandomForestClassifier(**forest_params)
et = ExtraTreesClassifier(**forest_params)

# Bagged linear models: each member is fitted on half the samples and half
# the features.
bg_svc = BaggingClassifier(base_estimator=LinearSVC(class_weight='balanced'), **bagging_params)
bg_lr = BaggingClassifier(base_estimator=LogisticRegression(**lr_params), **bagging_params)

In [57]:
# Candidates to score individually on the validation split.
estimators = [
    ('lr', lr),
    ('lin_svc', lin_svc),
    ('bg_svc', bg_svc),
    ('bg_lr', bg_lr),
]

# Fit each candidate behind the shared column transformer and report its
# quadratic weighted kappa on the held-out rows.
for est_name, clfr in estimators:
    print(est_name)
    pipe = Pipeline([('col_trans', col_trans), ('clfr', clfr)])
    _ = pipe.fit(X=X_train_2, y=y_train)
    y_preds = pipe.predict(X_valid_2)
    print(quadratic_weighted_kappa(y_valid, y_preds))
    print('\n')

lr
0.6098876271187227


lin_svc
0.6190645250588552


bg_svc
0.6283602431931831


bg_lr
0.600860634563839




In [58]:
# Soft-voting ensemble over the individual models defined earlier.
voting_members = [
    ('lr', lr),
    # ('lin_svc', lin_svc),
    ('knn1', knn1),
    ('knn2', knn2),
    ('knn3', knn3),
    ('rf', rf),
    ('et', et),
    ('bg_svc', bg_svc),
    ('bg_lr', bg_lr),
]
clfr = VotingClassifier(estimators=voting_members, voting='soft', n_jobs=-1)

In [59]:
# Final model: column-wise TF-IDF encoding followed by the voting ensemble.
pipeline_steps = [('col_trans', col_trans), ('clfr', clfr)]
pipeline = Pipeline(pipeline_steps)

In [60]:
# Fit the full pipeline on the training features; %time reports how long it
# takes, and the result is bound to _ so the fitted object is not echoed.
%time _ = pipeline.fit(X=X_train_2, y=y_train)

CPU times: user 526 ms, sys: 244 ms, total: 770 ms
Wall time: 10.3 s


In [68]:
# Score the ensemble on the held-out split with quadratic weighted kappa.
y_preds = pipeline.predict(X_valid_2)
print(y_preds)
quadratic_weighted_kappa(y_valid, y_preds)

[4 4 2 ... 4 2 4]


0.6461181016862544

In [69]:
# Confusion matrix for the validation predictions: rows follow y_valid (true
# labels) and columns follow y_preds, per scikit-learn's convention.
print(confusion_matrix(y_valid, y_preds))

[[  95   42   14   23]
 [  27  118   42  106]
 [  13   52   83  198]
 [   5   45   71 1098]]


In [None]:
# y_preds = pipeline.predict(X_train)
# print(y_preds)
# quadratic_weighted_kappa(y_train, y_preds)

In [62]:
# Read the test set and run it through the same cleaning and feature
# engineering steps that were applied to the training data.
test_file = f'{data_dir}/test.csv'
test_df = pd.read_csv(test_file).fillna('')
ids = list(test_df['id'])
X_test = process_df(test_df)
X_test_2 = add_comparision_feats(X_test)

In [63]:
# Predict relevance for the test rows and write an id/prediction CSV.
# Note that y_preds is rebound here, replacing the validation predictions.
y_preds = pipeline.predict(X_test_2)
submission = pd.DataFrame({"id": ids, "prediction": y_preds})
# Written to the current working directory, not to data_dir.
submission.to_csv("submission.csv", index=False)

In [65]:
!kaggle competitions submit -c crowdflower-search-relevance -f submission.csv -m "feat eng voting 3"

100%|████████████████████████████████████████| 168k/168k [00:13<00:00, 12.7kB/s]
Successfully submitted to Crowdflower Search Results Relevance