In [4]:
import os
os.chdir('/home/ubuntu/search_relevance')
os.getcwd()

'/home/ubuntu/search_relevance'

In [5]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn import decomposition, pipeline, metrics, grid_search

In [6]:

# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings
    """
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0 for i in range(num_ratings)]
                for j in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat

In [7]:
def histogram(ratings, min_rating=None, max_rating=None):
    """
    Returns the counts of each type of rating that a rater made
    """
    if min_rating is None:
        min_rating = min(ratings)
    if max_rating is None:
        max_rating = max(ratings)
    num_ratings = int(max_rating - min_rating + 1)
    hist_ratings = [0 for x in range(num_ratings)]
    for r in ratings:
        hist_ratings[r - min_rating] += 1
    return hist_ratings

In [8]:
def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa
    axquadratic_weighted_kappa calculates the quadratic weighted kappa
    value, which is a measure of inter-rater agreement between two raters
    that provide discrete numeric ratings.  Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement).  A kappa value of 0 is expected if all agreement is due to
    chance.
    quadratic_weighted_kappa(rater_a, rater_b), where rater_a and rater_b
    each correspond to a list of integer ratings.  These lists must have the
    same length.
    The ratings should be integers, and it is assumed that they contain
    the complete range of possible ratings.
    quadratic_weighted_kappa(X, min_rating, max_rating), where min_rating
    is the minimum possible rating, and max_rating is the maximum possible
    rating
    """
    rater_a = y
    rater_b = y_pred
    min_rating=None
    max_rating=None
    rater_a = np.array(rater_a, dtype=int)
    rater_b = np.array(rater_b, dtype=int)
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return (1.0 - numerator / denominator)

In [9]:
#load the test & training file
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [10]:
#We don't need ID columns
idx = test.id.values.astype(int)
train = train.drop('id', axis = 1)
test = test.drop('id', axis = 1)

In [11]:
train.head()

Unnamed: 0,query,product_title,product_description,median_relevance,relevance_variance
0,bridal shower decorations,Accent Pillow with Heart Design - Red/Black,Red satin accent pillow embroidered with a hea...,1,0.0
1,led christmas lights,Set of 10 Battery Operated Multi LED Train Chr...,Set of 10 Battery Operated Train Christmas Lig...,4,0.0
2,projector,ViewSonic Pro8200 DLP Multimedia Projector,,4,0.471
3,wine rack,Concept Housewares WR-44526 Solid-Wood Ceiling...,"Like a silent and sturdy tree, the Southern En...",4,0.0
4,light bulb,Wintergreen Lighting Christmas LED Light Bulb ...,"WTGR1011\nFeatures\nNickel base, 60,000 averag...",2,0.471


In [12]:
train.shape[:]

(10158, 5)

In [13]:
test.head()

Unnamed: 0,query,product_title,product_description
0,electric griddle,Star-Max 48 in Electric Griddle,
1,phillips coffee maker,Philips SENSEO HD7810 WHITE Single Serve Pod C...,
2,san francisco 49ers,2013 San Francisco 49ers Clock,A 2013 San Francisco 49ers clock is the ultima...
3,aveeno shampoo,AVEENO 10.5FLOZ NRSH SHINE SH,"Water, Ammonium Lauryl Sulfate, Dimethicone, S..."
4,flea and tick control for dogs,Merial Frontline Plus Flea and Tick Control fo...,


In [10]:
test.shape[:]

(22513, 3)

In [14]:
# create labels. drop useless columns
y = train.median_relevance.values
train = train.drop(['median_relevance', 'relevance_variance'], axis=1)

In [15]:
# do some lambda magic on text columns
traindata = list(train.apply(lambda x:'%s %s %s' % (x['query'],x['product_title'], x['product_description']),axis=1))
testdata = list(test.apply(lambda x:'%s %s %s' % (x['query'],x['product_title'], x['product_description']),axis=1))

In [16]:
traindata[:3]

['bridal shower decorations Accent Pillow with Heart Design - Red/Black Red satin accent pillow embroidered with a heart in black thread. 8" x 8".',
 'led christmas lights Set of 10 Battery Operated Multi LED Train Christmas Lights - Clear Wire Set of 10 Battery Operated Train Christmas Lights Item #X124210 Features: Color: multi-color bulbs with matching train light covers / clear wire Multi-color consists of red, green, blue and yellow bulbs Number of bulbs on string: 10 Bulb size: micro LED Spacing between bulbs: 6 inches Lighted length: 4.5 feet Total length: 5.5 feet 12 inch lead cord Additional product features: LED lights use 90% less energy Cool to the touch If one bulb burns out, the rest will stay lit Lights are equipped with Lamp Lock feature, which makes them replaceable, interchangeable and keeps them from falling out Requires 3 "AA" batteries (not included) Convenient on/off/timer switch located on battery pack Timer function on battery pack allows for 6 hours on and 18 h

In [17]:
# the infamous tfidf vectorizer (Do you remember this one?)
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
        strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
        ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,
        stop_words = 'english')

In [18]:
# Fit TFIDF
tfv.fit(traindata)
X =  tfv.transform(traindata) 
X_test = tfv.transform(testdata)

In [19]:
#initialize SVD
svd = TruncatedSVD()

In [20]:
# initialize the standardscaler
scl = StandardScaler()

In [21]:
# We will use svm here
svm_model =SVC()

In [22]:
# Create the pipeline
clf = pipeline.Pipeline([('svd', svd),
                        ('scl', scl),
                        ('svm', svm_model)])

In [23]:
#kappa Scorer
kappa_scorer = metrics.make_scorer(quadratic_weighted_kappa, greater_is_better = True)

In [24]:
# Create a parameter grid to search for best parameters for everything in the pipeline
param_grid = {'svd__n_components' : [120, 130],
              'svm__C': [1.0, 15]}

In [25]:
# Initialize Grid Search Model
model = grid_search.GridSearchCV(estimator = clf, param_grid=param_grid, scoring=kappa_scorer,
                                 verbose=10, n_jobs=-1, iid=True, refit=True, cv=5)

In [26]:
#Fit Grid Search Model
model.fit(X, y)
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
	print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:   27.9s
[Parallel(n_jobs=-1)]: Done   2 jobs       | elapsed:   27.9s
[Parallel(n_jobs=-1)]: Done   5 jobs       | elapsed:   55.2s
[Parallel(n_jobs=-1)]: Done   8 jobs       | elapsed:   57.2s
[Parallel(n_jobs=-1)]: Done  13 jobs       | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  14 out of  20 | elapsed:  1.9min remaining:   49.2s
[Parallel(n_jobs=-1)]: Done  17 out of  20 | elapsed:  2.4min remaining:   25.1s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  2.4min finished


Best score: 0.533
Best parameters set:
	svd__n_components: 130
	svm__C: 15
[CV] svd__n_components=120, svm__C=1.0 ...............................
[CV] svd__n_components=120, svm__C=1.0 ...............................
[CV] svd__n_components=120, svm__C=1.0 ...............................
[CV] svd__n_components=120, svm__C=1.0 ...............................
[CV] ...... svd__n_components=120, svm__C=1.0, score=0.129182 -  27.8s[CV] ...... svd__n_components=120, svm__C=1.0, score=0.098930 -  27.8s[CV] ...... svd__n_components=120, svm__C=1.0, score=0.130541 -  27.7s[CV] ...... svd__n_components=120, svm__C=1.0, score=0.112231 -  27.6s



[CV] svd__n_components=120, svm__C=15 ................................
[CV] svd__n_components=120, svm__C=15 ................................
[CV] svd__n_components=120, svm__C=15 ................................
[CV] svd__n_components=120, svm__C=1.0 ...............................
[CV] ....... svd__n_components=120, svm__C=15, score=0.494611 -  29.8s[CV

In [28]:
# Get Best Model
best_model = model.best_estimator_
best_model

Pipeline(steps=[('svd', TruncatedSVD(algorithm='randomized', n_components=130, n_iter=5,
       random_state=None, tol=0.0)), ('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svm', SVC(C=15, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

In [29]:
best_model.fit(X,y)
preds= best_model.predict(X_test)

In [30]:
#create submission file
submission = pd.DataFrame({"id":idx, "prediction":preds})
submission.to_csv("may_19th_1st.csv",index=False)