In [241]:
import pandas as pd
from fuzzywuzzy import fuzz, process
import os
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import xgboost

In [242]:
with open("test.csv") as file:
    test_data = pd.read_csv(file)
with open("ltable.csv") as file:
    ltable = pd.read_csv(file)
with open("rtable.csv") as file:
    rtable = pd.read_csv(file)
with open("train.csv") as file:
    train_data = pd.read_csv(file)

def preprocess_lrtables(df, prepend):
#     df = df.drop(columns = ["price"])
    df = df.fillna('')
    df['extended_title'] = df['title'] + ' '+ df['category'] + ' '+ df['brand'] + ' ' + df["modelno"]
#     df = df.drop(columns = ["category", 'brand', 'modelno'])
    c = df.columns
    new_columns = [c[0]]
    for col in c[1:]:
        new_columns.append(prepend + col)
    df.columns = new_columns
    return df

ltable, rtable = preprocess_lrtables(ltable, 'ltable_'), preprocess_lrtables(rtable, 'rtable_')

In [243]:
test_data.head()

Unnamed: 0,ltable_id,rtable_id,id
0,1853,2139,0
1,1718,11835,8
2,1624,5013,9
3,614,5969,10
4,1512,19684,15


In [244]:
def merge_lr(ltable, rtable, pairs, has_labels):
    temp = pairs.merge(ltable, on=['ltable_id'], how='outer')
    lhalf = temp.dropna(subset=['id'])
    temp = lhalf.merge(rtable, on=['rtable_id'], how='outer')
    full = temp.dropna(subset=['id'])
    if has_labels:
        full = full.astype({"ltable_id": int, "rtable_id": int,"label": int, "id": int})
    else:
        full = full.astype({"ltable_id": int, "rtable_id": int, "id": int})
    return full


full = merge_lr(ltable, rtable, train_data, True)
full.head()

Unnamed: 0,ltable_id,rtable_id,label,id,ltable_title,ltable_category,ltable_brand,ltable_modelno,ltable_price,ltable_extended_title,rtable_title,rtable_category,rtable_brand,rtable_modelno,rtable_price,rtable_extended_title
0,621,3167,0,1,zotac geforce gt430 1gb ddr3 pci-express 2.0 graphics card,electronics - general,zotac,zt-40604-10l,88.88,zotac geforce gt430 1gb ddr3 pci-express 2.0 graphics card electronics - general zotac zt-40604-10l,evga geforce gts450 superclocked 1 gb gddr5 pci-express 2.0 graphics card 01g-p3-1452-tr,graphics cards,evga,01g-p3-1452-tr,119.88,evga geforce gts450 superclocked 1 gb gddr5 pci-express 2.0 graphics card 01g-p3-1452-tr graphic...
1,1893,3167,0,3477,evga geforce 8400 gs 1gb ddr3 pci express 2.0 graphics card,electronics - general,evga,01g-p3-1302-lr,44.99,evga geforce 8400 gs 1gb ddr3 pci express 2.0 graphics card electronics - general evga 01g-p3-13...,evga geforce gts450 superclocked 1 gb gddr5 pci-express 2.0 graphics card 01g-p3-1452-tr,graphics cards,evga,01g-p3-1452-tr,119.88,evga geforce gts450 superclocked 1 gb gddr5 pci-express 2.0 graphics card 01g-p3-1452-tr graphic...
2,621,13148,0,194,zotac geforce gt430 1gb ddr3 pci-express 2.0 graphics card,electronics - general,zotac,zt-40604-10l,88.88,zotac geforce gt430 1gb ddr3 pci-express 2.0 graphics card electronics - general zotac zt-40604-10l,evga geforce 210 512 mb ddr3 pci-express 2.0 graphics card video card 512-p3-1310-lr,graphics cards,evga,512-p3-1310-lr,34.88,evga geforce 210 512 mb ddr3 pci-express 2.0 graphics card video card 512-p3-1310-lr graphics ca...
3,621,698,0,2260,zotac geforce gt430 1gb ddr3 pci-express 2.0 graphics card,electronics - general,zotac,zt-40604-10l,88.88,zotac geforce gt430 1gb ddr3 pci-express 2.0 graphics card electronics - general zotac zt-40604-10l,msi geforce 220gt 1024 mb ddr3 pci-express 2.0 graphics card md1gd3 lp,computers accessories,msi,n220gt-md1gd3 / lp,79.21,msi geforce 220gt 1024 mb ddr3 pci-express 2.0 graphics card md1gd3 lp computers accessories msi...
4,621,7701,0,3329,zotac geforce gt430 1gb ddr3 pci-express 2.0 graphics card,electronics - general,zotac,zt-40604-10l,88.88,zotac geforce gt430 1gb ddr3 pci-express 2.0 graphics card electronics - general zotac zt-40604-10l,pny geforce 9600gt 512 mb ddr3 pci-express graphics card vcg96512gxpb,,pny,vcg96512gxpb,,pny geforce 9600gt 512 mb ddr3 pci-express graphics card vcg96512gxpb pny vcg96512gxpb


In [245]:
def process_merged(merged_table, has_labels):
    def ratio(x):
        return fuzz.ratio(x[0], x[1])

    def partial_ratio(x):
        return fuzz.partial_ratio(x[0], x[1])

    def sort_ratio(x):
        return fuzz.token_sort_ratio(x[0], x[1])

    diff = lambda x: (x[0] - x[1])**2
#     print(merged_table.shape)
    merged_table['title_rat'] = merged_table[['ltable_title', 'rtable_title']].apply(ratio, axis=1)
    merged_table['title_partial_rat'] = merged_table[['ltable_title', 'rtable_title']].apply(partial_ratio, axis=1)
    merged_table['sort'] = merged_table[['ltable_title', 'rtable_title']].apply(sort_ratio, axis=1)
    merged_table['ex_sort'] = merged_table[['ltable_extended_title', 'rtable_extended_title']].apply(sort_ratio, axis=1)
    merged_table['ex_title_rat'] = merged_table[['ltable_extended_title', 'rtable_extended_title']].apply(ratio, axis=1)
    merged_table['ex_title_partial_rat'] = merged_table[['ltable_extended_title', 'rtable_extended_title']].apply(partial_ratio, axis=1)
    merged_table['modelno'] = merged_table[['ltable_modelno', 'rtable_modelno']].apply(ratio, axis=1)
    merged_table['brand'] = merged_table[['ltable_brand', 'rtable_brand']].apply(partial_ratio, axis=1)
    # merged_table['price'] = merged_table[['ltable_price', 'rtable_price']].apply(diff, axis=1)
    if has_labels:
        return merged_table[['id','label', 'title_rat', 'title_partial_rat', 'sort','ex_title_rat','ex_title_partial_rat','ex_sort', 'modelno', 'brand']]
    else:
        return merged_table[['id', 'title_rat', 'title_partial_rat', 'sort','ex_title_rat','ex_title_partial_rat','ex_sort', 'modelno', 'brand']]

final = process_merged(full, True)
final.head()

Unnamed: 0,id,label,title_rat,title_partial_rat,sort,ex_title_rat,ex_title_partial_rat,ex_sort,modelno,brand
0,1,0,71,74,67,59,62,59,23,25
1,3477,0,73,71,67,71,65,65,79,100
2,194,0,69,84,68,61,63,56,38,25
3,2260,0,77,84,73,60,62,50,27,0
4,3329,0,71,78,69,58,60,54,17,0


In [246]:
def entire_preprocess(ltable_path, rtable_path, pairs_path, has_labels):
    ## Read files
    print('Opening...', end='\t\t')
    with open(pairs_path) as file:
        pairs = pd.read_csv(file)
    with open(ltable_path) as file:
        ltable = pd.read_csv(file)
    with open(rtable_path) as file:
        rtable = pd.read_csv(file)
    print('Finished')
    
    ## preprocess the L and R tables before dealing with them
    print('Process and merging...', end='\t\t')    
    ltable, rtable = preprocess_lrtables(ltable, 'ltable_'), preprocess_lrtables(rtable, 'rtable_')
    # merge the tables together with the pair information
    full = merge_lr(ltable, rtable, pairs, has_labels)
    print('Finished')
    print('Merged shape: '+repr(full.shape))
    
    
    print('Creating meaningful data...', end='\t\t')
    # process the full table to create meaningful data
    final = process_merged(full, has_labels)
    print('Finished')
    print('Final shape: '+repr(final.shape))
    
    return final

In [247]:
train_args = ['ltable.csv', 'rtable.csv', 'train.csv', True]
test_args = ['ltable.csv', 'rtable.csv', 'test.csv', False]

In [248]:
# import training data
training_data = entire_preprocess(*train_args)

# split training data between vector and labels
all_y = training_data.iloc[:,1]
all_x = training_data.iloc[:,2:]

# # use 20% of training data to test accuracy of training
train_x, test_x, train_y, test_y = train_test_split(all_x,all_y,test_size=.2)

Opening...		Finished
Process and merging...		Finished
Merged shape: (7149, 16)
Creating meaningful data...		Finished
Final shape: (7149, 10)


In [249]:
print(all_x.columns)

Index(['title_rat', 'title_partial_rat', 'sort', 'ex_title_rat',
       'ex_title_partial_rat', 'ex_sort', 'modelno', 'brand'],
      dtype='object')


In [250]:
# train model
xgb =  XGBClassifier()
xgb.fit(train_x, train_y)
print ('Accuracy XGBoost: ' + str(accuracy_score(test_y, xgb.predict(test_x))))

Accuracy XGBoost: 0.9538461538461539


  if diff:


In [251]:
# import test data
test_data = entire_preprocess(*test_args)
test_data.head()
x_vals = test_data.iloc[:,1:]
ids
x_vals.head()

# make predictions
predictions = xgb.predict(x_vals)
ids = test_data.iloc[:,0]
submit = pd.DataFrame({'id':ids, 'label':predictions})
submit.to_csv('submission.csv', index=False)
submit.head(10)

Opening...		Finished
Process and merging...		Finished
Merged shape: (3093, 15)
Creating meaningful data...		Finished
Final shape: (3093, 9)


  if diff:


Unnamed: 0,id,label
0,0,0
1,6101,0
2,1187,0
3,7222,0
4,5506,0
5,10237,0
6,8442,0
7,3558,0
8,8,0
9,6363,0


In [252]:
submit.shape

(3093, 2)

In [253]:
final.shape

(7149, 10)

In [254]:
test_data.shape

(3093, 9)