In [1]:
# Appending the fuzzywuzzy features here for the test dataset, along with the basic features

In [2]:
import numpy as np
import pandas as pd
import sklearn
%matplotlib inline
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz
from joblib import Parallel, delayed
import multiprocessing
import itertools



## Test Data Features : Basic + FuzzyWuzzy

In [3]:
test_text_data = pd.read_csv('test_text_data.csv',sep=',')

In [4]:
test_text_data.count()

test_id           2345796
question1_form    2333113
question2_form    2333117
dtype: int64

In [5]:
test_text_data.question1_form[(test_text_data['question1_form'].isnull())]=''
test_text_data.question2_form[(test_text_data['question2_form'].isnull())]=''

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [6]:
test_text_data.count()

test_id           2345796
question1_form    2345796
question2_form    2345796
dtype: int64

In [7]:
# Loading Test Dataset files with basic features

In [8]:
test_data_cols1 = pd.read_csv('test_data_cols1.csv', sep=',')
test_data_cols2 = pd.read_csv('test_data_cols2.csv', sep=',', skiprows=1, header=None)

# Creating DF with features and duplicate variable only for final model
def append_df(df1, df2):
    # df1 with column names, and df2 without
    df2.columns = df1.columns
    df1 = df1.append(df2, ignore_index=True)
    return df1

test_data_cols1 = append_df(test_data_cols1,test_data_cols2) 

In [9]:
# Making a check on the values
test_data_cols1.count()

test_id            2345796
q1_form_len        2345796
q2_form_len        2345796
q1_length          2345796
q2_length          2345796
q1_unique          2345796
q2_unique          2345796
q1_form_uni        2345796
q2_form_uni        2345796
q1_form_char       2345796
q2_form_char       2345796
q1_q2_char_diff    2345796
common_cnt         2345796
prcnt_common       2345379
dtype: int64

In [10]:
# Imputing if any :
test_data_cols1['prcnt_common'].fillna(test_data_cols1['prcnt_common'].mean(), inplace=True)

In [11]:
# Merging both the test datasets and making final file
test_data = pd.merge(test_text_data, test_data_cols1, left_on=['test_id'],right_on=['test_id'],how='inner')
del test_text_data, test_data_cols1

In [12]:
# Fuzzywuzz features :
def fuzzy_ratios(input_df):
    input_df['partial_ratio'] = input_df.apply(lambda row: fuzz.partial_ratio(row[u'question1_form'], row[u'question2_form']), axis=1)
    input_df['partial_token_set_ratio'] = input_df.apply(lambda row: fuzz.partial_token_set_ratio(row[u'question1_form'], row[u'question2_form']), axis=1)
    input_df['partial_token_sort_ratio'] = input_df.apply(lambda row: fuzz.partial_token_sort_ratio(row[u'question1_form'], row[u'question2_form']), axis=1)
    input_df['q_ratio'] = input_df.apply(lambda row: fuzz.QRatio(row[u'question1_form'], row[u'question2_form']), axis=1)
    input_df['token_set_ratio'] = input_df.apply(lambda row: fuzz.token_set_ratio(row[u'question1_form'], row[u'question2_form']), axis=1)
    input_df['token_sort_ratio'] = input_df.apply(lambda row: fuzz.token_sort_ratio(row[u'question1_form'], row[u'question2_form']), axis=1)
    input_df['w_ratio'] = input_df.apply(lambda row: fuzz.WRatio(row[u'question1_form'], row[u'question2_form']), axis=1)
    return input_df

In [13]:
test_data = fuzzy_ratios(test_data)

In [15]:
test_data.columns

Index([u'test_id', u'question1_form', u'question2_form', u'q1_form_len',
       u'q2_form_len', u'q1_length', u'q2_length', u'q1_unique', u'q2_unique',
       u'q1_form_uni', u'q2_form_uni', u'q1_form_char', u'q2_form_char',
       u'q1_q2_char_diff', u'common_cnt', u'prcnt_common', u'partial_ratio',
       u'partial_token_set_ratio', u'partial_token_sort_ratio', u'q_ratio',
       u'token_set_ratio', u'token_sort_ratio', u'w_ratio'],
      dtype='object')

In [14]:
test_data.to_csv('test_basic_fuzzy.csv', index=False, sep=',')

In [17]:
for i in [1,2,3,4,5,6]:
    vars()['test_basic_fuzzy'+str(i)] = test_data.iloc[((test_data.shape[0]/6)*(i-1)):((test_data.shape[0]/6)*i)]
    vars()['test_basic_fuzzy'+str(i)].to_csv('test_basic_fuzzy'+str(i)+'.csv', sep=',', index=False)
    del vars()['test_basic_fuzzy'+str(i)]

In [11]:
# Not Used : For Multiprocessing
lst = range(1,13)
id_col_grp = list(itertools.chain.from_iterable(itertools.repeat(x, test_data.shape[0]/12) for x in lst))
test_data['id_col_grp']=id_col_grp

# Fuzzywuzz features :
def fuzzy_ratios1(input_df):
    input_df['partial_ratio'] = input_df.apply(lambda row: fuzz.partial_ratio(row[u'question1_form'], row[u'question2_form']), axis=1)
    input_df['partial_token_set_ratio'] = input_df.apply(lambda row: fuzz.partial_token_set_ratio(row[u'question1_form'], row[u'question2_form']), axis=1)
    input_df['partial_token_sort_ratio'] = input_df.apply(lambda row: fuzz.partial_token_sort_ratio(row[u'question1_form'], row[u'question2_form']), axis=1)
    input_df['q_ratio'] = input_df.apply(lambda row: fuzz.QRatio(row[u'question1_form'], row[u'question2_form']), axis=1)
    input_df['token_set_ratio'] = input_df.apply(lambda row: fuzz.token_set_ratio(row[u'question1_form'], row[u'question2_form']), axis=1)
    input_df['token_sort_ratio'] = input_df.apply(lambda row: fuzz.token_sort_ratio(row[u'question1_form'], row[u'question2_form']), axis=1)
    input_df['w_ratio'] = input_df.apply(lambda row: fuzz.WRatio(row[u'question1_form'], row[u'question2_form']), axis=1)
    
    grp_name = list(set(input_df['id_col_grp']))[0]
    grp_filename = 'C:/Users/Palash Goyal/Downloads/Quora_Duplicate/home/test_basic_fuzzy%d.csv'%(grp_name)                        # file name
    print "records for id_col_grp %s stored at %s"%( grp_name,grp_filename)       # printing the entries
    del input_df['id_col_grp']                                          # This will remove the column of id_col_grp
    input_df.to_csv(grp_filename, index=False, sep=',')
    return

def applyParallel(dfGrouped, func):
    retLst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(group) for name, group in dfGrouped)
    return #pd.concat(retLst)

applyParallel(test_data.groupby('id_col_grp'), fuzzy_ratios1)

In [12]:
# Saving only the basic and fuzzy features, along with the ID
# test_basic_fuzzy = pd.merge(test_data, fuzzy_feats, left_on=['test_id','question1_form','question2_form'],
#                     right_on=['test_id','question1_form','question2_form'],how='inner')