In [1]:
%pylab inline
import timeit
import pandas as pd
import re
import json
import csv
import numpy
import itertools

from keras.preprocessing.text import Tokenizer
from sklearn.metrics.pairwise import cosine_similarity

from __future__ import print_function

from sklearn import linear_model
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score,log_loss



Populating the interactive namespace from numpy and matplotlib


Using TensorFlow backend.


In [2]:
def get_tokenizer(data):
    """Fit a word-level Keras ``Tokenizer`` on both cleaned question columns.

    The texts of ``wordlist_1_clean`` and ``wordlist_2_clean`` are stripped of
    non-ASCII characters and used to fit a lower-casing, word-level tokenizer.
    The resulting vocabulary is written, one word per line, to
    ``wordlist_clean_tfidf.txt`` in the working directory and its size is
    printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text columns ``wordlist_1_clean`` and
        ``wordlist_2_clean``.

    Returns
    -------
    Tokenizer
        The fitted tokenizer.
    """
    tokenizer = Tokenizer(lower=True, char_level=False)

    question_list = data.loc[:, 'wordlist_1_clean'].values.tolist()
    question_list += data.loc[:, 'wordlist_2_clean'].values.tolist()

    # Drop every character that cannot be represented in ASCII before fitting.
    question_list = [i.encode('ascii', 'ignore') for i in question_list]
    tokenizer.fit_on_texts(question_list)

    word_list = tokenizer.word_index.keys()
    # The context manager closes the file even when a write raises.
    with open('wordlist_clean_tfidf.txt', 'w') as f:
        for item in word_list:
            f.write("%s\n" % item)

    print(len(word_list))

    return tokenizer


def write_sequence(data,tokenizer,question_list):
    """Save the TF-IDF matrix of one cleaned question column as CSV.

    ``question_list`` is the string that selects the column
    ``wordlist_<question_list>_clean`` and names the output file
    ``../input/data_q<question_list>_clean_tfidf.csv``. Each text is stripped
    of non-ASCII characters, converted with ``tokenizer.texts_to_matrix`` in
    ``tfidf`` mode and written with six-digit exponent formatting.
    """
    column = 'wordlist_' + question_list + '_clean'
    target = '../input/data_q' + question_list + '_clean_tfidf.csv'

    texts = []
    for raw in data.loc[:, column].values.tolist():
        texts.append(raw.encode('ascii', 'ignore'))

    tfidf_rows = tokenizer.texts_to_matrix(texts, mode="tfidf")
    # The encoded texts are not needed once the matrix exists; release them.
    del texts
    print('finished sequencing')
    numpy.savetxt(target, tfidf_rows, delimiter=",", fmt='%.6e')


def get_tfidf(data):
    """Compute the TF-IDF cosine similarity of every question pair and save it.

    Steps:

    1. Fit a tokenizer on both cleaned question columns (``get_tokenizer``).
    2. Write the TF-IDF matrix of each question column to
       ``../input/data_q1_clean_tfidf.csv`` and
       ``../input/data_q2_clean_tfidf.csv`` (``write_sequence``).
    3. Read both files back in lockstep, one row at a time, and compute the
       cosine similarity of each row pair.
    4. Write ``is_duplicate`` plus the new ``cosine_tfidf`` column to
       ``../input/data_clean_tfidf.csv``.

    Elapsed times and a progress counter are printed along the way.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding ``wordlist_1_clean``, ``wordlist_2_clean`` and
        ``is_duplicate``. The caller's frame is not modified.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a CSV cell cannot be parsed as a float, or if the number of row
        pairs read back differs from the number of rows in ``data``.
    """
    start = timeit.default_timer()

    # Fit the tokenizer on both question columns.
    data_tokenizer = get_tokenizer(data)
    current = timeit.default_timer()
    print('Time to tokenize [s]= ', current - start)

    write_sequence(data, data_tokenizer, question_list='1')
    current = timeit.default_timer()
    print('Time to sequence q1 [s]= ', current - start)

    write_sequence(data, data_tokenizer, question_list='2')
    current = timeit.default_timer()
    print('Time to sequence q2 [s]= ', current - start)

    # Explicit copy: a column is added below, and that must neither touch the
    # caller's frame nor trigger a SettingWithCopyWarning.
    data = data.loc[:, ['is_duplicate']].copy()

    # Similarities are gathered in file order and attached in one assignment,
    # so the result does not depend on the frame having a contiguous integer
    # index.
    cosines = []

    # NOTE: binary mode for csv and itertools.izip are the Python 2 forms used
    # throughout this notebook.
    with open('../input/data_q1_clean_tfidf.csv', 'rb') as q1:
        print('q1')
        with open('../input/data_q2_clean_tfidf.csv', 'rb') as q2:
            print('q2')
            reader1 = csv.reader(q1, delimiter=",")
            reader2 = csv.reader(q2, delimiter=",")
            print('readers')
            for i, (row_1, row_2) in enumerate(itertools.izip(reader1, reader2)):
                if i % 100 == 0:
                    print(i, end=',')

                seq_1 = [float(j) for j in row_1]
                seq_2 = [float(j) for j in row_2]

                try:
                    cosine = cosine_similarity(
                        numpy.array(seq_1).reshape(1, -1),
                        numpy.array(seq_2).reshape(1, -1),
                    )[0][0]
                except ValueError:
                    # Best-effort fallback for rows that cannot be compared
                    # (e.g. non-finite values or mismatched widths). Only
                    # ValueError is caught so that interrupts and genuine
                    # programming errors still surface.
                    cosine = 0.0
                cosines.append(cosine)

    data['cosine_tfidf'] = cosines
    data.to_csv('../input/data_clean_tfidf.csv', encoding="utf-8")


In [3]:
def get_benchmark_results():
    """Load the saved cosine feature and run the logistic-regression benchmark.

    Reads ``../input/data_clean_tfidf.csv``, prints its column names and first
    row, keeps the ``cosine_tfidf`` feature and the ``is_duplicate`` label,
    splits them 70/30 (``random_state=42``) and passes the split to
    ``benchmark_model``.

    Returns
    -------
    None
    """
    # Peek at a single row to show which columns the file contains.
    data = pd.read_csv('../input/data_clean_tfidf.csv',
                       nrows=1)
    column_names = data.columns.tolist()
    print(column_names)
    print(data.loc[0])

    data = pd.read_csv('../input/data_clean_tfidf.csv', usecols=['cosine_tfidf', 'is_duplicate'])
    # The feature and the label feed a numeric model. Filling gaps with ''
    # would turn the columns into object columns holding strings, which the
    # classifier cannot fit, so rows with a missing value are dropped instead.
    data = data.dropna()
    # Positional access: after dropping rows, label 0 may no longer exist.
    print(data.iloc[0])
    print(len(data[['cosine_tfidf']]))

    features = data.loc[:, 'cosine_tfidf'].values.reshape(-1, 1)
    labels = data.loc[:, 'is_duplicate']
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.30, random_state=42)
    benchmark_model(X_train, X_test, y_train, y_test)

def benchmark_model(X_train, X_test, y_train, y_test):
    """Fit and report a logistic-regression benchmark.

    A class-balanced logistic regression is evaluated twice on the test split:
    once with its default ``C`` ("unoptimized") and once with ``C`` chosen by
    a grid search that minimises cross-validated log-loss ("optimized"). The
    best parameters, the tuned model's coefficient and intercept, its
    predictions for a few probe feature values, and accuracy / log-loss for
    both models are printed.

    Parameters
    ----------
    X_train, X_test : array-like of shape (n_samples, 1)
        Feature values; the probe predictions below assume a single feature.
    y_train, y_test : array-like of shape (n_samples,)
        Class labels.

    Returns
    -------
    None
    """
    clf = linear_model.LogisticRegression(random_state=42, solver='lbfgs', class_weight='balanced')

    # Candidate inverse regularisation strengths for the grid search.
    parameters = {'C': [1e-18, 1e-17, 1e-16, 1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-9, 1e-8,
                        1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10]}

    # 'neg_log_loss' is scikit-learn's built-in scorer for negated log-loss on
    # predict_proba output. It matches
    # make_scorer(log_loss, needs_proba=True, greater_is_better=False) without
    # relying on the `needs_proba` keyword, which newer releases deprecate.
    grid_obj = GridSearchCV(clf, parameters, scoring='neg_log_loss')
    grid_fit = grid_obj.fit(X_train, y_train)

    # Model refitted on the whole training split with the best C.
    best_clf = grid_fit.best_estimator_
    print(grid_fit.best_params_)

    # GridSearchCV fits clones, so `clf` itself is still unfitted here; fit it
    # once and reuse it for both kinds of prediction.
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    predictions_proba = clf.predict_proba(X_test)
    best_predictions = best_clf.predict(X_test)
    best_predictions_proba = best_clf.predict_proba(X_test)

    print(best_clf.coef_)
    print(best_clf.intercept_)
    # Probe a few feature values to see where the predicted class flips.
    probe = [0.6, 0.53, 0.52, 0.5]
    print(probe, best_clf.predict(numpy.asarray(probe).reshape(-1, 1)))

    # Report the scores before and after tuning.
    print("Unoptimized model\n------")
    print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
    print("Log-loss on testing data: {:.4f}".format(log_loss(y_test, predictions_proba)))
    print("\nOptimized Model\n------")
    print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
    print("Final Log-loss on the testing data: {:.4f}".format(log_loss(y_test, best_predictions_proba)))


In [4]:
def main(n=100):
    """Run the TF-IDF benchmark end to end on the first ``n`` rows.

    Loads ``../input/data_clean.csv``, replaces missing values with empty
    strings, keeps the first ``n`` rows, then builds the cosine feature
    (``get_tfidf``) and evaluates it (``get_benchmark_results``). The first
    row and the cumulative elapsed time after each stage are printed.
    """
    started_at = timeit.default_timer()

    def report(label):
        # Cumulative seconds since main() started, printed after the label.
        print(label, timeit.default_timer() - started_at)

    frame = pd.read_csv('../input/data_clean.csv', encoding="utf-8").fillna('')
    frame = frame.head(n=n).copy()
    print(frame.loc[frame.index[0]])
    report('Time to read data_clean.csv [s]= ')

    get_tfidf(frame)
    report('Time to get tfidf [s]= ')

    get_benchmark_results()
    report('Time to get tfidf benchmark results [s]= ')

    
# Run the whole pipeline on the first 50000 rows of ../input/data_clean.csv.
main(n=50000)

Unnamed: 0                                                              0
id                                                                      0
qid1                                                                    1
qid2                                                                    2
question1               What is the step by step guide to invest in sh...
question2               What is the step by step guide to invest in sh...
is_duplicate                                                            0
wordlist_1_clean        What is the step by step guide to invest in sh...
len_q1_clean                                                           66
wordlist_1_stopwords            step step guide invest share market india
len_q1_stopwords                                                       41
wordlist_1_stem                  step step guid invest share market india
wordlist_2_clean        What is the step by step guide to invest in sh...
len_q2_clean                          