In [1]:
import pandas as pd
import numpy as np
import nltk
from collections import Counter
from nltk.corpus import stopwords
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.optimize import minimize
stops = set(stopwords.words("english"))
import xgboost as xgb
from sklearn.cross_validation import train_test_split
import multiprocessing
import difflib

# Load the raw train/test CSVs from ../input.
train = pd.read_csv('../input/train.csv')  # append [:1000] to work on a small sample
test = pd.read_csv('../input/test.csv')  # append [:1000] to work on a small sample



In [2]:
def tokenize(data):
    """Lower-case ``data`` and split it into word tokens with NLTK.

    Parameters
    ----------
    data : object
        One cell of a question column. Non-string values are converted with
        ``str`` before tokenizing.

    Returns
    -------
    list of str
        Tokens from ``nltk.word_tokenize``; an empty list when ``data`` is a
        missing value (``None`` / ``NaN``).
    """
    # A missing question would otherwise be stringified to the literal word
    # "nan" and flow through tagging/lemmatization as if it were real text.
    if pd.api.types.is_scalar(data) and pd.isnull(data):
        return []
    return nltk.word_tokenize(str(data).lower())

In [3]:
# Tokenize both question columns of each frame, train first, then test.
for frame in (train, test):
    for source_col, target_col in (('question1', 'q1_tokenized'),
                                   ('question2', 'q2_tokenized')):
        frame[target_col] = frame[source_col].apply(tokenize)

In [4]:
# Part-of-speech tag every tokenized question.
# nltk.pos_tag sets up its tagger on every call, so calling it once per row
# repeats that setup for every question. nltk.pos_tag_sents sets the tagger up
# once for the whole column and tags each sentence the same way.
for frame in (train, test):
    for source_col, target_col in (('q1_tokenized', 'q1_pos_tagged'),
                                   ('q2_tokenized', 'q2_pos_tagged')):
        tagged = nltk.pos_tag_sents(frame[source_col])
        # Re-attach the frame's own index so rows line up on assignment
        # even when the index is not the default 0..n-1.
        frame[target_col] = pd.Series(tagged, index=frame.index)

In [5]:
from nltk import stem
from nltk.corpus import wordnet

# Single WordNetLemmatizer instance; process_word_pos calls its lemmatize().
lemmatizer = stem.WordNetLemmatizer()

def get_wordnet_pos(pos):
    """Translate a POS tag into the matching ``wordnet`` POS constant.

    Only the first character of ``pos`` is inspected:
    ``J`` -> ``wordnet.ADJ``, ``V`` -> ``wordnet.VERB``,
    ``N`` -> ``wordnet.NOUN``, ``R`` -> ``wordnet.ADV``.

    Returns an empty string for any other tag (including an empty one), which
    callers can treat as "no WordNet equivalent".
    """
    # Map the leading letter to an attribute name and resolve it on demand.
    constant_by_initial = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    constant_name = constant_by_initial.get(pos[:1])
    if constant_name is None:
        return ''
    return getattr(wordnet, constant_name)

def process_word_pos(word, pos):
    """Lemmatize ``word`` according to its POS tag ``pos``.

    The tag is first converted with ``get_wordnet_pos``. When that yields a
    WordNet POS, the shared ``lemmatizer`` is applied with it; otherwise
    ``word`` is returned unchanged.
    """
    wordnet_pos = get_wordnet_pos(pos)
    return lemmatizer.lemmatize(word, wordnet_pos) if wordnet_pos else word

def lemmatize(pos_tagged):
    """Lemmatize a sequence of ``(word, pos)`` pairs.

    Each pair is passed through ``process_word_pos``; the results are returned
    as a list in the same order as the input.
    """
    lemmas = []
    for token, tag in pos_tagged:
        lemmas.append(process_word_pos(token, tag))
    return lemmas

In [6]:
# Lemmatize the POS-tagged questions of each frame, train first, then test.
for frame in (train, test):
    for source_col, target_col in (('q1_pos_tagged', 'q1_lemmatized'),
                                   ('q2_pos_tagged', 'q2_lemmatized')):
        frame[target_col] = frame[source_col].apply(lemmatize)

In [7]:
# Names of the derived columns, grouped by processing stage (q1 then q2
# within each stage); this list is what gets passed to util.save_feature.
features = [
    side + suffix
    for suffix in ('_tokenized', '_pos_tagged', '_lemmatized')
    for side in ('q1', 'q2')
]

In [8]:
import util
# Hand each split to util.save_feature together with its name, the derived
# column list, and what is presumably the name of its id column (verify
# against util.save_feature).
for frame, split_name, id_column in ((train, 'train', 'id'),
                                     (test, 'test', 'test_id')):
    util.save_feature(frame, split_name, features, id_column)