In [1]:
# Standard library
import difflib
import multiprocessing
from collections import Counter

# Third-party
import nltk
import numpy as np
import pandas as pd
import xgboost as xgb
from nltk import PorterStemmer
from nltk import SnowballStemmer
from nltk.corpus import stopwords
from scipy.optimize import minimize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import log_loss

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20, so importing it unconditionally breaks this cell on current installs.
# Prefer model_selection and fall back only for very old environments.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# English stop words held in a set so the membership test in stem() is cheap.
stops = set(stopwords.words("english"))

# train = pd.read_csv('../input/train.csv')#[:1000]
# test = pd.read_csv('../input/test.csv')#[:1000]



In [2]:
# Load the train/test pickles; later cells read their clean_q1 / clean_q2
# columns. The file names suggest an upstream cleaning step wrote them —
# TODO confirm which notebook/script produces these files.
# NOTE(review): read_pickle can execute arbitrary code while unpickling, so
# only point these paths at files generated by this project.
train = pd.read_pickle('../feature/train_clean.pkl')  # append [:100] to subsample while debugging
test = pd.read_pickle('../feature/test_clean.pkl')  # append [:100] to subsample while debugging

In [3]:
def tokenize(data):
    """Lower-case ``data`` and split it into tokens with nltk.word_tokenize.

    ``data`` must provide ``.lower()`` (i.e. behave like a string); the
    return value is whatever ``nltk.word_tokenize`` yields for the
    lower-cased text.
    """
    lowered = data.lower()
    return nltk.word_tokenize(lowered)

In [4]:
# Module-level stemmer shared by every stem() call.
stemmer = SnowballStemmer('english')
# Alternative kept for experimentation: stemmer = PorterStemmer()
def stem(data):
    """Return the stems of the tokens in ``data``, skipping stop words.

    ``data`` is an iterable of tokens. Tokens found in the module-level
    ``stops`` set are dropped entirely; every remaining token is passed
    through ``stemmer.stem``. Order of the surviving tokens is preserved.
    """
    stemmed = []
    for token in data:
        if token in stops:
            continue
        stemmed.append(stemmer.stem(token))
    return stemmed

In [5]:
# Tokenize both question columns of the train and test frames.
for frame in (train, test):
    for source_col, target_col in (
        ('clean_q1', 'clean_q1_tokenized'),
        ('clean_q2', 'clean_q2_tokenized'),
    ):
        frame[target_col] = frame[source_col].apply(tokenize)

In [6]:
# Stem the tokenized questions (stop words are dropped by stem()).
for frame in (train, test):
    for source_col, target_col in (
        ('clean_q1_tokenized', 'clean_q1_stem'),
        ('clean_q2_tokenized', 'clean_q2_stem'),
    ):
        frame[target_col] = frame[source_col].apply(stem)

In [7]:
# Part-of-speech tag every token list; each cell becomes the output of
# nltk.pos_tag for that question's tokens.
for frame in (train, test):
    for source_col, target_col in (
        ('clean_q1_tokenized', 'clean_q1_pos_tagged'),
        ('clean_q2_tokenized', 'clean_q2_pos_tagged'),
    ):
        frame[target_col] = frame[source_col].apply(nltk.pos_tag)

In [8]:
# WordNet part-of-speech constants (ADJ / VERB / NOUN / ADV) used by
# get_wordnet_pos.
from nltk.corpus import wordnet

# Single lemmatizer instance reused by process_word_pos for every token.
lemmatizer = nltk.stem.WordNetLemmatizer()

def get_wordnet_pos(pos):
    """Translate a POS tag into the matching WordNet POS constant.

    The tag is matched on its first letter: ``J`` -> ``wordnet.ADJ``,
    ``V`` -> ``wordnet.VERB``, ``N`` -> ``wordnet.NOUN``,
    ``R`` -> ``wordnet.ADV``. Any other tag yields ``''`` so callers can
    test the result for truthiness.
    (Presumably the tags are the ones emitted by nltk.pos_tag earlier in
    this notebook — confirm if another tagger is ever used.)
    """
    # Prefixes are checked in order; the wordnet attribute is only looked up
    # once a prefix actually matches.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if pos.startswith(prefix):
            return getattr(wordnet, attr_name)
    return ''

def process_word_pos(word, pos):
    """Lemmatize ``word`` according to its POS tag ``pos``.

    When ``get_wordnet_pos`` has no WordNet equivalent for the tag (it
    returns an empty string), the word is returned unchanged; otherwise the
    shared ``lemmatizer`` is applied with the mapped WordNet POS.
    """
    wordnet_pos = get_wordnet_pos(pos)
    if not wordnet_pos:
        return word
    return lemmatizer.lemmatize(word, wordnet_pos)

def lemmatize(pos_tagged):
    """Lemmatize a sequence of ``(word, pos)`` pairs.

    Each pair is handed to ``process_word_pos``; the results are returned
    as a list in the original order.
    """
    lemmas = []
    for token, tag in pos_tagged:
        lemmas.append(process_word_pos(token, tag))
    return lemmas

In [9]:
# Lemmatize the POS-tagged questions of both frames.
for frame in (train, test):
    for source_col, target_col in (
        ('clean_q1_pos_tagged', 'clean_q1_lemmatized'),
        ('clean_q2_pos_tagged', 'clean_q2_lemmatized'),
    ):
        frame[target_col] = frame[source_col].apply(lemmatize)

In [10]:
train['clean_q1_lemmatized_stem'] = train.clean_q1_lemmatized.apply(stem)
train['clean_q2_lemmatized_stem'] = train.clean_q2_lemmatized.apply(stem)

test['clean_q1_lemmatized_stem'] = test.clean_q1_lemmatized.apply(stem)
test['clean_q2_lemmatized_stem'] = test.clean_q2_lemmatized.apply(stem)

In [11]:
# Columns created by the cells above, listed as q1/q2 pairs per processing
# stage. This list is what gets passed to util.save_feature.
features = (
    ['clean_q1_tokenized', 'clean_q2_tokenized']
    + ['clean_q1_stem', 'clean_q2_stem']
    + ['clean_q1_pos_tagged', 'clean_q2_pos_tagged']
    + ['clean_q1_lemmatized', 'clean_q2_lemmatized']
    + ['clean_q1_lemmatized_stem', 'clean_q2_lemmatized_stem']
)

In [12]:
import util

# Hand the generated columns of each frame to the project's util.save_feature,
# together with a name prefix and that frame's id column.
# NOTE(review): presumably this persists the listed columns to disk keyed by
# the id column — confirm against util.save_feature.
for frame, prefix, id_column in (
    (train, 'train', 'id'),
    (test, 'test', 'test_id'),
):
    util.save_feature(frame, prefix, features, id_column)