In [36]:
import pandas as pd
import numpy as np
import re

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

print(len(train))
print(len(test))

404290
2345796


In [37]:
train = train[:50000]
test = test[:5000]

In [38]:
label = train.is_duplicate

In [39]:
from sklearn.cross_validation import train_test_split 

train, test, label_train, label_test = train_test_split(train, label, random_state = 0, test_size=0.1)

print(len(train), len(test))   
print(len(label_train), len(label_test)) 

45000 5000
45000 5000


In [40]:
def clean(text):
    r = re.sub('[-=.#/:$)(){}]', '', text)
    return r.lower() #소문자로 바꿔서 리턴

In [41]:
question1 = list(map(clean,train["question1"]))
question2 = list(map(clean,train["question2"]))
test_question1 = list(map(clean,test["question1"]))
test_question2 = list(map(clean,test["question2"]))

In [42]:
#nltk stopwords 에러떠서 일단 안함
import nltk
from nltk.corpus import stopwords
# nltk.download("stopwords")
stopwords = set(stopwords.words('english'))

def remove_stopwords(text):
    text = [t for t in text.split() if t not in stopwords]
    return ' '.join(text)

In [43]:
question1 = list(map(remove_stopwords, question1))
question2 = list(map(remove_stopwords, question2))
test_question1 = list(map(remove_stopwords, test_question1))
test_question2 = list(map(remove_stopwords, test_question2))

In [44]:
train["question1"] = question1
train["question2"] = question2
test["question1"] = test_question1
test["question2"] = test_question2

In [45]:
train_qs = pd.Series(train['question1'].tolist() + train['question2'].tolist()).astype(str)
test_qs = pd.Series(test['question1'].tolist() + test['question2'].tolist()).astype(str)

In [5]:
#stemmer = nltk.stem.PorterStemmer()
#words = [stemmer.stem(w) for w in words]

In [46]:
def word_match_share(row):
    q1words = {}
    q2words = {}
    for word in str(row['question1']).split():
        q1words[word] = 1
    for word in str(row['question2']).split():
        q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R

train_word_match = train.apply(word_match_share, axis=1, raw=True)

In [47]:
from collections import Counter

# 단어가 자주 등장할 수록 작은 가중치 
def get_weight(count, eps=10000, min_count=2):
    if count < min_count:
        return 0
    else:
        return 1 / (count + eps)

eps = 5000 
words = (" ".join(train_qs)).split()
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

In [48]:
print("단어 수 :", len(counts))

단어 수 : 53266


In [49]:
word = counts.keys()

In [50]:
# tf-idf 가중치
def tfidf_word_match_share(row):
    q1words = {}
    q2words = {}
    for word in str(row['question1']).split():
        q1words[word] = 1
    for word in str(row['question2']).lower().split():
        q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

tfidf_train_word_match = train.apply(tfidf_word_match_share, axis=1, raw=True)

  from ipykernel import kernelapp as app


In [51]:
from gensim.models import Word2Vec
#model = Word2Vec(word, size=300, min_count=0, workers=4, sg=1)

In [52]:
from gensim.models.keyedvectors import KeyedVectors
#model.wv.save_word2vec_format('quora_w2v', binary=False)
model = KeyedVectors.load_word2vec_format('quora_w2v', binary=False, encoding='utf-8')

In [53]:
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

  """Entry point for launching an IPython kernel.
  """Entry point for launching an IPython kernel.


In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

class TfidfEmbeddingVectorizer:
    def __init__(self, word2vec):
        self.word2vec = word2vec
        
    def transform(self, X):
        tfidf = TfidfVectorizer(analyzer = lambda x : x) 
        tfidf.fit(X)
        max_idf = max(tfidf.idf_) 
        word2weight = defaultdict(lambda : max_idf, [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()]) 
        
        array_list =[]
        for words in X:
            array_list.append(np.array(np.mean([self.word2vec[w]*word2weight[w] for w in words if w in self.word2vec] or [np.zeros(300)], axis = 0)))
        return(array_list)

In [55]:
vec_tf = TfidfEmbeddingVectorizer(w2v)
question1_tf = vec_tf.transform(question1)
question2_tf = vec_tf.transform(question2)
test_question1_tf = vec_tf.transform(test_question1)
test_question2_tf = vec_tf.transform(test_question2)

In [56]:
x_train = pd.DataFrame()
x_test = pd.DataFrame()
x_train['word_match'] = train_word_match
x_train['tfidf_word_match'] = tfidf_train_word_match
x_test['word_match'] = test.apply(word_match_share, axis=1, raw=True)
x_test['tfidf_word_match'] = test.apply(tfidf_word_match_share, axis=1, raw=True)

y_train = train['is_duplicate'].values

  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app


In [57]:
x_train = [np.mean([question1_tf[i], question2_tf[i]], axis=0) for i in range(len(question1_tf))]
x_test = [np.mean([test_question1_tf[i], test_question2_tf[i]], axis=0) for i in range(len(test_question1_tf))]

In [58]:
x_train = pd.DataFrame(x_train)
x_test = pd.DataFrame(x_test)

In [59]:
y_train = train.is_duplicate

In [60]:
pos_train = x_train[y_train == 1]
neg_train = x_train[y_train == 0]

# Now we oversample the negative class
# There is likely a much more elegant way to do this...
p = 0.165
scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
while scale > 1:
    neg_train = pd.concat([neg_train, neg_train])
    scale -=1
neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
print(len(pos_train) / (len(pos_train) + len(neg_train)))

x_train = pd.concat([pos_train, neg_train])
y_train = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()
del pos_train, neg_train

  """Entry point for launching an IPython kernel.


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match

In [30]:
import xgboost as xgb

# Set our parameters for xgboost
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 4

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 400, watchlist, early_stopping_rounds=50, verbose_eval=10)

ModuleNotFoundError: No module named 'xgboost'

In [121]:
d_test = xgb.DMatrix(x_test)
p_test = bst.predict(d_test)

sub = pd.DataFrame()
sub['test_id'] = df_test['test_id']
sub['is_duplicate'] = p_test
sub.to_csv('simple_xgb.csv', index=False)

NameError: name 'xgb' is not defined

In [None]:
for i in range(len(sub)):
    if sub.is_duplicate[i]>0.5:
        sub.is_duplicate[i] = 1
    else:
        sub.is_duplicate[i] = 0

acc = sum(sub.is_duplicate == questions_test.is_duplicate) / len(sub)
acc

In [None]:
import lightgbm as lgb

In [None]:
x_gbm = lgb.Dataset(x_train, label=y_train)

In [None]:
param = {}
    param['learning_rate'] = 0.14
    param['boosting_type'] = 'dart'
    param['objective'] = 'binary'
    param['metric'] = 'binary_logloss'
    param['sub_feature'] = 0.5
    param['num_leaves'] = 512
    param['min_data'] = 50
    param['min_hessian'] = 1

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
clf = RandomForestClassifier(max_depth=10, random_state=0)
clf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [31]:
rf_pred = clf.predict(x_test)
rf_acc = sum(rf_pred==test.is_duplicate) / len(rf_pred)
rf_acc

AttributeError: 'DataFrame' object has no attribute 'is_duplicate'

Unnamed: 0,test_id,question1,question2
0,0,surface pro 4 compare ipad pro?,microsoft choose core m3 core i3 home surface ...
1,1,hair transplant age 24? much would cost?,much cost hair transplant require?
2,2,best way send money china us?,send money china?
3,3,food emulsifiers?,foods fibre?
4,4,"""aberystwyth"" start reading?",start reading?


In [66]:
# vocab_to_int dictionary
vocab = {} 

index = 0
for word, count in counts.items():
    vocab[word] = index
    index += 1

In [108]:
counts

In [107]:
vocab # 단어 사전에 인덱스 붙이기

In [70]:
embedding_dim = 300

embedding= np.zeros((len(vocab), embedding_dim), dtype=np.float32)
for word, i in vocab.items():
    if word in w2v:
        embedding[i] = w2v[word] 
    else:
        w2v[word] = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        embedding[i] = w2v[word]

print(embedding.shape)

(56346, 300)


In [77]:
def embedding_index(text):
    idx= []
    for word in text.split():
        if word in vocab:
            idx.append(vocab[word])
        else:
            idx.append(vocab["<UNK>"])
    return idx

train["question1_idx"] = list(map(embedding_index, train["question1"]))
train["question2_idx"] = list(map(embedding_index, train["question2"]))

In [102]:
train["question1_idx"][:5]
print(len(vocab))

56346


In [105]:
from keras.preprocessing import sequence

Question1 = sequence.pad_sequences(train["question1_idx"], maxlen=10, value=len(vocab)+1, padding="post")
Question2 = sequence.pad_sequences(train["question1_idx"], maxlen=10, value=len(vocab)+1, padding="post")

In [106]:
Question1[0]

array([    0,     0,     1,     2,     3,     4,     5, 56347, 56347,
       56347], dtype=int32)