In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

In [2]:
train = pd.read_csv('train.csv',encoding = "gbk")
print(train.shape)
test = pd.read_csv('test.csv',encoding = "gbk")
print(test.shape)

(49972, 4)
(25413, 4)


In [3]:
X_train = train.loc[:,['Headline','Body']].values
Y_train = train.Stance.values
X_test = test.loc[:,['Headline','Body']].values
Y_test = test.Stance.values
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(49972, 2)
(49972,)
(25413, 2)
(25413,)


In [4]:
def cos(vect1,vect2):  
    dot_product = 0.0;  
    norma = 0.0;  
    normb = 0.0;  
    for a,b in zip(vect1,vect2):  
        dot_product += a*b  
        norma += a**2  
        normb += b**2  
    if norma == 0.0 or norma==0.0:  
        return 0  
    else:  
        return dot_product / ((norma*normb)**0.5) 

In [5]:
def get_cos(X):
    stop_word = list(ENGLISH_STOP_WORDS)
    vectorizer=CountVectorizer()
    transformer=TfidfTransformer()
    Y_weight = []
    for row in range(len(X)):
        if row%5000 == 1:
            print('calculating Tf-Idf and Cosine similarity.... current:',row)
        for col in [0,1]:
            X[row][col] = ' '.join([word for word in X[row][col].split() if word not in stop_word])
        tfidf=transformer.fit_transform(vectorizer.fit_transform(X[row]))
        word=vectorizer.get_feature_names()
        weight=tfidf.toarray()
        Y_weight.append(cos(weight[0],weight[1]))
    print('completed!')
    return Y_weight

In [6]:
def predict(para,X):
    Y_weight = get_cos(X)
    Y_predict = []
    for i in range(len(Y_weight)):
        if Y_weight[i] > para:
            Y_predict.append('related')
        else:
            Y_predict.append('unrelated')
    return Y_predict

In [7]:
def fit(X_train):
    Y_train_weight = get_cos(X_train)
    best_w = 0
    best_accuracy = 0
    best_t = 0
    best_f = 0
    for w in np.arange(0.01,0.3,0.001):
        T = 0
        F = 0
        for i in range(len(Y_train_weight)):
            if Y_train_weight[i]>w and Y_train[i] != 'unrelated' or Y_train_weight[i]<w and Y_train[i] == 'unrelated':
                T += 1
            else:
                F += 1
        accuracy = T/(T+F)
        if accuracy>best_accuracy:
            best_w = w
            best_accuracy = accuracy
            best_t = T
            best_f = F
    return best_w

In [8]:
def accuracy(Y_predict, Y):
    TN = 0
    TP = 0
    FP = 0
    FN = 0
    for i in range(len(Y)):
        if Y[i] == 'unrelated' and Y_predict[i] == 'unrelated':
            TN += 1
        if Y[i] != 'unrelated' and Y_predict[i] == 'unrelated':
            FN += 1
        if Y[i] == 'unrelated' and Y_predict[i] != 'unrelated':
            FP += 1
        if Y[i] != 'unrelated' and Y_predict[i] != 'unrelated':
            TP += 1
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    accuracy = (TP+TN)/len(Y)
    print ('precision:',precision)
    print('recall:',recall)
    print('total accuracy:',accuracy)
    return precision,recall,accuracy

In [9]:
para = fit(X_train)

('calculating Tf-Idf and Cosine similarity.... current:', 1)
('calculating Tf-Idf and Cosine similarity.... current:', 5001)
('calculating Tf-Idf and Cosine similarity.... current:', 10001)
('calculating Tf-Idf and Cosine similarity.... current:', 15001)
('calculating Tf-Idf and Cosine similarity.... current:', 20001)
('calculating Tf-Idf and Cosine similarity.... current:', 25001)
('calculating Tf-Idf and Cosine similarity.... current:', 30001)
('calculating Tf-Idf and Cosine similarity.... current:', 35001)
('calculating Tf-Idf and Cosine similarity.... current:', 40001)
('calculating Tf-Idf and Cosine similarity.... current:', 45001)
completed!


In [12]:
print(para)
para

0


0

In [21]:
Y_test_predict = predict(para,X_test)

calculating Tf-Idf and Cosine similarity.... current: 1
calculating Tf-Idf and Cosine similarity.... current: 5001
calculating Tf-Idf and Cosine similarity.... current: 10001
calculating Tf-Idf and Cosine similarity.... current: 15001
calculating Tf-Idf and Cosine similarity.... current: 20001
calculating Tf-Idf and Cosine similarity.... current: 25001
completed!


In [22]:
accuracy(Y_test_predict,Y_test)

precision: 0.9351865840761444
recall: 0.8762740656851642
total accuracy: 0.9487270294731043


(0.9351865840761444, 0.8762740656851642, 0.9487270294731043)