In [1]:
import pandas as pd

# Preparing data

In [5]:
fname = './news_chinese.tsv'
content = pd.read_csv(fname, encoding='utf-8', error_bad_lines=False, sep = '\t')

## Create new feature & appointing X and y

In [56]:
# weibo has title only, and we want those text included, so concatenate:
# Create is_xinhua as target
content['title_content'] = content.title.astype(str) + content.content.astype(str)
content['is_xinhua'] = content['source'] == '新华社'

X = content['title_content']
y = content['is_xinhua'].map({True:1,False:0})

## Create train and validation set

In [62]:
import re
import jieba
def cleaning_txt(txt):
    txt = re.findall(r'\w+', re.sub(r'\\n','', txt))
    tmp = []
    for sent in txt:
        tmp.extend(jieba.cut(sent))
    return tmp

In [71]:
from tqdm import tqdm_notebook

clean_content = []
for i in tqdm_notebook(range(len(X))):
    try:
        clean_content.append(' '.join(cleaning_txt(X[i])))
    except:
        print('{0} \t {1}'.format(i, sys.exc_info()[0]))
X = clean_content

HBox(children=(IntProgress(value=0, max=90538), HTML(value='')))

In [76]:
len(clean_content) == len(y)

In [82]:
from sklearn.model_selection import train_test_split

rand_seed = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, 
                                                    random_state=rand_seed, stratify=y)

# Vectorize X_train

In [83]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Fit X_train with knn

In [133]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [134]:
predicts = knn.predict(X_test)

# Measuring predicts

In [137]:
from collections import Counter
def measure(y_true, predicts):
    tp = fp = tn = fn = 0.0
    for y, predict in zip(y_true, predicts):
        if y == predict:
            if y == 1:
                tp += 1
            else:
                tn += 1
        else:
            if y == 1:
                fn += 1
            else:
                fp += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 / (1 / precision + 1 / recall)
    return [precision, recall, f1]

In [138]:
precision, recall, f1 = measure(y_test, predicts)
print('precision is {0}\nrecall is {1}\nf1 is {2}'.format(precision, recall, f1))

precision is 0.9533767991142822
recall is 0.9852529875413171
f1 is 0.9690528290090653


# Grid search: n_neighbors

In [145]:
n_neighbors = range(2, 20, 4)
models_n = {}
predicts_n = {}
for n in n_neighbors:
    print('Training knn: n_neighbors = {}'.format(n))
    knn = KNeighborsClassifier(n_neighbors = n, n_jobs = 8)
    knn.fit(X_train,y_train)
    predicts = knn.predict(X_test)
    
    precision, recall, f1 = measure(y_test, predicts)
    print('precision is {0}\nrecall is {1}\nf1 is {2}'.format(precision, recall, f1))
    
    predicts_n[n] = predicts
    models_n[n] = knn

Training knn: n_neighbors = 2
precision is 0.9740945998195644
recall is 0.9608441393338418
f1 is 0.967424
Training knn: n_neighbors = 6
precision is 0.9590592334494773
recall is 0.9797864225781846
f1 is 0.9693120362218588
Training knn: n_neighbors = 10
precision is 0.9487870619946092
recall is 0.9844902110348335
f1 is 0.9663089593211879
Training knn: n_neighbors = 14
precision is 0.9436824857385605
recall is 0.9884312229849987
f1 is 0.9655386525923626
Training knn: n_neighbors = 18
precision is 0.9403237496979947
recall is 0.9895753877447241
f1 is 0.9643211100099107


In [147]:
# select best prediction according to f1 score
best_pred = predicts_n[6]

# Detect plagiarism

In [191]:
best_pred = pd.Series(best_pred, index = y_test.index)
content_test = content.loc[y_test.index]
suspects = content_test[[a == 1 and  b == False for a, b in zip(best_pred, content_test.is_xinhua)]]

In [194]:
suspects.head()

Unnamed: 0,id,author,source,content,feature,title,url,is_xinhua,title_content
83081,82947,,广州日报第A17版,借离婚转移财产躲债 妻被判和夫一同还债\r\n 广州日报讯?（全媒体记者章程?通讯员徐冰琪...,"{\type\"":\""身边纸·拍案\"",\""site\"":\""广州日报\"",\""commen...",借离婚转移财产躲债妻被判和夫一同还债,http://gzdaily.dayoo.com/html/20,False,借离婚转移财产躲债妻被判和夫一同还债借离婚转移财产躲债 妻被判和夫一同还债\r\n 广州日...
88823,88170,,中国新闻网,点击图片进入下一页\r\n发布会现场。企业供图\r\n中新网北京6月9日电 8日，摩根数字宣...,"{\type\"":\""IT业界\"",\""site\"":\""参考消息\"",\""commentN...",摩根数字发布智能手环：珠宝设计风格 支持在线支付等功能,http://www.cankaoxiaoxi.com/scie,False,摩根数字发布智能手环：珠宝设计风格 支持在线支付等功能点击图片进入下一页\r\n发布会现场。...
81668,81586,,海南日报第001版,为全面提升我省电网本质安全水平，实现电网架构和保供抗灾能力“脱胎换骨”的目标，大幅增强供...,"{\type\"":\""头版\"",\""site\"":\""海南日报\"",\""commentNum...",审议通过《提升海南电网供电保障 和抗灾能力三年行动计划》等文件,http://hnrb.hinews.cn/html/2017-,False,审议通过《提升海南电网供电保障 和抗灾能力三年行动计划》等文件 为全面提升我省电网本质安全...
84734,84502,,广州日报第FSA26版,“共享空调” 无标可依\r\n 共享汽车、共享单车、共享雨伞、共享……共享生活方式在今年越...,"{\type\"":\""家生活·E生活\"",\""site\"":\""广州日报\"",\""comme...",“共享空调” 无标可依,http://gzdaily.dayoo.com/html/20,False,“共享空调” 无标可依“共享空调” 无标可依\r\n 共享汽车、共享单车、共享雨伞、共享…...
85539,85176,,央视网,央视网消息：据财政部网站消息，6月，又到了一年一度全国人大常委会审议中央决算报告和中央决算草...,"{\type\"":\""时事要闻\"",\""site\"":\""参考消息\"",\""commentN...",中央部门决算：中央各部门的年度收支账本,http://www.cankaoxiaoxi.com/chin,False,中央部门决算：中央各部门的年度收支账本央视网消息：据财政部网站消息，6月，又到了一年一度全国...
