## Imports

In [None]:
#coding=utf-8
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import re
from nltk.corpus import stopwords
import json
import numpy as np
from sklearn.cluster import KMeans
from scipy import sparse
import multiprocessing as mlp
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.metrics import calinski_harabaz_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

#todo: other stemmers are available besides the Porter stemmer
from nltk.stem.porter import PorterStemmer

# Locations of the JSON inputs, relative to the working directory:
# the training assignments, the training publications and the validation
# publications.
assignments_train_path = './data/assignment_train.json'
pubs_train_path = './data/pubs_train.json'
pubs_validate_path = './data/pubs_validate.json'

In [None]:
def model_select(X, k_min=200, k_max=400, k_step=10):
    """Pick the KMeans cluster count with the best Calinski-Harabasz score.

    One KMeans model is fitted for every candidate ``k`` in
    ``range(k_min, k_max + 1, k_step)`` and its labelling is scored with
    ``calinski_harabaz_score``; a higher score wins and ties keep the
    smaller ``k``.

    Args:
        X: Feature matrix with one row per sample.
        k_min: Smallest cluster count to try (default 200).
        k_max: Largest cluster count to try, inclusive (default 400).
        k_step: Gap between consecutive candidates (default 10).

    Returns:
        The best-scoring candidate. Candidates larger than
        ``n_samples - 1`` are skipped because they cannot be fitted and
        scored; when no candidate beats the initial score of 0 (in
        particular when none is left), the result is ``k_min`` clamped to
        the number of samples (and to at least 1), so the caller can
        always fit KMeans with the returned value.
    """
    # NOTE: newer scikit-learn releases spell this metric
    # ``calinski_harabasz_score``; the name used here matches the file's import.
    n_samples = np.shape(X)[0]

    # KMeans requires k <= n_samples and the score requires
    # k <= n_samples - 1, so never try anything above that bound.
    largest_k = min(k_max, n_samples - 1)

    opt_score = 0
    opt_k = max(1, min(k_min, n_samples))
    for k in range(k_min, largest_k + 1, k_step):
        m = KMeans(n_clusters=k)
        y_pred = m.fit_predict(X)
        t = calinski_harabaz_score(X, y_pred)
        if t > opt_score:
            opt_score = t
            opt_k = k
    return opt_k

In [None]:
def tfidf_fea(docs):
    """Turn a list of text documents into a TF-IDF feature matrix.

    Args:
        docs: List of strings, one per document.

    Returns:
        The TF-IDF matrix produced by ``CountVectorizer`` followed by
        ``TfidfTransformer`` (one row per document). When the documents
        yield no vocabulary at all, a dense all-zero array of shape
        ``(len(docs), 1)`` is returned instead so the row count is kept.
    """
    # Every document is empty: there is nothing to vectorize.
    if not any(len(doc) for doc in docs):
        return np.zeros((len(docs), 1))

    try:
        counts = CountVectorizer().fit_transform(docs)
    except ValueError:
        # CountVectorizer raises ValueError ("empty vocabulary") when the
        # documents are non-empty but contain no token it accepts, e.g. only
        # single-character words. Use the same placeholder as for empty docs.
        return np.zeros((len(docs), 1))
    return TfidfTransformer().fit_transform(counts)

def is_same_name(s1, s2):
    """Tell whether two names match once reduced to their letters a-z.

    Each name is lower-cased and every character outside ``a``-``z`` is
    dropped before the two results are compared.
    """
    first, second = [re.sub('[^a-z]', '', name.lower()) for name in (s1, s2)]
    return first == second

def construct_docs4aut(pubs, author, field):
    """Build one cleaned, stemmed text document per paper of ``author``.

    Args:
        pubs: Mapping from author name to that author's list of paper dicts.
        author: Key into ``pubs`` selecting the papers to process.
        field: Which text to extract. ``'org'`` takes the organisation of the
            first entry in the paper's author list whose name matches
            ``author``; ``'keywords'`` joins the paper's keyword list with
            spaces; any other value reads ``paper[field]`` as a string.

    Returns:
        A list with exactly one string per paper, in paper order. Each string
        is lower-cased, stripped of everything but letters, underscores and
        spaces, split on spaces, filtered of English stop words and
        Porter-stemmed. Papers without usable text yield ``''``.
    """
    papers = pubs[author]
    if field == 'org':
        lower_docs = []
        for p in papers:
            # Default to '' so every paper contributes exactly one document,
            # even when no listed author matches or the org is missing/None.
            org = ''
            for aut in p.get('authors') or []:
                if is_same_name(aut.get('name') or '', author):
                    org = (aut.get('org') or '').lower()
                    # A single paper can list the same name more than once;
                    # only the first match is used.
                    break
            lower_docs.append(org)
    elif field == 'keywords':
        lower_docs = [
            ' '.join(p[field]).lower() if p.get(field) is not None else ''
            for p in papers
        ]
    else:
        lower_docs = [
            p[field].lower() if p.get(field) is not None else ''
            for p in papers
        ]

    # Keep only letters, underscores and spaces.
    docs = [re.sub('[^ _a-z]', '', s) for s in lower_docs]

    # Tokenize on runs of spaces.
    splt = [re.split(' +', s) for s in docs]

    # Drop empty tokens and English stop words.
    stop_words_set = set(stopwords.words('english'))
    splt = [[s for s in ss if len(s) > 0 and s not in stop_words_set] for ss in splt]

    # Stem every token and glue each document back together.
    stemer = PorterStemmer()
    stem_docs = [' '.join([stemer.stem(s) for s in ss]) for ss in splt]
    return stem_docs
    
def _paper_year(paper):
    """Return the paper's year as a number, or 0 when missing or unusable."""
    year = paper.get('year')
    if isinstance(year, (int, float)):
        return year
    try:
        return int(year)
    except (TypeError, ValueError):
        # None, '' or any other non-numeric value counts as "no year".
        return 0


def cluster4aut(author):
    """Cluster the papers listed under ``author`` and return the id groups.

    Reads the papers from the module-level ``pubs`` mapping. The feature
    matrix is the horizontal stack of the TF-IDF features of the keywords,
    abstract, title, org and venue documents plus the publication year; it is
    scaled to [0, 1] per column, the cluster count is chosen by
    ``model_select`` and a KMeans model with that count is fitted.

    Args:
        author: Key into ``pubs``.

    Returns:
        A list of clusters, each a list of paper ids, ordered by the first
        appearance of each cluster label.
    """
    papers = pubs[author]
    # A missing, None or non-numeric year becomes 0 so the column stays numeric.
    year_fea = np.array([_paper_year(p) for p in papers]).reshape((-1, 1))
    ids = [p['id'] for p in papers]

    # One TF-IDF block per text field, stacked in this fixed order.
    text_fields = ('keywords', 'abstract', 'title', 'org', 'venue')
    text_feas = [
        tfidf_fea(construct_docs4aut(pubs, author, field))
        for field in text_fields
    ]
    all_fea = sparse.hstack(text_feas + [year_fea])

    sca = MinMaxScaler((0, 1))
    X = sca.fit_transform(all_fea.toarray())

    N_cluster = model_select(X)
    m = KMeans(n_clusters=N_cluster)
    m.fit(X)

    # Group the paper ids by the label KMeans assigned to each row.
    res = {}
    for label, paper_id in zip(m.labels_, ids):
        res.setdefault(label, []).append(paper_id)

    return list(res.values())
    

In [None]:
# Load the three JSON inputs, closing each file as soon as it has been parsed.
with open(assignments_train_path, 'r') as fin:
    assignments_train = json.load(fin)
with open(pubs_train_path, 'r') as fin:
    pubs_train = json.load(fin)
with open(pubs_validate_path, 'r') as fin:
    pubs_validate = json.load(fin)

# Single lookup table for both splits; cluster4aut reads it as a global.
pubs = {**pubs_train, **pubs_validate}
# Sanity check: no author key appears in both splits (a shared key would have
# been silently overwritten by the merge above).
assert(len(pubs)==len(pubs_train)+len(pubs_validate))

# Created only after `pubs` exists. NOTE(review): presumably the workers are
# expected to see the populated `pubs` global -- confirm for the start method
# in use.
pool = mlp.Pool(20)

#### Train Results

In [None]:
# Cluster every training author in parallel; pool.map keeps the input order,
# so results line up with pubs_train's keys.
train_results = pool.map(cluster4aut, list(pubs_train.keys()))
assert(len(pubs_train)==len(train_results))
# Write through a context manager so the file is flushed and closed.
with open('assignment_train_result.json', 'w') as fout:
    json.dump(dict(zip( pubs_train.keys(), train_results )), fout)

#### Validation Results

In [None]:
# Cluster every validation author in parallel; pool.map keeps the input order,
# so results line up with pubs_validate's keys.
validate_results = pool.map(cluster4aut, list(pubs_validate.keys()))
assert(len(pubs_validate)==len(validate_results))
# Write through a context manager so the file is flushed and closed.
with open('assignment_validate_result.json', 'w') as fout:
    json.dump(dict(zip( pubs_validate.keys(), validate_results )), fout)