In [18]:
import csv
import jieba
import re
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

In [41]:
class Questions(object):
    """Load labelled questions from a CSV file and rank their terms by TF-IDF.

    Typical use: construct the object, call ``filter_eng_words()`` to obtain a
    corpus of space-joined tokens, then pass that corpus to ``cal_tfidf()``.
    """

    def __init__(self, file, data_dir='../data', encoding='utf-8'):
        """Read every row of the CSV into ``self.data``.

        Args:
            file: Name of the CSV file inside ``data_dir``. Its header must
                contain ``question`` and ``type`` columns.
            data_dir: Directory that contains ``file``. Defaults to
                ``'../data'``, the location that was previously hard-coded.
            encoding: Text encoding used to decode the file. Defaults to
                UTF-8 so decoding does not depend on the platform locale;
                pass ``None`` to use the locale default instead.

        Raises:
            OSError: if the file cannot be opened.
            KeyError: if the CSV header lacks a ``question`` or ``type`` column.
        """
        # Name of the CSV file, as given by the caller.
        self.file = file

        # newline='' is what the csv module expects so that it can handle
        # line endings (including ones embedded in quoted fields) itself.
        # The ``with`` block closes the file; no explicit close() is needed.
        with open('{0}/{1}'.format(data_dir, self.file),
                  encoding=encoding, newline='') as data_file:
            # List of (question, type) tuples, in file order.
            self.data = [(row['question'], row['type'])
                         for row in csv.DictReader(data_file)]

    def filter_eng_words(self):
        """Segment each question with jieba and drop tokens with Latin letters.

        Returns:
            list of str: one entry per item of ``self.data``, in the same
            order. Each entry is the surviving tokens joined by single
            spaces (an empty string when no token survives).
        """
        filter_data = []
        for question in self.data:
            segs = jieba.cut(question[0], cut_all=False)
            # Discard any token that contains at least one ASCII letter.
            final = [seg for seg in segs if re.search('[a-zA-Z]', seg) is None]
            filter_data.append(' '.join(final))

        return filter_data

    def display_scores(self, tfidf_result, tfidf_features):
        """Pair each feature with its column total and sort by that total.

        Args:
            tfidf_result: Matrix-like object supporting ``.sum(axis=0)``,
                e.g. the value returned by ``TfidfVectorizer.fit_transform``.
            tfidf_features: Iterable of feature names; presumably one per
                column of ``tfidf_result`` (extra items on either side are
                silently ignored by ``zip``).

        Returns:
            list of (feature, score) tuples, highest score first.
        """
        # http://stackoverflow.com/questions/16078015/
        # Summing over axis 0 collapses the rows; ravel() flattens the
        # result to 1-D so it can be zipped with the feature names.
        scores = zip(tfidf_features,
                     np.asarray(tfidf_result.sum(axis=0)).ravel())
        sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)

        return sorted_scores

    def cal_tfidf(self, corpus):
        """Fit a default ``TfidfVectorizer`` on ``corpus`` and rank its terms.

        Args:
            corpus: Iterable of documents (strings), e.g. the list returned
                by ``filter_eng_words()``.

        Returns:
            list of (term, score) tuples sorted by descending score, where
            score is the term's TF-IDF weight summed over all documents.
        """
        tfidf_vectorizer = TfidfVectorizer()
        tfidf = tfidf_vectorizer.fit_transform(corpus)
        # get_feature_names() was removed in scikit-learn 1.2; prefer its
        # replacement and fall back only on releases that predate it.
        if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
            tfidf_features = tfidf_vectorizer.get_feature_names_out()
        else:
            tfidf_features = tfidf_vectorizer.get_feature_names()
        tfidf_scores = self.display_scores(tfidf, tfidf_features)

        return tfidf_scores

In [42]:
# Load the (question, type) pairs from ../data/questions.csv.
questions_data = Questions('questions.csv')

In [43]:
# Segment every question with jieba and drop tokens containing Latin letters;
# the result is one space-joined string per question.
filter_eng_data = questions_data.filter_eng_words()

In [44]:
# Rank the vocabulary by TF-IDF weight summed over all filtered questions.
tfidf_sorted = questions_data.cal_tfidf(filter_eng_data)

In [45]:
# Display the ranked (term, score) pairs, highest score first.
tfidf_sorted

[('什麼', 911.66892592666466),
 ('請問', 654.77556568936802),
 ('這裡', 652.56387676055874),
 ('不是', 394.79116666047605),
 ('意思', 385.78076521252058),
 ('用法', 383.09952908845258),
 ('可以', 372.56391272809384),
 ('這邊', 307.15772743924549),
 ('謝謝', 302.75349806073467),
 ('這句', 293.73553770286372),
 ('為何', 293.61978091425755),
 ('發音', 246.40879479725285),
 ('使用', 241.47582897956613),
 ('文法', 236.68610768308275),
 ('省略', 227.87705676361122),
 ('動詞', 227.76551375121747),
 ('怎麼', 222.31086945838732),
 ('翻譯', 220.08943278875452),
 ('要加', 217.76236110106018),
 ('老師', 215.72236177349444),
 ('過去', 194.12611680213914),
 ('還是', 179.12229807394644),
 ('前面', 175.21445456774717),
 ('如何', 169.07377590097147),
 ('甚麼', 155.7504502141048),
 ('一下', 132.50621302489074),
 ('這個', 131.85734970525019),
 ('這樣', 127.38770759469281),
 ('句子', 127.27827137003035),
 ('完成', 126.43426428463449),
 ('是否', 123.87146937495562),
 ('問為', 120.85330054112889),
 ('因為', 120.08193982954873),
 ('何要', 118.80907615005502),
 ('不用', 118.676