In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time 
import operator
%matplotlib inline

## 读入数据

In [2]:
#data_path='D:/AI.Data/Data/ieee_zhihu_cup/'
data_path = 'ieee_zhihu_cup/'
df_question_topic = pd.read_csv(data_path+'question_topic_train_set.txt', header=None, names=['question_id', 'topic_id'],sep='\t')
df_topics = pd.read_csv(data_path+'topic_info.txt', header=None, names=['topic_id', 'pid', 'cn', 'wn', 'cd', 'wd'],sep='\t')


In [3]:
def split2list(row):
    return set(map(int, row.split(',')))

df_question_topic['topic_set'] = df_question_topic.topic_id.apply(split2list)

topic_num = df_topics.shape[0]

In [4]:
df_question_topic.head(5)

Unnamed: 0,question_id,topic_id,topic_set
0,6555699376639805223,77390041956937749753738968195649774859,"{7739004195693774975, 3738968195649774859}"
1,2887834264226772863,-3149765934180654494,{-3149765934180654494}
2,-2687466858632038806,-760432988437306018,{-760432988437306018}
3,-5698296155734268,-67589421411221139073195914392210930723,"{-6758942141122113907, 3195914392210930723}"
4,-6719100304248915192,"3804601920633030746,4797226510592237555,435133...","{-2689200710357900655, 4351331710881888756, 38..."


### 统计每个标签的使用次数，并排序，然后按降序画出统计图
横坐标为顺序号，纵坐标为使用的次数

In [None]:
def enum_topics(row):
    for topic in row:
        topics[topic] = topics[topic] + 1

In [None]:
topics = df_topics['topic_id'].tolist()

topics=dict(zip(topics, np.zeros(len(topics), dtype=np.int)))

df_question_topic['topic_set'].apply(enum_topics);

print('Test value it should be 5955, topics[738845194850773558]=', topics[738845194850773558])


In [None]:
plt.figure(figsize=(7,4.5))

plt.plot(range(0,topic_num), list(topics.values()), linewidth=2)

plt.xlabel('Topic number')
plt.ylabel('# of topic is assigned (times)')
plt.rcParams.update({'font.size':16})

plt.tight_layout()

In [None]:
plt.figure(figsize=(7,4.5))
plt.hist(list(topics.values()), bins=20)
plt.xlabel('Topic used')
plt.rcParams.update({'font.size':16})
plt.tight_layout()

In [None]:
plt.figure(figsize=(7,4.5))
plt.hist(list(topics.values()), bins=20)
plt.xlabel('Topic used')
plt.ylabel('# Log')
plt.rcParams.update({'font.size':16})
plt.yscale('log', nonposy='clip')
plt.tight_layout()

In [None]:
frecurrently_used_topics = [t for t in list(topics.values()) if t > 2000 and t < 10000]

In [None]:
plt.figure(figsize=(7,4.5))
plt.hist(frecurrently_used_topics, bins=20)
plt.xlabel('Topic used')
plt.ylabel('# of topic is assigned (times)')
plt.rcParams.update({'font.size':16})

plt.tight_layout()

### 根据上图列出使用最多的前 n 个标签，至于 n 应该是几要看图来决定。

In [None]:
# 我为什么选择 n = ? 你的说明放在此处
# your code here ...

#列出被使用超过20000次的标签
topics_used_more_than_10000 = dict((k, v) for k, v in topics.items() if v >= 10000)
print("There are ", len(topics_used_more_than_10000), "topics are used more than 10000 times.")

### 统计文章的标签数，并画统计图
横坐标为标签数，纵坐标为含有此标签数的文章的数量

In [None]:
# your code here ...
df_question_topic['topic_set_count']=df_question_topic['topic_set'].apply(len)

plt.figure(figsize=(7,4.5))
plt.hist(df_question_topic['topic_set_count'], bins=20)
plt.ylabel('# of questions')
plt.xlabel('# of topics assigned to a question')
plt.rcParams.update({'font.size':16})
plt.tight_layout()

由上图可知，一个问题通常会有1-5个标签，最好多的是一个标签的question，随着标签数量的上升，文章的数量也快速下降。很少（需要计算多少，占百分比）文章有5个以的标签

### 简要分析上图

根据上图我认为 ... 分析一张图雅思要多少字来着？
你的说明放在此处

### construct a set of (topic1, topic2) where topic1 != topic2

In [5]:
two_topic_set = []
iterable = df_topics['topic_id'].tolist()
for topic1 in iterable:
    for topic2 in iterable:
        if topic1 != topic2:
            s = frozenset([topic1, topic2])
            #s.add(topic1)
            #s.add(topic2)
            two_topic_set.append(s)


### construct a dict using frozenset as key

In [6]:
topics = df_topics['topic_id'].tolist()
topics=dict(zip(topics, np.zeros(len(topics), dtype=np.int)))
two_topic_dict = dict(zip(two_topic_set, np.zeros(len(two_topic_set), dtype=np.int)))

In [7]:
two_topic_dict

{frozenset({-372645249911808449, 2353421622418180700}): 0,
 frozenset({-8274522839089381384, -306626713584702071}): 0,
 frozenset({-744588636265106420, 564572820242783793}): 0,
 frozenset({-2604494515308175827, 4637223908993888877}): 0,
 frozenset({3471965724367244818, 3571603611933812906}): 0,
 frozenset({-146867728338383595, 1245968097376795823}): 0,
 frozenset({-5264509641502027935, 4287430850224527380}): 0,
 frozenset({-4921212371110555224, 6049758334588147235}): 0,
 frozenset({2919247920214845195, 7520467742563767493}): 0,
 frozenset({1840848880823843395, 8570540777066461619}): 0,
 frozenset({-9176307901497282391, 3972456192477785643}): 0,
 frozenset({-3744255770910909874, -1029971585579892097}): 0,
 frozenset({-823705168688534512, 1790630374868647951}): 0,
 frozenset({-1296438675909954008, 2872176636559084285}): 0,
 frozenset({-1980876636979981304, 546485900534542453}): 0,
 frozenset({42427966967759255, 637187908693888049}): 0,
 frozenset({-9193714286190364511, -89325460575428674

### convert set to frozenset in dataframe

In [8]:
def to_frozenset(row):
    return frozenset([i for i in row])
df_question_topic['topic_id_frozen_set'] = df_question_topic.topic_set.apply(to_frozenset)

### extract rows from df_question_topic where topic_set_count is 2

In [9]:
df_question_topic['topic_set_count']=df_question_topic['topic_set'].apply(len)
df_2_topics = df_question_topic[df_question_topic.topic_set_count == 2]

### count 2 topic set usage

In [10]:
def count_usage(row):
    two_topic_dict[row]+=1
    return

In [11]:
_=df_2_topics.topic_id_frozen_set.apply(count_usage)

In [12]:
two_topic_dict_sorted = sorted(two_topic_dict.items(), key=operator.itemgetter(1), reverse=True)

In [13]:
subdict = {k:v for k,v in two_topic_dict_sorted if v >=500}

In [14]:
subdict_sorted = sorted(subdict.items(), key=operator.itemgetter(1), reverse=True)
subdict_sorted

[(frozenset({569940021530854113, 2327504309227366054}), 1934),
 (frozenset({-1957590033086240723, 2168515087433636094}), 1721),
 (frozenset({-1957590033086240723, 2184408420667115619}), 1623),
 (frozenset({-8780119653771094858, 7237300102721734709}), 1105),
 (frozenset({3865463466240567990, 5706344191507825718}), 1037),
 (frozenset({-8109793286902118151, -4317515119936650885}), 888),
 (frozenset({-5461355180342040702, 2425886132400160849}), 844),
 (frozenset({-5243683122430849003, 4160758010753708150}), 779),
 (frozenset({4144904362303444936, 5370870069386720811}), 778),
 (frozenset({-4530635517516786435, 7473164687826110412}), 740),
 (frozenset({-6645253738893652511, -2945591676659457448}), 714),
 (frozenset({5727179283235569021, 7820378771178228515}), 707),
 (frozenset({-5528371788726186348, 7318907785137551256}), 704),
 (frozenset({-3895054941927964525, 5370870069386720811}), 692),
 (frozenset({-6666640731218555667, -5264509641502027935}), 687),
 (frozenset({-4422893742093299170, -4

### the following cells deal with all content, including questions with all numbers of topics

In [15]:
def split2two(a):
    s=[]
    for i in a:
        for j in a:
            if i != j and frozenset([i,j]) not in s:
                s.append(frozenset([i,j]))
    return s

In [17]:
def count_usage2(row):
    splited = split2two(row)
    for i in splited:
        two_topic_dict_copmlete[i]+=1
    return

In [18]:
two_topic_dict_copmlete = dict(zip(two_topic_set, np.zeros(len(two_topic_set), dtype=np.int)))

In [19]:
_ = df_question_topic.topic_set.apply(count_usage2)

In [22]:
two_topic_dict_copmlete_sorted = sorted(two_topic_dict_copmlete.items(), key=operator.itemgetter(1), reverse=True)
two_topic_dict_copmlete_sorted

[(frozenset({-744588636265106420, 564572820242783793}), 0),
 (frozenset({3471965724367244818, 3571603611933812906}), 0),
 (frozenset({-146867728338383595, 1245968097376795823}), 0),
 (frozenset({-5264509641502027935, 4287430850224527380}), 0),
 (frozenset({-4921212371110555224, 6049758334588147235}), 0),
 (frozenset({2919247920214845195, 7520467742563767493}), 0),
 (frozenset({1840848880823843395, 8570540777066461619}), 0),
 (frozenset({-823705168688534512, 1790630374868647951}), 0),
 (frozenset({-1296438675909954008, 2872176636559084285}), 0),
 (frozenset({-1980876636979981304, 546485900534542453}), 0),
 (frozenset({-9193714286190364511, -8932546057542867495}), 0),
 (frozenset({4689502080141399321, 7347583870493810379}), 0),
 (frozenset({1063815048865447049, 2131451573312950491}), 0),
 (frozenset({-7274130935521678793, -3151738353564633688}), 0),
 (frozenset({-2708575780282931506, -2627298052801704596}), 0),
 (frozenset({-4677755972317734407, -989115724037427557}), 0),
 (frozenset({-7

### 统计使用最多的双标签组合，列出前10还是前100？

In [None]:
# your code here ...

### 统计使用最多的三标签组合，列出前10还是前100？

In [None]:
# your code here ...

sorted_x = sorted(topics.items(), key=operator.itemgetter(1))

### 统计被使用文章最多的组合标签数, 并列出前 n 个
比如 某个 双topic 组合被使用 800 次，另一个三topic 组合被使用了780次，再有一个双topic组合被使用了778次 。。。 