In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
import os,json

In [3]:
import sys
sys.path.append('/workspace/external-libraries/')
import jieba

In [32]:
# Root directory that is listed (os.listdir) to find annotation entries whose names end in '.txt'.
annotation_root = './raw_data/新闻描述二期第一批数据/'

In [33]:
# Keep only the entries directly under annotation_root whose names end with '.txt'.
annotation_file = [entry for entry in os.listdir(annotation_root) if entry.endswith('.txt')]

In [34]:
annotation_file

['MBM0W5J4M8XP_cm新闻标注二期—游行.txt',
 'V2X4HARUIC28_cm新闻标注二期—火灾.txt',
 'W60JBBZY0433_cm新闻标注二期—地震.txt',
 '7CBDH771T2XH_cm新闻标注二期—空难.txt',
 'NY5SRX7Y3THH_cm新闻标注二期—暴乱.txt']

Note: each annotation JSON loads into a dict `data` whose top-level keys are dict_keys(['response', 'metadata'])

In [35]:
def bulid_data(data_root,filename):
    """Load one annotation JSON file and collect its captions.

    Reads ``data['response']['annotations']`` from the file and keeps every
    entry whose ``'attributes'`` value is longer than 5 characters
    (presumably the caption text -- confirm against the annotation schema).
    Each kept caption is stored as-is and also tokenized with ``jieba.cut``
    after stripping surrounding whitespace and removing the full stop '。'.

    Args:
        data_root: Directory that contains ``filename``.
        filename: Name of the annotation file. The part before the first
            '_' is used as the image name stem.

    Returns:
        ``{}`` when no caption was kept, otherwise a dict with keys
        ``'image_name'`` (stem + '.jpg'), ``'sents'`` (raw captions) and
        ``'sents_token'`` (one token list per caption).
    """
    # JSON text is UTF-8; be explicit so the read does not depend on the locale.
    with open(os.path.join(data_root, filename), 'r', encoding='utf-8') as f:
        data = json.load(f)

    sents = []
    sents_tokenize = []
    for data_part in data['response']['annotations']:
        text = data_part['attributes']
        if len(text) > 5:
            seg_list = jieba.cut(text.strip().replace(u'。', ''), cut_all=False)
            sents_tokenize.append(list(seg_list))
            sents.append(text)

    if not sents:
        return {}

    # Derive the image name from the `filename` argument rather than from a
    # module-level loop variable, so the function works wherever it is called.
    img_name = filename.split('_')[0]
    return {
        'image_name': img_name + '.jpg',
        'sents': sents,
        'sents_token': sents_tokenize,
    }

In [36]:
# encoding=utf-8
# Collect one record per annotation file, skipping files that yield no captions.
data_annotations = []
for category_dir in annotation_file:
    category_path = os.path.join(annotation_root, category_dir)
    # Keep this loop variable module-level and named `annotation`.
    for annotation in os.listdir(category_path):
        record = bulid_data(category_path, annotation)
        if not record:
            continue
        data_annotations.append(record)

In [37]:
len(data_annotations)

5123

In [38]:
def bulid_vocab(imgs, count_thr=1):
    """Build the caption vocabulary and attach UNK-mapped captions to each record.

    Counts every token in ``img['sents_token']`` across ``imgs``, prints
    corpus statistics, and keeps the words that occur more than
    ``count_thr`` times. Every record in ``imgs`` is modified in place: it
    gains a ``'final_captions'`` list in which the remaining words are
    replaced by the token 'UNK'.

    Args:
        imgs: List of dicts, each with a ``'sents_token'`` list of token lists.
        count_thr: Words occurring at most this many times are treated as
            rare. Defaults to 1.

    Returns:
        ``(vocab, param)``: ``vocab`` is the list of kept words in
        first-seen order, with 'UNK' appended when at least one rare word
        exists; ``param`` is ``{'max_length': <longest sentence length>}``
        (0 when there are no sentences).
    """
    def percent(part, whole):
        # An empty corpus reports 0% instead of raising ZeroDivisionError.
        return part * 100.0 / whole if whole else 0.0

    param = {}
    counts = {}        # word -> number of occurrences
    sent_length = {}   # sentence length -> number of sentences
    for img in imgs:
        for sent in img['sents_token']:
            nw = len(sent)
            sent_length[nw] = sent_length.get(nw, 0) + 1
            for w in sent:
                counts[w] = counts.get(w, 0) + 1

    cw = sorted(((count, w) for w, count in counts.items()), reverse=True)
    print('top words and their counts:')
    print('\n'.join(map(str, cw[:100])))

    total_words = sum(counts.values())
    print('total words:', total_words)
    bad_words = [w for w, n in counts.items() if n <= count_thr]
    vocab = [w for w, n in counts.items() if n > count_thr]
    bad_count = sum(counts[w] for w in bad_words)
    print('number of bad words: %d/%d = %.2f%%' % (len(bad_words), len(counts), percent(len(bad_words), len(counts))))
    print('number of words in vocab would be %d' % (len(vocab), ))
    print('number of UNKs: %d/%d = %.2f%%' % (bad_count, total_words, percent(bad_count, total_words)))

    # default=0 keeps an input without sentences from raising ValueError.
    max_len = max(sent_length, default=0)
    param['max_length'] = max_len
    print('max length sentence in raw data: ', max_len)
    print('sentence length distribution (count, number of words):')
    sum_len = sum(sent_length.values())
    for i in range(max_len+1):
        print('%2d: %10d   %f%%' % (i, sent_length.get(i, 0), percent(sent_length.get(i, 0), sum_len)))

    if bad_count > 0:
        # additional special UNK token we will use below to map infrequent words to
        print('inserting the special UNK token')
        vocab.append(u'UNK')

    for img in imgs:
        img['final_captions'] = [
            [w if counts.get(w, 0) > count_thr else u'UNK' for w in sent]
            for sent in img['sents_token']
        ]
    return vocab,param
            

In [39]:
# Build the vocabulary; the call also adds a 'final_captions' list to every record in data_annotations.
vocab,param = bulid_vocab(data_annotations)

top words and their counts:
(12816, '，')
(8200, '发生')
(6882, '某地')
(3788, '现场')
(3126, '某')
(3003, '火灾')
(2708, '在')
(2327, '地震')
(2251, '的')
(2132, '游行')
(1941, '地区')
(1693, '暴乱')
(1683, '救援')
(1595, '群众')
(1563, '正在')
(1164, '建筑')
(1066, '浓烟')
(1060, '抗议')
(983, '被')
(967, '警察')
(952, '进行')
(825, '严重')
(804, '举行')
(744, '房屋')
(732, '一名')
(731, '人员')
(714, '事件')
(685, '街头')
(684, '工作')
(682, '消防员')
(681, '多名')
(655, '上')
(632, '废墟')
(617, '大火')
(596, '着')
(571, '事故现场')
(549, '大量')
(532, '活动')
(514, '了')
(478, '失火')
(477, '滚滚')
(463, '抗议者')
(438, '汽车')
(433, '手持')
(427, '节日')
(413, '有')
(390, '聚集')
(387, '一')
(362, '建筑物')
(361, '示威者')
(361, '后')
(347, '飞机')
(337, '高举')
(337, '骚乱')
(331, '袭击')
(325, '男子')
(324, '旗帜')
(314, '火势')
(313, '围观')
(309, '损坏')
(305, '众多')
(303, '残骸')
(302, '遭受')
(301, '示威游行')
(300, '示威')
(293, '一片')
(278, '受损')
(274, '从')
(274, '上空')
(273, '森林')
(266, '空难')
(253, '民众')
(252, '标语牌')
(248, '中')
(246, '展开')
(244, '烧毁')
(244, '一辆')
(243, '凶猛')
(240, '庆祝')
(238, '吞噬

In [251]:
# Sanity check: print any record that ended up with no final captions.
for img in data_annotations:
    if not img['final_captions']:
        print(img)

In [252]:
# Index 0 is left unused by these tables; word ids start at 1.
itow = dict(enumerate(vocab, start=1))  # id -> word
wtoi = {word: idx for idx, word in itow.items()}  # word -> id

In [253]:
import numpy as np

In [258]:
def encode_captions(imgs, params, wtoi):
    """Encode every caption as a fixed-width row of word ids.

    Args:
        imgs: Records that each hold a non-empty ``'final_captions'`` list
            of token lists.
        params: Dict providing ``'max_length'``, the row width. Longer
            captions are truncated and shorter ones are zero-padded.
        wtoi: Mapping from word to integer id.

    Returns:
        ``(L, label_start_ix, label_end_ix, label_length)``: ``L`` is a
        uint32 array with one row per caption; the start/end arrays give,
        per image, the 1-indexed inclusive range of its rows in ``L``;
        ``label_length`` holds each caption's length, capped at
        ``max_length``.
    """
    max_length = params['max_length']
    num_images = len(imgs)
    num_captions = sum(len(img['final_captions']) for img in imgs)
    print(max_length, num_images, num_captions)

    label_start_ix = np.zeros(num_images, dtype='uint32')  # one-indexed
    label_end_ix = np.zeros(num_images, dtype='uint32')
    label_length = np.zeros(num_captions, dtype='uint32')

    per_image_rows = []
    caption_idx = 0
    next_row = 1
    for img_idx, img in enumerate(imgs):
        captions = img['final_captions']
        n = len(captions)
        assert n > 0, 'error: some image has no captions'

        rows = np.zeros((n, max_length), dtype='uint32')
        for row, caption in enumerate(captions):
            label_length[caption_idx] = min(max_length, len(caption))
            caption_idx += 1
            # zip against range(max_length) drops tokens past the row width.
            for col, word in zip(range(max_length), caption):
                rows[row, col] = wtoi[word]
        per_image_rows.append(rows)

        label_start_ix[img_idx] = next_row
        label_end_ix[img_idx] = next_row + n - 1
        next_row += n

    L = np.concatenate(per_image_rows, axis=0)
    assert L.shape[0] == num_captions, 'lengths don\'t match? that\'s weird'
    assert np.all(label_length > 0), 'error: some caption had no words?'

    print('encoded captions to array of size ', L.shape)
    return L, label_start_ix, label_end_ix, label_length

In [259]:
# Encode every caption as a row of word ids, zero-padded or truncated to param['max_length'].
L,label_start_ix, label_end_ix,label_length = encode_captions(data_annotations,param,wtoi)

27 5123 10259
encoded captions to array of size  (10259, 27)


In [260]:
label_start_ix

array([    1,     3,     5, ..., 10254, 10256, 10258], dtype=uint32)

In [261]:
label_end_ix

array([    2,     4,     6, ..., 10255, 10257, 10259], dtype=uint32)

In [262]:
label_length

array([16, 12, 25, ..., 13,  9, 14], dtype=uint32)

In [263]:
L

array([[   1,    2,    3, ...,    0,    0,    0],
       [   1,    2,    3, ...,    0,    0,    0],
       [   1,   20,   21, ...,   22,    0,    0],
       ...,
       [ 171,  858,    2, ...,    0,    0,    0],
       [   1,    2, 1776, ...,    0,    0,    0],
       [ 171,  275,    2, ...,    0,    0,    0]], dtype=uint32)

In [264]:
import h5py

In [265]:
# Write the encoded captions and their index arrays to an HDF5 file.
# The `with` block closes the file even if one of the writes raises.
with h5py.File('news_dataset_label.h5', "w") as f_lb:
    f_lb.create_dataset("labels", dtype='uint32', data=L)
    f_lb.create_dataset("label_start_ix", dtype='uint32', data=label_start_ix)
    f_lb.create_dataset("label_end_ix", dtype='uint32', data=label_end_ix)
    f_lb.create_dataset("label_length", dtype='uint32', data=label_length)

In [283]:
import cv2
image_root = './images_1'
# Record where each image lives on disk.
for img in data_annotations:
    img['image_path'] = '/'.join((image_root, img['image_name']))

# Spot-check: load and print only the first image.
for img in data_annotations[:1]:
    image = cv2.imread(img['image_path'])
    print(image)

[[[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [ 15  24  21]
  [ 23  27  28]
  [ 22  26  27]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [ 19  28  25]
  [ 19  25  24]
  [ 23  27  28]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [ 23  32  29]
  [ 29  35  34]
  [ 31  37  36]]

 ...

 [[ 70  73  77]
  [ 65  68  72]
  [ 65  68  72]
  ...
  [ 62  62  62]
  [ 59  59  59]
  [ 58  58  58]]

 [[ 66  69  73]
  [ 61  64  68]
  [ 62  65  69]
  ...
  [ 65  65  65]
  [ 63  63  63]
  [ 62  62  62]]

 [[ 76  80  81]
  [ 72  76  77]
  [ 71  75  76]
  ...
  [ 69  69  69]
  [ 67  67  67]
  [ 66  66  66]]]


In [5]:
import shutil,os
# Flatten every image found under `path` (recursively) into `new_path`,
# printing the names of files that are not recognised as images.
path = './raw_data/一万张第二批数据/'
new_path = './raw_data/images_2'
image_exts = ('.jpg', '.jpeg', '.png')  # matched case-insensitively below

# shutil.copy does not create missing directories, so create the target first.
os.makedirs(new_path, exist_ok=True)

count = 0
for root, dirs, files in os.walk(path):
    for name in files:
        if name.lower().endswith(image_exts):
            # NOTE(review): files with the same name in different
            # sub-folders overwrite each other here, and `count` still
            # counts both.
            shutil.copy(os.path.join(root, name), os.path.join(new_path, name))
            count += 1
        else:
            print(name)
count

龙卷风.xlsx
游行.xlsx
空难.xlsx
矿难.xlsx
洪水.xlsx
泥石流.xlsx
火灾.xlsx
海啸.xlsx
006306.gif
山体滑坡.xlsx
坍塌.xlsx
交通事故.xlsx
暴乱.xlsx
爆炸.xlsx


5085