## 模块导入

In [None]:
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import os
import random
import re
import datetime
import copy
import shutil

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelBinarizer
from textblob import TextBlob

import gensim
# import tensorflow as tf

from IPython.display import display
%matplotlib inline

print("\nImport modules successfully\n")

## 函数导入

In [None]:
# 由 %%capture captObj 配合 %%time 所获得的时间 captObj 中抽取所需的时间参数

def getCellTime_Wall(captObj):
    """
    Extract h, min, sec, ms, us from captObj gotten by %%capture and %%time
    ----------------
    Arguments: 
    
        captObj  `IPython.utils.capture.CapturedIO`, contains string captured using magic command %%capture
                 (only its `stdout` attribute is read)
        
    ----------------
    Returns:
    
        rectime  a dictionary with keys 'h', 'min', 'sec', 'ms', 'us'; each value is the float
                 found in front of that unit on the "Wall time: " line, or 0 when the unit is absent
    ----------------
    Raises:
    
        ValueError  if captObj.stdout contains no "Wall time: " line
    """
    # Ref
    # https://docs.python.org/3/library/re.html
    # https://docs.python.org/3/howto/regex.html#regex-howto

    m = re.search(r'(?<=Wall time: ).*[^\n]', captObj.stdout)
    if m is None:
        raise ValueError("No 'Wall time: ...' line found in the captured stdout")
    mstr = m.group(0)

    # Unit suffix expected after each number. Microseconds may be printed as
    # 'us', or with MICRO SIGN (U+00B5) / GREEK SMALL LETTER MU (U+03BC).
    unit_suffixes = [
        ('h', 'h'),
        ('min', 'min'),
        ('sec', 's'),
        ('ms', 'ms'),
        ('us', '[u\u00b5\u03bc]s'),
    ]

    rectime = {}
    for key, suffix in unit_suffixes:
        # A number immediately (or after one space) followed by the unit suffix.
        found = re.search(r'(\d+\.?\d*)(?= ?' + suffix + ')', mstr)
        rectime[key] = 0 if found is None else float(found.group(0))
        
    print(rectime)
    print('Record time successfully.')
    
    return rectime

In [None]:
def timeInFormat(timedict):
    """
    Format a time dictionary as the string form of a `datetime.timedelta`
    ----------------
    Arguments:

        timedict  a dictionary with numeric values under the keys 'h', 'min', 'sec', 'ms', 'us'

    ----------------
    Returns:

        a string, `str()` of the timedelta built from those five components
    """
    delta = datetime.timedelta(
        hours=timedict['h'],
        minutes=timedict['min'],
        seconds=timedict['sec'],
        milliseconds=timedict['ms'],
        microseconds=timedict['us'],
    )
    return str(delta)

In [None]:
from textblob import Word
from textblob import Blobber
from textblob import TextBlob
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_header, strip_newsgroup_footer, strip_newsgroup_quoting

# Maps each decimal digit character to its English word; used to spell out
# all-digit tokens one digit at a time.
num2word = {'0':'zero', '1':'one', '2':'two', '3':'three', '4':'four',
            '5':'five', '6':'six', '7':'seven', '8':'eight', '9':'nine'}

def dealPunctNumLemma(text):
    """
    Lower-case text, remove punctuation, spell out digits and lemmatize it sentence by sentence
    ----------------
    Arguments:

        text  a string, the text to normalise

    ----------------
    Returns:

        processed  a string with one line per sentence found by TextBlob;
                   every token on a line is followed by a single space
    """
    blob = Blobber()(text.lower())

    lines = []
    for sentence in blob.sentences:
        # Replace everything except ASCII letters, digits and '_' with a space.
        cleaned = re.sub(r'[^a-zA-Z0-9_]', ' ', sentence.raw)

        # Drop stray single letters (the article 'a' is kept); split() also
        # collapses any run of whitespace left behind by the substitution.
        tokens = [tok for tok in cleaned.split()
                  if not (len(tok) == 1 and tok.isalpha() and tok != 'a')]

        # Spell all-digit tokens out digit by digit, e.g. '42' -> 'four two'.
        tokens = [' '.join(num2word[digit] for digit in tok) if tok.isnumeric() else tok
                  for tok in tokens]

        # Re-tag the cleaned sentence so each word can be lemmatized with its POS.
        tagged = list(TextBlob(' '.join(tokens)).tags)

        lemmas = []
        for vocab, tag in tagged:
            word = Word(vocab)
            try:
                # First letter of the tag, lower-cased, is used as the POS hint.
                lemma = word.lemmatize(tag[0].lower())
            except Exception:
                # Hint not usable (e.g. empty or rejected tag): fall back to the
                # lemmatizer's default. KeyboardInterrupt/SystemExit still propagate.
                lemma = word.lemmatize()
            lemmas.append(lemma)

        lines.append(''.join(lemma + ' ' for lemma in lemmas) + '\n')

    return ''.join(lines)

# Lines containing one of these attribution phrases are dropped entirely.
_QUOTE_RE = re.compile(r'(writes in|writes:|wrote:|says:|said:'
                       r'|^In article|^Quoted from)')

def custom_quoting(rawtext):
    """
    Remove quote attribution lines and leading quote markers from a message body
    ----------------
    Arguments:

        rawtext  a string, the message body

    ----------------
    Returns:

        a string, rawtext with a leading '|' or '>' on every line replaced by a
        space and every line matching _QUOTE_RE removed
    """
    # re.MULTILINE makes '^' match at the start of every line; without it only
    # a marker at the very beginning of the whole text would be replaced.
    text = re.sub(r'(^\||^>)', ' ', rawtext, flags=re.MULTILINE)
    good_lines = [line for line in text.split('\n')
                  if not _QUOTE_RE.search(line)]
    return '\n'.join(good_lines)

def cleanText(rawstring):
    """
    Clean one raw message: normalise its header and its body separately
    ----------------
    Arguments:

        rawstring  a string, header and body separated by the first blank line

    ----------------
    Returns:

        a string, the normalised header, one extra newline, then the normalised body
    """
    # Everything before the first blank line is the header (ref: strip_newsgroup_header)
    header, _sep, body = rawstring.partition('\n\n')

    # Body: strip quoting (ref: strip_newsgroup_quoting), then flatten to a single line
    flat_body = " ".join(custom_quoting(body).split('\n'))

    # Header: discard 'Lines:' fields and join the rest with "! "
    kept_heads = [field for field in header.split('\n') if not field.startswith('Lines:')]
    clean_header = dealPunctNumLemma("! ".join(kept_heads))

    return clean_header + '\n' + dealPunctNumLemma(flat_body)

## 初始化操作

In [None]:
# 初始化即将用于记录数据的数据结构

idpart = {}

# Column labels of the experiment record: fixed fields first, then one timing
# label per (representation, phase) pair.
record_labels = ['tool.word', 'useStopwords', 'classifier', 'micro-F1'] #'accuracy', 'macro-F1', 
record_labels.extend(
    'time.{}.{}'.format(labelpart, ilabel)
    for labelpart in ('format', 'raw')
    for ilabel in ('train', 'evaluate', 'all')
)

# One independent, initially empty dict per label.
record_data = {}
for record_label in record_labels:
    record_data.update({record_label: {}})

summary = 'Create:\nidpart:{} \n record_data:{}\n record_labels:{}'
print(summary.format(idpart, record_data, record_labels))
print('\nData structure for recording experiment data created.\n')

In [None]:
# 文本清洗

# vecsize: vector dimension
# casesize: how many groups will be used in the classification
vecsize = 500
casesize = 20

# paths: dict of some important path
paths = {}
paths['dir.dataroot'] =  os.path.join(os.getcwd(), '..', 'data')
paths['dir.train'] = os.path.join(paths['dir.dataroot'], 'trialdata', 'train')
paths['dir.test'] = os.path.join(paths['dir.dataroot'], 'trialdata', 'test')
        
# if the source data doesn't exist in the target directory, copy it from the source data directory
# (the presence of the 'existedFlag' marker file means the copy was already done)
existedFlag = os.path.join(paths['dir.dataroot'], 'existedFlag')
if not os.path.isfile(existedFlag):
    paths['dir.src_dataroot'] =  os.path.join(paths['dir.dataroot'], 'srcdata', '20news-bydate')
    paths['dir.src_train'] =  os.path.join(paths['dir.src_dataroot'], '20news-bydate-train')
    paths['dir.src_test'] =  os.path.join(paths['dir.src_dataroot'], '20news-bydate-test')
    # pick `casesize` random class folders; the same folder names are used for train and test
    dirs_all = os.listdir(paths['dir.src_train'])
    dirs_randInds = random.sample(range(len(dirs_all)), casesize)
    dirs_rands = [dirs_all[i] for i in dirs_randInds]
    for tpart in ['train', 'test']:
        for dname in dirs_rands:
            dpath = os.path.join(paths['dir.src_{}'.format(tpart)], dname)
            shutil.copytree(dpath,  os.path.join(paths['dir.{}'.format(tpart)], dname))
    # create the marker file without going through a shell: works on every
    # platform and with paths containing spaces or shell metacharacters
    with open(existedFlag, 'a'):
        pass

print("Pick random folders from source successfully")

# Clean every train/test file in place, once; the 'preprocessed' marker file
# records that this step has already been run.
preprocessedFlag = os.path.join(paths['dir.dataroot'], 'preprocessed')
if not os.path.isfile(preprocessedFlag):
    for tpart in ['train', 'test']:
        dirpath = paths['dir.{}'.format(tpart)]
        for cls in os.listdir(dirpath):
            clspath = os.path.join(dirpath, cls)
            files = os.listdir(clspath)
            for eachfile in files:
                fpath = os.path.join(clspath, eachfile)
                with open(fpath, 'r', encoding="latin-1") as f:
                    fcontent = f.read()
                # clean BEFORE reopening for writing: opening with 'w' truncates
                # the file, so a failure in cleanText must not lose the original
                newcontent = cleanText(fcontent)
                # write back with the same encoding the file was read with
                with open(fpath, 'w', encoding="latin-1") as f:
                    f.write(newcontent)
    # create the marker file without going through a shell: works on every
    # platform and with paths containing spaces or shell metacharacters
    with open(preprocessedFlag, 'a'):
        pass
                