In [None]:
import os
from LAC import LAC
lac = LAC(mode='seg')
import re
import shutil
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

In [None]:
# Corpus locations (relative paths, resolved against the working directory).
fp_train = 'Fudan/train'                # labelled documents used to fit the model
fp_test = 'Fudan/answer'                # labelled documents used to evaluate the model
fp_target_63 = 'targetdata/year1963'    # target articles from 1963
fp_target_90 = 'targetdata/year1990'    # target articles from 1990

Step One: construct functions needed for data preprocessing

In [None]:
# define needed functions

def clean_text(file_readlines):
    """Remove marker characters and surrounding whitespace from each line.

    Parameters
    ----------
    file_readlines : iterable of str
        The lines of one document, e.g. the result of ``file.readlines()``.

    Returns
    -------
    list of str
        One entry per input line, with every '【', '】', '＊' and newline
        character deleted and leading/trailing whitespace stripped. Lines
        that become empty are kept as empty strings.
    """
    # A translation table that maps a code point to None deletes that character.
    unwanted = dict.fromkeys(map(ord, ('【', '】', '\n', '＊')))

    cleaned = []
    for line in file_readlines:
        cleaned.append(line.translate(unwanted).strip())
    return cleaned


def seg_chinese(fp, encoding=None):
    """Read one Chinese document and return it as a single segmented string.

    The file is read line by line, cleaned with ``clean_text`` and segmented
    with the module-level ``lac`` segmenter. The tokens of all lines are then
    joined with single spaces into one string.

    Parameters
    ----------
    fp : str
        Path of the text file to segment.
    encoding : str or None, optional
        Encoding passed to ``open``. The default ``None`` keeps the previous
        behaviour (platform default encoding); pass the corpus encoding
        explicitly to make runs portable across machines.

    Returns
    -------
    str
        The space-joined tokens of the document. If the file cannot be opened
        or read (``OSError``), ``fp`` itself is returned instead.

    Notes
    -----
    Decoding failures raise ``UnicodeDecodeError``, which is not an
    ``OSError`` and therefore propagates to the caller.
    """
    try:
        with open(fp, 'r', encoding=encoding) as fh:
            raw_lines = fh.readlines()
    except OSError:
        # Best-effort fallback kept from the original implementation.
        # NOTE(review): the path then ends up in the corpus as if it were
        # document text -- confirm this is intended.
        return fp

    # Segmentation runs outside the try block so that an error raised by the
    # segmenter is never mistaken for an unreadable file.
    segmented = lac.run(clean_text(raw_lines))
    return ' '.join(token for line_tokens in segmented for token in line_tokens)

def dir_tag_pair(directory):
    """Map each category sub-directory of ``directory`` to its tag.

    An entry counts as a category when it is a directory whose name starts
    with 'C' and contains a hyphen. The tag is the text between the first and
    the second hyphen, so a name such as 'C3-Art' yields the tag 'Art'.

    Parameters
    ----------
    directory : str
        Directory that holds one sub-directory per category.

    Returns
    -------
    dict
        ``{path_of_sub_directory: tag}``, where each path is ``directory``
        joined with the entry name.
    """
    pairs = {}
    for name in os.listdir(directory):
        path = os.path.join(directory, name)
        # Plain files are skipped: callers list the contents of every key,
        # which would fail on anything that is not a directory.
        if not name.startswith('C') or not os.path.isdir(path):
            continue
        parts = name.split('-')
        if len(parts) < 2:
            # No hyphen means no tag can be derived (previously an IndexError).
            continue
        pairs[path] = parts[1]
    return pairs


def process_sub_directory(directory, tag):
    """Segment every text file of one category directory.

    Parameters
    ----------
    directory : str
        Directory whose entries ending in 'txt' are segmented.
    tag : object
        Label attached to every document found in ``directory``.

    Returns
    -------
    tuple of (list, list)
        ``(texts, tags)``: ``texts`` holds one ``seg_chinese`` result per
        matching file and ``tags`` repeats ``tag`` once per document, so both
        lists have the same length.
    """
    txt_paths = [os.path.join(directory, name)
                 for name in os.listdir(directory)
                 if name.endswith('txt')]
    texts = [seg_chinese(path) for path in txt_paths]
    tags = [tag] * len(texts)
    return texts, tags

def process_directory(directory):
    """Collect the segmented documents and tags of a whole corpus directory.

    Every category sub-directory reported by ``dir_tag_pair`` is processed
    with ``process_sub_directory`` and the results are concatenated.

    Parameters
    ----------
    directory : str
        Corpus root that contains the category sub-directories.

    Returns
    -------
    tuple of (list, list)
        ``(documents, tags)`` of equal length; the documents are the strings
        later passed to the CountVectorizer.
    """
    documents, labels = [], []
    for sub_dir, tag in dir_tag_pair(directory).items():
        sub_texts, sub_tags = process_sub_directory(sub_dir, tag)
        documents += sub_texts
        labels += sub_tags
    return documents, labels

Step Two: Train the Model

In [None]:
# Pre-process the training data: segment every document found under fp_train.
# `alltextdata` holds one space-joined token string per document and
# `y_train` the matching category tags, in the same order.
alltextdata, y_train =process_directory(fp_train)

In [None]:
# Turn the training documents into token-count vectors and train the model.
# `vectorizer` and `model` are reused by later cells: the test evaluation
# and predict_label() both call them.
vectorizer = CountVectorizer()
model = MultinomialNB()
# fit_transform learns the vocabulary from the training documents only.
x_train = vectorizer.fit_transform(alltextdata)
model.fit(x_train, y_train)

Step Three, Part 1: Test the Model

In [None]:
# Process the test data the same way as the training data: segmented
# documents in `test_alltextdata`, their category tags in `y_test`.
test_alltextdata, y_test = process_directory(fp_test)

In [None]:
# Vectorize the test documents with the already-fitted vectorizer
# (transform only, so the vocabulary stays the one learned from the training
# data) and predict a category for each document.
x_text = vectorizer.transform(test_alltextdata)
y_hat = model.predict(x_text)

Step Three, Part 2: Report Metrics

In [None]:
# Overall accuracy of the predictions on the test documents.
print('accuracy:', metrics.accuracy_score(y_test, y_hat))

In [None]:
# Per-category precision, recall and F1 score on the test documents.
print(metrics.classification_report(y_test, y_hat))

Step Four: Manually Assessing the External Validity of the Model

In [None]:
# Functions needed to process the target data (People's Daily newspaper text
# from 1963 and 1990).
def processing_target_data(fp, header_lines=6, encoding=None):
    """Load every Markdown article in a directory as a (title, sentences) pair.

    Parameters
    ----------
    fp : str
        Directory containing the article files; only entries whose name ends
        in 'md' are read.
    header_lines : int, optional
        Number of leading lines skipped before the article body starts.
        Defaults to 6, the value previously hard-coded here.
        NOTE(review): presumably the length of the per-file header -- confirm
        against the data files.
    encoding : str or None, optional
        Encoding passed to ``open``. ``None`` (default) keeps the platform
        default, as before.

    Returns
    -------
    list of tuple
        One ``(title, article)`` pair per non-empty file, in sorted file-name
        order. ``title`` is the first line of the file exactly as read
        (including its trailing newline). ``article`` is the list of remaining
        body lines, each stripped of surrounding whitespace and with every
        U+3000 (ideographic space) replaced by a full-width comma.
    """
    records = []
    # Sorted so that the row order (and any seeded sampling built on it)
    # does not depend on the file system's listing order.
    for name in sorted(os.listdir(fp)):
        if not name.endswith('md'):
            continue
        with open(os.path.join(fp, name), encoding=encoding) as fh:
            raw_lines = fh.readlines()
        if not raw_lines:
            # An empty file has no title line; skip it instead of raising
            # IndexError on raw_lines[0].
            continue
        body = [line.strip().replace('\u3000', '，')
                for line in raw_lines[header_lines:]]
        records.append((raw_lines[0], body))
    return records

def segment_target_data(the_tuple):
    """Segment loaded articles and return them as a DataFrame.

    Parameters
    ----------
    the_tuple : iterable of sequence
        Records whose element 0 is the title and element 1 the list of
        sentences, as produced by ``processing_target_data``.

    Returns
    -------
    pandas.DataFrame
        Columns 'title' and 'text', one row per record. Each 'text' cell is a
        one-element list holding the whole article as a single string of
        space-joined tokens.
    """
    rows = []
    for record in the_tuple:
        token_lists = lac.run(record[1])
        joined = ' '.join(token for tokens in token_lists for token in tokens)
        # Wrapped in a list so the cell can be handed directly to
        # vectorizer.transform, which expects an iterable of documents.
        rows.append((record[0], [joined]))

    return pd.DataFrame.from_records(rows, columns=['title','text'])


def predict_label(x):
    """Predict the label of the first document in ``x``.

    Parameters
    ----------
    x : iterable of str
        Documents to vectorize with the module-level ``vectorizer``; the
        notebook passes a one-element list per article.

    Returns
    -------
    The first prediction returned by the module-level ``model``.
    """
    features = vectorizer.transform(x)
    predictions = model.predict(features)
    return predictions[0]

In [None]:
# Pre-process the target data and apply the model to predict labels.
# Two DataFrames are produced, df63 for the 1963 dataset and df90 for the
# 1990 dataset, each with three columns: 'title', 'text' (a one-element list
# holding the segmented document) and 'predicted_label'.
target_63 = processing_target_data(fp_target_63)
# The result must be bound to df63: it used to be assigned to `df`, so the
# next line raised NameError on a fresh kernel.
df63 = segment_target_data(target_63)
df63['predicted_label'] = df63['text'].apply(predict_label)

target_90 = processing_target_data(fp_target_90)
df90 = segment_target_data(target_90)
df90['predicted_label'] = df90['text'].apply(predict_label)

Randomly select 30 samples from each dataset.

In [None]:
# Draw a reproducible random sample of up to 30 distinct documents per dataset.
# The previous np.random.choice call sampled WITH replacement (so the same
# article could appear several times) and no seed was set.
SAMPLE_SIZE = 30
SAMPLE_SEED = 42  # fixed so that Restart & Run All yields the same samples

# min(...) keeps the cell working when a dataset has fewer than SAMPLE_SIZE rows.
df63_sample = df63.sample(n=min(SAMPLE_SIZE, len(df63)), random_state=SAMPLE_SEED)
df90_sample = df90.sample(n=min(SAMPLE_SIZE, len(df90)), random_state=SAMPLE_SEED)

Save the samples into two CSV files.

In [None]:
# Write each sample to a CSV file in the working directory. The DataFrame
# index is written as the first column (to_csv default), so the original row
# positions stay available for the manual labelling step.
df63_sample.to_csv('df63_sample.csv')
df90_sample.to_csv('df90_sample.csv')

Manually inspect the samples, assign a human-determined label to each one, and save the results into two new files. Both files have the same four columns: title, text, predicted label, and true label.

In [None]:
# Load each hand-labelled sample file once (previously every file was parsed
# twice, once per column). index_col=0 restores the index column that
# to_csv wrote.
labelled_63 = pd.read_csv('df63_sample_tcsv.csv', index_col=0)
labelled_90 = pd.read_csv('df90_sample_tcsv.csv', index_col=0)

predicted_labels_63 = labelled_63['predicted_label'].tolist()
true_labels_63 = labelled_63['true_label'].tolist()

predicted_labels_90 = labelled_90['predicted_label'].tolist()
# NOTE(review): the 1990 file is read with 'True_label' (capital T) while the
# 1963 file uses 'true_label'. Kept as-is because the CSV headers are not
# visible here -- confirm against the files and unify the column name.
true_labels_90 = labelled_90['True_label'].tolist()

Compare the model-predicted labels with the human-determined ones and print the accuracy metrics.

In [None]:
# 1963 sample: agreement between the hand-assigned labels (treated as ground
# truth) and the model's predictions -- overall accuracy, then per-label report.
print(metrics.accuracy_score(true_labels_63, predicted_labels_63))
print(metrics.classification_report(true_labels_63, predicted_labels_63))

In [None]:
# 1990 sample: agreement between the hand-assigned labels (treated as ground
# truth) and the model's predictions -- overall accuracy, then per-label report.
print(metrics.accuracy_score(true_labels_90, predicted_labels_90))
print(metrics.classification_report(true_labels_90, predicted_labels_90))