# CNN데이터 전처리

In [None]:
import unicodedata
import re
import numpy as np
import pandas as pd
from os import listdir


# load all stories in a directory
def load_stories(directory):
    """Load every file in *directory* and split it into story and highlights.

    Each entry returned by ``listdir(directory)`` is read with ``load_doc``
    and divided with ``split_story``.

    Returns:
        A list of dicts, one per file, each with the keys ``'story'`` and
        ``'highlights'``.
    """
    print('stories loading')
    loaded = []

    for entry in listdir(directory):
        # Paths are joined with '/' (e.g. <directory>/<hash>.story).
        text = load_doc(directory + '/' + entry)
        body, summary = split_story(text)
        # Records are stored as dictionaries so they map directly onto
        # DataFrame columns later on.
        loaded.append(dict(story=body, highlights=summary))

    return loaded


# load doc into memory
def load_doc(filename):
    """Return the full text of *filename*, decoded as UTF-8.

    The file is opened read-only inside a ``with`` block so the handle is
    closed even when reading or decoding raises.

    Args:
        filename: Path of the file to read.

    Returns:
        The entire file content as a single string.
    """
    with open(filename, encoding='utf-8') as file:
        return file.read()

    
# split a document into news story and highlights
def split_story(doc):
    """Split a raw document into its story text and a highlights string.

    Everything before the first ``@highlight`` marker is the story; blank-line
    breaks (``"\\n\\n"``) in it are replaced by single spaces so the story
    becomes one line. The text after the marker is split on further
    ``@highlight`` markers, each piece is stripped, and the non-empty pieces
    are joined as ``"first. second."``.

    Args:
        doc: Full text of one story file.

    Returns:
        A ``(story, highlights_string)`` tuple. ``highlights_string`` is the
        empty string when the document has no highlight text.
    """
    marker = '@highlight'

    # partition() leaves the whole document as the story when the marker is
    # missing. Slicing with the -1 returned by find() would instead cut off
    # the last character and turn it into a bogus highlight.
    story, _, remainder = doc.partition(marker)
    story = story.replace("\n\n", " ").strip()

    # Strip first and filter afterwards, so whitespace-only segments are
    # dropped rather than kept as empty highlights.
    highlights = [h.strip() for h in remainder.split(marker)]
    highlights = [h for h in highlights if h]

    if not highlights:
        # Nothing to summarize: return '' instead of a lone '.'.
        return story, ''

    # Merge the separate highlight sentences into one string.
    highlights_string = '. '.join(highlights) + '.'
    return story, highlights_string
 

    
    
# clean a single story string
def clean_lines_story(line):
    """Drop the leading dateline/source tag from a story, then normalize it.

    Only the first 50 characters are searched, because the unwanted prefix
    sits at the very start of the text. Two prefix forms are handled, in this
    order of priority:

    1. ``'-- '``  (e.g. ``"<place> (CNN) -- "`` or ``"(CNN) -- "``)
    2. ``'(CNN)'`` on its own

    Everything up to and including the first marker found is removed. If
    neither marker is present the text is left untouched. The result is
    passed through ``preprocess_sentence``.
    """
    for marker in ('-- ', '(CNN)'):
        position = line.find(marker, 0, 50)
        if position > -1:
            # Skip past the marker itself as well.
            line = line[position + len(marker):]
            break

    return preprocess_sentence(line)
    
# clean a single highlights string
def clean_lines_highlight(line):
    """Drop a leading ``NEW:`` tag from a highlights string, then normalize it.

    Only the first 10 characters are searched for ``'NEW:'``. When it is
    found, everything up to and including the tag is removed; otherwise the
    text is left unchanged. The result is passed through
    ``preprocess_sentence``.
    """
    tag = 'NEW:'
    position = line.find(tag, 0, 10)
    trimmed = line if position < 0 else line[position + len(tag):]
    return preprocess_sentence(trimmed)
  
    
def preprocess_sentence(line):
    """Normalize a sentence: pad punctuation, filter characters, lowercase.

    The substitutions are applied in order:

    1. Put a space on both sides of ``? . ! , ¿``
       (e.g. ``"he is a boy."`` -> ``"he is a boy ."``).
       Reference: https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    2. Collapse every run of spaces and double quotes into one space.
    3. Replace every run of characters other than ASCII letters, digits and
       ``? . ! , @ % $ ¿`` with one space.

    Returns:
        The transformed text, lowercased, with surrounding whitespace removed.
    """
    substitutions = (
        (r"([?.!,¿])", r" \1 "),
        (r'[" "]+', " "),
        (r"[^a-zA-Z0-9?.!,@%$¿]+", " "),
    )
    for pattern, replacement in substitutions:
        line = re.sub(pattern, replacement, line)

    return line.lower().strip()
    
    
# Load the raw stories from the local dataset directory.
directory = 'C:/Users/Jeyoung/Desktop/dataset/cnn_stories/cnn/stories'
stories = load_stories(directory)
print('Loaded Stories %d' % len(stories))

# Clean every record in place. Stories are held as a list of dictionaries
# (the alternative of two parallel lists is not used).
print('proprocessing')
for record in stories:
    record['story'], record['highlights'] = (
        clean_lines_story(record['story']),
        clean_lines_highlight(record['highlights']),
    )

# Export the cleaned corpus: one row per story, without the index column.
df = pd.DataFrame(stories)
print(df.describe())
df.to_csv('cnn_stories_all.csv', index=False)


# 중복값 체크1 (중복 제거 후 필요한만큼 데이터 저장)

In [2]:
def _print_row_counts(frame, title, blank_line=True):
    """Print *title* followed by the lengths of the two text columns."""
    print(title)
    print(len(frame['story']))
    if blank_line:
        print(len(frame['highlights']), '\n')
    else:
        print(len(frame['highlights']))


# Read the cleaned corpus and drop rows that have a missing value.
duplicate_check = pd.read_csv('cnn_stories_all.csv').dropna()
_print_row_counts(duplicate_check, '중복체크 이전')

# Remove rows whose story text repeats an earlier row.
duplicate_check = duplicate_check.drop_duplicates(subset='story')
_print_row_counts(duplicate_check, 'story 중복체크 후 데이터 갯수')

# Then remove rows whose highlights repeat an earlier row.
duplicate_check = duplicate_check.drop_duplicates(subset='highlights')
_print_row_counts(duplicate_check, 'highlights 중복체크 후 데이터 갯수', blank_line=False)

# The de-duplicated data overwrites the file it was read from.
duplicate_check.to_csv('cnn_stories_all.csv', index=False)

중복체크 이전
92465
92465 

story 중복체크 후 데이터 갯수
89360
89360 

highlights 중복체크 후 데이터 갯수
88640
88640


# CNN 데이터 랜덤하게 섞고 6만개 데이터 저장(train+test)

In [3]:
# Author's note on row counts: 92579 originally -> 92465 after dropping NaN
# -> 88640 after removing duplicates.
cnn_articles = pd.read_csv('cnn_stories_all.csv').dropna()
print(len(cnn_articles['story']))

# sample(frac=1) returns all rows in random order; reset_index(drop=True)
# renumbers them without keeping the old index as a column.
shuffled_articles = cnn_articles.sample(frac=1).reset_index(drop=True)

# Keep the first 60000 shuffled rows and save them.
cnn_articles = shuffled_articles.iloc[:60000]
cnn_articles.to_csv('cnn_stories.csv', index=False)

88640


In [4]:
# Null check: for each column, report whether it contains any missing value.
cnn_articles.isnull().any()

highlights    False
story         False
dtype: bool

# train, test 분류(5만, 1만)

In [5]:
# Reload the shuffled sample and cut it at row 50000: the rows before the cut
# form the training set, the remaining rows form the test set.
cnn_articles = pd.read_csv('cnn_stories.csv')
split_at = 50000
cnn_articles_train = cnn_articles.iloc[:split_at]
cnn_articles_test = cnn_articles.iloc[split_at:]

cnn_articles_train.to_csv('cnn_stories_train.csv', index=False)
cnn_articles_test.to_csv('cnn_stories_test.csv', index=False)

# Sanity check: size of each split.
print(len(cnn_articles_train['highlights']))
print(len(cnn_articles_test['story']))

50000
10000


In [None]:
# Spot-check one record of the test split (the row at position 377).
print(cnn_articles_test.iloc[377])

# 길이구하기

In [None]:
# Word-count statistics (min / max / mean) for stories and highlights.
# Words are counted by splitting on single spaces.
story = pd.read_csv('cnn_stories_all.csv').dropna()

# str() guards against any cell that is not a string; both columns are now
# treated the same way (previously only the story column was guarded).
story_count_list = [len(str(s).split(' ')) for s in story['story']]
print(min(story_count_list))
print(max(story_count_list))
print(sum(story_count_list)/len(story_count_list),'\n')

h_count_list = [len(str(h).split(' ')) for h in story['highlights']]
print(min(h_count_list))
print(max(h_count_list))
print(sum(h_count_list)/len(h_count_list))

In [None]:
# Find stories whose word count is exactly 1, print their positions, then
# print the stories themselves.
weird_index = [i for i, count in enumerate(story_count_list) if count == 1]
for i in weird_index:
    print(i)

# Index straight into the column values instead of testing every row against
# the list of flagged positions (which was quadratic in the worst case).
story_texts = list(story['story'])
for i in weird_index:
    print(story_texts[i])


# Stopwords 제거

In [1]:
import nltk
# Fetch the NLTK 'punkt' resource.
# NOTE(review): the stopword-removal cell reads nltk.corpus.stopwords, which
# presumably needs the separate 'stopwords' corpus -- confirm it is installed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jeyoung\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
from nltk.corpus import stopwords 
import pandas as pd

# Remove English stopwords from both text columns and save the result.
# NOTE: despite the variable names, this run reads train_data.csv and writes
# train_data_stopwords.csv.
test = pd.read_csv('train_data.csv').dropna()
new_stopwords = set(stopwords.words('english'))

articles = list()


def _strip_stopwords(text, stopword_set):
    """Return *text* with every space-separated token in *stopword_set* removed."""
    return ' '.join(w for w in text.split(" ") if w not in stopword_set)


# One shared helper replaces the two copy-pasted filter loops. It also keeps
# per-row temporaries local, so notebook-level names such as `story` are no
# longer overwritten by this cell.
print('story....')
stopwords_story_list = [_strip_stopwords(t, new_stopwords) for t in test['story']]

print('highlights....')
stopwords_highlight_list = [_strip_stopwords(t, new_stopwords) for t in test['highlights']]

print('csv...')
test_data_stopwords = pd.DataFrame(list(zip(stopwords_highlight_list, stopwords_story_list)), columns =['highlights', 'story']) 
test_data_stopwords.to_csv('train_data_stopwords.csv', index=False) 
    
    
    
# stopwords 제거 후 평균값 구하기!   
#print(tokenized_story_list)
#print(tokenized_highlight_list)
#print('story       -  max : ', max(tokenized_story_list), ' / min : ', min(tokenized_story_list), ' / mean : ', sum(tokenized_story_list)/len(tokenized_story_list))
#print('highlights  -  max : ', max(tokenized_highlight_list), ' / min : ', min(tokenized_highlight_list), ' / mean : ', sum(tokenized_highlight_list)/len(tokenized_highlight_list))

story....
highlights....
csv...


In [None]:
# Show the English stopword list that the stopword-removal cell filters out.
print(stopwords.words('english'))
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
