In [29]:
pip install blueprints.exploration 

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement blueprints.exploration (from versions: none)
ERROR: No matching distribution found for blueprints.exploration


In [60]:
import tensorflow as tf
import pandas as pd
import sqlite3
import re
from collections import Counter
from tqdm import tqdm
tqdm.pandas()
import html

### GPU 설정

In [3]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0' : 
    raise SystemError('GPU device not found')
print(f'Found GPU at: {device_name}')

Found GPU at: /device:GPU:0


In [4]:
from tensorflow.python.client import device_lib

print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 12426198390806612010
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 2258003559
locality {
  bus_id: 1
  links {
  }
}
incarnation: 4514488689174884374
physical_device_desc: "device: 0, name: NVIDIA GeForce GTX 1650 Ti, pci bus id: 0000:01:00.0, compute capability: 7.5"
xla_global_id: 416903419
]


#### 데이터 로드하기

In [5]:
post_df = pd.read_csv('./data/rspct_autos.tsv.gz', sep='\t')
subred_file = pd.read_csv('./data/subreddit_info.csv.gz').set_index(['subreddit'])

df = post_df.join(subred_file, on='subreddit')

In [6]:
df.head()

Unnamed: 0,id,subreddit,title,selftext,category_1,category_2,category_3,in_data,reason_for_exclusion
0,8f73s7,Harley,No Club Colors,Funny story. I went to college in Las Vegas. T...,autos,harley davidson,,True,
1,5s0q8r,Mustang,Roush vs Shleby GT500,"I am trying to determine which is faster, and ...",autos,ford,,True,
2,5z3405,Volkswagen,2001 Golf Wagon looking for some insight,Hello! <lb><lb>Trying to find some information...,autos,VW,,True,
3,7df18v,Lexus,IS 250 Coolant Flush/Change,https://www.cars.com/articles/how-often-should...,autos,lexus,,True,
4,5tpve8,volt,Gen1 mpg w/ dead battery?,"Hi, new to this subreddit. I'm considering bu...",autos,chevrolet,,True,


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    20000 non-null  object
 1   subreddit             20000 non-null  object
 2   title                 20000 non-null  object
 3   selftext              20000 non-null  object
 4   category_1            20000 non-null  object
 5   category_2            20000 non-null  object
 6   category_3            0 non-null      object
 7   in_data               20000 non-null  bool  
 8   reason_for_exclusion  0 non-null      object
dtypes: bool(1), object(8)
memory usage: 1.2+ MB


In [8]:
df['in_data'].value_counts()

True    20000
Name: in_data, dtype: int64

#### 속성 이름 표준화

In [9]:
# 열 목록 확인하기
print(df.columns)

Index(['id', 'subreddit', 'title', 'selftext', 'category_1', 'category_2',
       'category_3', 'in_data', 'reason_for_exclusion'],
      dtype='object')


In [10]:
# 현재 열 이름 새 이름으로 매핑하기
# None으로 매핑된 열과 언급되지 않은 열은 삭제됨.
column_mapping = {
    'id': 'id',
    'subreddit': 'subreddit',
    'title': 'title',
    'selftext': 'text',
    'category_1': 'category',
    'category_2': 'subcategory',
    'category_2': None,           # 데이터가 없다.
    'in_data': None,              # 필요없다.
    'reason_for_exclusion': None  # 필요없다.
}

# 나머지 열들을 정의하기
columns = [c for c in column_mapping.keys() if column_mapping[c] != None]

# 열들을 선택하고 이름 바꾸기
df = df[columns].rename(columns=column_mapping)

In [11]:
df.head()

Unnamed: 0,id,subreddit,title,text,category
0,8f73s7,Harley,No Club Colors,Funny story. I went to college in Las Vegas. T...,autos
1,5s0q8r,Mustang,Roush vs Shleby GT500,"I am trying to determine which is faster, and ...",autos
2,5z3405,Volkswagen,2001 Golf Wagon looking for some insight,Hello! <lb><lb>Trying to find some information...,autos
3,7df18v,Lexus,IS 250 Coolant Flush/Change,https://www.cars.com/articles/how-often-should...,autos
4,5tpve8,volt,Gen1 mpg w/ dead battery?,"Hi, new to this subreddit. I'm considering bu...",autos


In [12]:
# 데이터를 자동차 범주로 제한하기
df = df[df['category'] == 'autos']

In [13]:
# pd.options.display.max_colwidth = None ###
pd.options.display.max_colwidth = 100 ###
df.sample(1, random_state=7).T

Unnamed: 0,14356
id,7jc2k4
subreddit,volt
title,Dashcam for 2017 volt
text,Hello.<lb>I'm looking into getting a dashcam. <lb>Does anyone have any recommendations? <lb><lb>...
category,autos


#### 데이터프레임 저장 및 로드

In [14]:
df.to_pickle('reddit_dataframe.pkl')

In [15]:
db_name = 'reddit-selfposts.db'
con = sqlite3.connect(db_name)
df.to_sql('posts', con, index=False, if_exists='replace')
con.close()

In [16]:
# 데이터 프레임 복원하기
con = sqlite3.connect(db_name)
df = pd.read_sql('select * from posts', con)
con.close()

### 텍스트 데이터 정리

#### 정규 표현식으로 노이즈 식별

In [17]:
text = """
After viewing the [PINKIEPOOL Trailer](https://www.youtu.be/watch?v=ieHRoHUg)
it got me thinking about the best match ups.
<lb>Here's my take:<lb><lb>[](/sp)[](/ppseesyou) Deadpool<lb>[](/sp)[](/ajsly)
Captain America<lb>"""

In [18]:
print(text)


After viewing the [PINKIEPOOL Trailer](https://www.youtu.be/watch?v=ieHRoHUg)
it got me thinking about the best match ups.
<lb>Here's my take:<lb><lb>[](/sp)[](/ppseesyou) Deadpool<lb>[](/sp)[](/ajsly)
Captain America<lb>


In [19]:
RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')

def impurity(text, min_len = 10) : 
    '''텍스트에서 의심스러운 문자의 비율을 반환'''
    if text == None or len(text) < min_len : 
        return 0 
    else : 
        return len(RE_SUSPICIOUS.findall(text)) / len(text)
    
print(impurity(text))

0.09009009009009009


In [20]:
# 데이터프레임에 새 열 추가하기
df['impurity'] = df['text'].apply(impurity, min_len=10)

# 상위 3개 레코드 가져오기
df[['text', 'impurity']].sort_values(by='impurity', ascending=False).head(3)

Unnamed: 0,text,impurity
19682,Looking at buying a 335i with 39k miles and 11 months left on the CPO warranty. I asked the deal...,0.214716
12357,I'm looking to lease an a4 premium plus automatic with the nav package.<lb><lb>Vehicle Price:<ta...,0.165099
2730,Breakdown below:<lb><lb>Elantra GT<lb><lb>2.0L 4-cylinder<lb><lb>6-speed Manual Transmission<lb>...,0.13913


##### count_words 구현

In [58]:
def count_words(df, column='tokens', preprocess=None, min_freq=2) : 
    # 토큰 처리 및 counter 업데이트
    def update(doc) : 
        tokens = doc if preprocess is None else preprocess(doc)
        counter.update(tokens)

    # counter생성 및 모든 데이터에서 적용
    counter = Counter()
    df[column].progress_map(update)

    # counter를 데이터프레임으로 전달(transform)
    freq_df = pd.DataFrame.from_dict(counter, orient='index', columns=['freq'])
    freq_df = freq_df.query('freq >= @min_freq')
    freq_df.index.name = 'token'
    return freq_df.sort_values('freq', ascending=False)

In [59]:
# 정규 표현식으로 다른 태그가 있는지 확인하기
# <[\w/]*> : HTML과 유사한 구문 찾기(알파벳, 숫자, 밑줄 또는 \(슬래시)중 하나 찾기)
count_words(df, column='text', preprocess=lambda t: re.findall(r'<[\w/]*>', t))

  0%|          | 0/20000 [00:00<?, ?it/s]

100%|██████████| 20000/20000 [00:00<00:00, 75196.45it/s]


Unnamed: 0_level_0,freq
token,Unnamed: 1_level_1
<lb>,100729
<tab>,642


#### 정규 표현식으로 노이즈 제거

In [None]:
def clean(text) : 
    # &amp와 같은 html 이스케이프를 문자로 변환한다.
    text = html.unescape(text)
    # <tap>과 같은 태그를 공백으로 변환하기
    text = re.sub(r'<[^<>]*>', ' ', text)
    # [Some text](https://...)와 같은 마크다운 URL을 공백으로 변환한다.
    # r'\1 : 첫 번째 그룹에 일치하는 내용 반환
    text = re.sub(r'\[([^\[\]]*)\]\([^\(\)]*\)', r'\1', text)
    # [0]과 같은 괄호 안의 텍스트 또는 코드를 공백으로 변환한다.
    text = re.sub(r'\[[^\[\]]*\]', ' ', text)
    # 특수 문자로만 구성된 문자열을 공백으로 변환한다. 
    # &#은 변환되지만 #cool은 변환되지 않음.
    