In [1]:
import re
import pandas as pd
import numpy as np
import csv
from __future__ import division

In [2]:
# Random title generated by an LSTM trained on 20,000 astrobiology paper titles:
# https://github.com/dbouquin/DATA_698/blob/master/Final_Datasets/astrobiology/astrobio_20000/astrobio_20000_sample.txt
# The sampled title contains an invisible U+0081 control character between the
# first 'ク' and 'お' of its non-ASCII run (it shows up as '\xc2\x81' once the
# string is UTF-8 encoded).  It is spelled as an explicit escape here so that an
# editor or a copy/paste cannot silently drop it and change the byte counts.
fake_title = (
    u'獲得獲ルけΦ ˇ>織た⊕°σをŤωκ体∙>ク'
    u'\x81'
    u'おクaes hogkts nohumanotrops stars: A electrical phase lider-Self-rackin'
)

In [3]:
# Delete every character in the 7-bit ASCII range, keeping only the non-ASCII ones.
ascii_pattern = re.compile('[\x00-\x7f]')
nonascii = ascii_pattern.sub('', fake_title)

In [4]:
# Display the non-ASCII remainder of the fake title.
print(nonascii)

獲得獲ルけΦˇ織た⊕°σをŤωκ体∙クおク


In [5]:
# Show the UTF-8 byte representation of the non-ASCII characters.
nonascii.encode('utf-8')

'\xe7\x8d\xb2\xe5\xbe\x97\xe7\x8d\xb2\xe3\x83\xab\xe3\x81\x91\xce\xa6\xcb\x87\xe7\xb9\x94\xe3\x81\x9f\xe2\x8a\x95\xc2\xb0\xcf\x83\xe3\x82\x92\xc5\xa4\xcf\x89\xce\xba\xe4\xbd\x93\xe2\x88\x99\xe3\x82\xaf\xc2\x81\xe3\x81\x8a\xe3\x82\xaf'

In [6]:
# Size of the non-ASCII part of the title, measured in UTF-8 bytes
# (a multi-byte character therefore contributes more than 1 to the count).
nonascii_utf8 = nonascii.encode('utf-8')
nonascii_count = len(nonascii_utf8)
nonascii_count

58

In [7]:
# Size of the complete fake title, measured in UTF-8 bytes.
fake_title_utf8 = fake_title.encode('utf-8')
fake_title_count = len(fake_title_utf8)
fake_title_count

129

In [8]:
# Share of the fake title's UTF-8 bytes that belong to non-ASCII characters.
# `/` is true division here because of `from __future__ import division`.
nonascii_count/fake_title_count

0.4496124031007752

### ~45% of the sampled fake title is non-ASCII when measured in UTF-8 bytes (58 of 129 bytes). Now we can check the proportion of non-ASCII content in the full training set:

In [9]:
# Load the whole training corpus and join its lines into one long string.
with open('astrobio_20000.txt', 'r') as corpus_file:
    corpus_text = corpus_file.read()
astrobio_20000 = corpus_text.replace('\n', '')

In [10]:
# Strip everything in the 7-bit ASCII range from the corpus; what is left is
# the non-ASCII content only.
training_nonascii = re.compile('[\x00-\x7f]').sub('', astrobio_20000)

In [11]:
# Display all non-ASCII content found in the training corpus.
print(training_nonascii)

Φβ−·βββπ®αβ′κγ≤≤′───βμα─öαü′λλλδøβí—──′′′″′′βöşℓ─★··′√√°β−α──éδδ —′ö─βáčáγ—    σπβçé’∊éμ’Å’μ δδ─—×─öα×ōéΦα′éβα──∊’βαγ—ακöααααββκββγκ—…°ôααγïβγββββëöéβββκö─öéΔéăăâ‘’́′’—éé′δ“”©—’βαβαβ°σβακαβαβκββκκαβκββγüαα—─—μβαβαββαβαïβ→→βüβββαββαβααβçκλα′Śąα∙∊öéμéπσβ─—éδα°°é“”′°őé′é‘’τβøππσααββββγëδακ—βαα°γβββδκ—ÇÇö⟨β⟨⟩⟩≤≤“”αα′′′″··⋯π′′κ’μ’──ββ’κ’〈ββκκαβμγβκββ——çκΒééδäööδβ€∊—öéé°çå°°⊕⋯πőéα‒‒→σαδκ öβ‘’“”κ’’’—’γγβδγδéí×βκαα─ββα’ñ’µééÅμ′′βπωαεøñδ°ġ—‘’·α°ö°βγ’μβ′′βαβγ‒’’αβ±с′′απ β′′′αμ’——─ββöαα’−−α’ββ─αππ〉μ’êáí ×─µµγōōø∼γα′éööνββöβ···′áδ‐γéδΔöγôβ†βπ′ππ“”“”β®π·′′βαδ·ακ′βéβ 生体エネルギー変換にかかわる生体超分子複合体の構造研究’γκ’ακ——β—é ρ—ááβ’“”’“”δδ〈〉〈〉öéβöé×ãöâÂˇ─バクテリアにおけるヘムの獲得と輸送の構造生物学キナーゼを標的とした構造生物学および創薬の現状á×ááéé蛍光信号を利用したゲノム・生体情報の測定手法の現状と展望装置を用いた生体組織解凍過程の次元モニタリング°’αα′é®βγö×ñöðó〉ó        öα─δ™’éñçβγν“”ĹŤÄŤÄÄŤé—ñδδéδ〉×δδΔñêáíčμ—βöððδí°́δááμÅ’Φγβμμ⋯′′′βα°—′     ’öíñ—δδü…°°βçññδä—αč ′′öðδδβ—ńââ—‘’δöβ‘’π→≡←‘’“ ”αéΒ′—ôèé‘’‘’—αöμöδδκ ’βα×α××’‘’μ〈π


In [12]:
# Decode the UTF-8 byte string into a unicode string.
# NOTE(review): `training_nonascii1` is not used by any of the cells shown
# below; the count that follows is taken on the undecoded byte string.
training_nonascii1 = training_nonascii.decode('utf-8')

In [13]:
# Length of the non-ASCII remainder of the corpus.
# NOTE(review): `training_nonascii` is the undecoded string (it is `.decode`d
# separately above), so this is a count of UTF-8 bytes, not of characters.
training_nonascii_count = len(training_nonascii)
training_nonascii_count

2226

In [14]:
# Total length of the training corpus (newlines already removed), measured the
# same way as `training_nonascii_count` so the two are comparable.
training_char_count = len(astrobio_20000)
training_char_count

1948599

In [15]:
# Fraction of the training corpus that is non-ASCII.
# `/` is true division here because of `from __future__ import division`.
training_nonascii_count/training_char_count

0.001142359202688701

### Only a small percentage of the overall training set is non-ascii, but what happens when we look at the average number of nonascii characters in individual titles within the training set?

In [47]:
# Read the training corpus as a list of titles, one per line, without the
# trailing newline.  The `with` block closes the file handle as soon as the
# list is built instead of leaving it open until garbage collection.
with open('astrobio_20000.txt') as titles_file:
    lines = [line.rstrip('\n') for line in titles_file]
lines[2]

'"Mathematical interpretation of Brownian motor model: Limit cycles and directed transport phenomena"'

In [48]:
# For every title keep only its non-ASCII content; titles that are pure ASCII
# become empty strings.
nonascii_lines = [re.sub('[\x00-\x7f]', '', line) for line in lines]

In [49]:
# drop empty elements (titles with no non-ascii characters)
nonascii_notempty = [remainder for remainder in nonascii_lines if remainder]
# 609 titles contain non ascii characters (about 3% of training corpus)
count_nonascii_notempty = len(nonascii_notempty)

In [58]:
# Mean length of the non-ASCII remainder, taken over the titles that have one.
# `/` is true division because of `from __future__ import division`.
total_nonascii_20K = sum(len(remainder) for remainder in nonascii_notempty)
actual_nonascii_20K = total_nonascii_20K/count_nonascii_notempty
actual_nonascii_20K

3.6551724137931036

### On average, within the 20,000-title training set, when a title contains non-ASCII characters, it contains ~3.7 bytes of them (the lengths are taken on the UTF-8 byte strings read from the file, so a multi-byte character counts more than once).
### Repeat the calculation for the titles sampled from the model trained on the 20K astrobiology dataset to see the ratio between actual (training data) and sampled non-ASCII content.

In [68]:
# Repeat the per-title calculation for the titles sampled from the model that
# was trained on the 20K dataset.
# The `with` block closes the file handle as soon as the list is built.
with open('astrobio_20000_sample.txt') as sample_file:
    lines20K = [line.rstrip('\n') for line in sample_file]

# For every sampled title keep only its non-ASCII content.
nonascii_lines20K = [re.sub('[\x00-\x7f]', '', line) for line in lines20K]

# drop empty elements (sampled titles with no non-ascii characters)
nonascii_notempty20K = [remainder for remainder in nonascii_lines20K if remainder]
# number of sampled titles that contain non-ascii characters
count_nonascii_notempty20K = len(nonascii_notempty20K)

# Mean length of the non-ASCII remainder over the sampled titles that have one.
# NOTE(review): the lines are read as byte strings, so `len` counts UTF-8 bytes.
sample_nonascii_20K = sum(len(remainder) for remainder in nonascii_notempty20K)/count_nonascii_notempty20K
sample_nonascii_20K

13.722222222222221

In [69]:
# Ratio of the sampled-title average to the training-title average (20K model).
sample_nonascii_20K / actual_nonascii_20K 

3.7541928721174

### Among titles that contain any non-ASCII characters, the titles sampled from the model trained on the 20K dataset contain on average 3.75 times as much non-ASCII content as the titles in the 20K dataset itself

In [74]:
# 10K training set
# The `with` blocks close each file handle as soon as its list is built.
with open('astrobio_10000.txt') as train_file:
    train_lines10K = [line.rstrip('\n') for line in train_file]

# For every training title keep only its non-ASCII content.
train_nonascii_lines10K = [re.sub('[\x00-\x7f]', '', line) for line in train_lines10K]

# drop empty elements (titles with no non-ascii characters)
train_nonascii_notempty10K = [remainder for remainder in train_nonascii_lines10K if remainder]
train_count_nonascii_notempty10K = len(train_nonascii_notempty10K)

# Mean length of the non-ASCII remainder over the training titles that have one.
actual_nonascii_10K = sum(len(remainder) for remainder in train_nonascii_notempty10K)/train_count_nonascii_notempty10K

# samples
# The sample pass keeps the names this cell has always left behind
# (`lines10K`, `nonascii_lines10K`, ...); the training pass above uses its own
# `train_*` names so its intermediates are no longer overwritten.
with open('astrobio_10000_sample.txt') as sample_file:
    lines10K = [line.rstrip('\n') for line in sample_file]

# For every sampled title keep only its non-ASCII content.
nonascii_lines10K = [re.sub('[\x00-\x7f]', '', line) for line in lines10K]

# drop empty elements (sampled titles with no non-ascii characters)
nonascii_notempty10K = [remainder for remainder in nonascii_lines10K if remainder]
count_nonascii_notempty10K = len(nonascii_notempty10K)

# Mean length of the non-ASCII remainder over the sampled titles that have one.
# NOTE(review): the lines are read as byte strings, so `len` counts UTF-8 bytes.
sample_nonascii_10K = sum(len(remainder) for remainder in nonascii_notempty10K)/count_nonascii_notempty10K

In [75]:
# Ratio of the sampled-title average to the training-title average (10K model).
sample_nonascii_10K / actual_nonascii_10K 

2.281439393939394

### Among titles that contain any non-ASCII characters, the titles sampled from the model trained on the 10K dataset contain on average 2.28 times as much non-ASCII content as the titles in the 10K dataset itself

Next step: try a sample dataset that is the same size as the training dataset.