In [70]:
import os
import json

def get_data(folder_name):
    x = []
    y = []
    positions = []
    file_names = []

    for file in os.listdir(folder_name):
        if file.endswith(".txt"):
            file_name = os.path.join(folder_name, file[:-4])

            file_text = open(file_name + '.txt', encoding='utf8')
            try:
                file_truth = open(file_name + '.truth', encoding='utf8')

                try:
                    text = file_text.read()
                    truth = json.load(file_truth)
                    truth_changes = truth['changes']
                    truth_positions = truth['positions']

                    x.append(text)
                    y.append(truth_changes)
                    positions.append(truth_positions)
                    file_names.append(file[:-4])
                finally:
                    file_truth.close()
            finally:
                file_text.close()
            print("Done with ", file)

    return x, y, positions, file_names

In [71]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

TRAINING_DIR = '../data/training'

X, y, positions, file_names = get_data(
    TRAINING_DIR)
df = pd.DataFrame(data={'text': X, 'label': y, 'filename': file_names, 'pos':positions})
df['num_splits'] = [len(x) for x in df.pos]

Done with  problem-1.txt
Done with  problem-10.txt
Done with  problem-100.txt
Done with  problem-1000.txt
Done with  problem-1001.txt
Done with  problem-1002.txt
Done with  problem-1003.txt
Done with  problem-1004.txt
Done with  problem-1005.txt
Done with  problem-1006.txt
Done with  problem-1007.txt
Done with  problem-1008.txt
Done with  problem-1009.txt
Done with  problem-101.txt
Done with  problem-1010.txt
Done with  problem-1011.txt
Done with  problem-1012.txt
Done with  problem-1013.txt
Done with  problem-1014.txt
Done with  problem-1015.txt
Done with  problem-1016.txt
Done with  problem-1017.txt
Done with  problem-1018.txt
Done with  problem-1019.txt
Done with  problem-102.txt
Done with  problem-1020.txt
Done with  problem-1021.txt
Done with  problem-1022.txt
Done with  problem-1023.txt
Done with  problem-1024.txt
Done with  problem-1025.txt
Done with  problem-1026.txt
Done with  problem-1027.txt
Done with  problem-1028.txt
Done with  problem-1029.txt
Done with  problem-103.txt
D

In [72]:
df.head()

Unnamed: 0,filename,label,pos,text,num_splits
0,problem-1,True,"[1204, 4225, 5431]",Definitely remind anyone looking at your forec...,3
1,problem-10,True,"[1198, 4386]",Assuming you'd like to be Scrum Guide complian...,2
2,problem-100,False,[],It's all about finding the right balance. I'd ...,0
3,problem-1000,True,"[909, 3045, 4196]",The inbox also suffers from being an unrecogni...,3
4,problem-1001,True,"[2197, 3464]","Second verse, same as the first? It still lack...",2


In [73]:
df_with_splits = df[df['num_splits'] > 0].copy()
df_with_splits.head()

Unnamed: 0,filename,label,pos,text,num_splits
0,problem-1,True,"[1204, 4225, 5431]",Definitely remind anyone looking at your forec...,3
1,problem-10,True,"[1198, 4386]",Assuming you'd like to be Scrum Guide complian...,2
3,problem-1000,True,"[909, 3045, 4196]",The inbox also suffers from being an unrecogni...,3
4,problem-1001,True,"[2197, 3464]","Second verse, same as the first? It still lack...",2
8,problem-1005,True,"[1053, 3315]",Database dumps have since been made available ...,2


In [74]:
def split_by_pos(text, pos):
    pos.insert(0, 0)
    pos.append(None)
    return [text[pos[i]:pos[i+1]] for i in range(len(pos)-1)]


In [75]:
df_with_splits['segments'] = df_with_splits.apply(lambda row: split_by_pos(row['text'], row['pos']), axis=1)
df_with_splits.head()

Unnamed: 0,filename,label,pos,text,num_splits,segments
0,problem-1,True,"[0, 1204, 4225, 5431, None]",Definitely remind anyone looking at your forec...,3,[Definitely remind anyone looking at your fore...
1,problem-10,True,"[0, 1198, 4386, None]",Assuming you'd like to be Scrum Guide complian...,2,[Assuming you'd like to be Scrum Guide complia...
3,problem-1000,True,"[0, 909, 3045, 4196, None]",The inbox also suffers from being an unrecogni...,3,[The inbox also suffers from being an unrecogn...
4,problem-1001,True,"[0, 2197, 3464, None]","Second verse, same as the first? It still lack...",2,"[Second verse, same as the first? It still lac..."
8,problem-1005,True,"[0, 1053, 3315, None]",Database dumps have since been made available ...,2,[Database dumps have since been made available...


In [76]:
import re

# Detect URLs

regex = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

for elem in df_with_splits['segments']:
    for segment in elem:
        urls = regex.findall(segment)
        for url in urls:
            print(url)

http://www.rhythmsystems.com/blog/bid/87071/fire-to-protect-your-core-values
http://blog.stackoverflow.com/2009/05/a-theory-of-moderation/
http://stackexchange.com/about/contact
http://img20.imageshack.us/img20/7112/postsvsdayssmall.png
http://prism.mozillalabs.com/
http://forums.electronicarts.co.uk/battlefield-3-pc/1448520-howto-customize-chatbox-bf3.html
http://forums.electronicarts.co.uk/battlefield-3-pc/1448520-howto-customize-chatbox-bf3.html
http://www.starbounding.com/trimilin%20covers/hand_putty_green.jpg
http://en.wikipedia.org/wiki/1990_FIFA_World_Cup
http://en.wikipedia.org/wiki/1990_FIFA_World_Cup
http://en.wikipedia.org/wiki/1990_FIFA_World_Cup
http://stats.espncricinfo.com/ci/content/records/287370.html
http://store.steampowered.com/app/8980/
http://store.steampowered.com/app/8980/
http://store.steampowered.com/app/440/
http://en.wikipedia.org/wiki/1990_FIFA_World_Cup
https://support.steampowered.com/kb_article.php?ref=3134-TIAL-4638
http://en.wikipedia.org/wiki/1990_FIF

In [77]:
df['text_processed'] = df['text'].apply(lambda text: regex.sub('<URL>', text))
print(df.head())

       filename  label                          pos  \
0     problem-1   True  [0, 1204, 4225, 5431, None]   
1    problem-10   True        [0, 1198, 4386, None]   
2   problem-100  False                           []   
3  problem-1000   True   [0, 909, 3045, 4196, None]   
4  problem-1001   True        [0, 2197, 3464, None]   

                                                text  num_splits  \
0  Definitely remind anyone looking at your forec...           3   
1  Assuming you'd like to be Scrum Guide complian...           2   
2  It's all about finding the right balance. I'd ...           0   
3  The inbox also suffers from being an unrecogni...           3   
4  Second verse, same as the first? It still lack...           2   

                                      text_processed  
0  Definitely remind anyone looking at your forec...  
1  Assuming you'd like to be Scrum Guide complian...  
2  It's all about finding the right balance. I'd ...  
3  The inbox also suffers from being an 

In [78]:
count = 0
for text in df['text_processed']:
    urls = re.findall('<URL>', text)
    count = count + len(urls)
print(count)

613


In [80]:
from nltk.tokenize import word_tokenize
filenames = []

# Investigate long words (more than 50 symbols)

threshold = 50
for idx, row in df.iterrows():
    words = word_tokenize(row['text_processed'])
    for word in words:
        if len(word) >= threshold:
            print(word)
            filenames.append(row['filename'])
print(filenames)

instant-powercut-and-everything-goes-black-and-the-system-suddenly-shuts-off
world-cup-2014-what-is-that-foaming-spray-used-by-referees
/home/someuser/public_html/subdomains/test/includes/classes/
/home/someuser/public_html/subdomains/test/includes/classes/
Pretends-To-Be-Scrum-But-Actually-Is-Not-Even-Agile
6665734529976967438675338965633321266643790584532111111
6665734529976967438675338965633321266643790584532111111
Taumatawhakatangihangakoauauotamateaturipukakapikimaungahoronukupokaiwhenuakitanatahu
_____________________________________________________
_____________________________________________________
Pretends-To-Be-Scrum-But-Actually-Is-Not-Even-Agile
epic/history-in-the-making/ripped-from-the-headlines
̶̧̨̱̹̭̯ͧ̾ͬC̷̙̲̝͖ͭ̏ͥͮ͟Oͮ͏̮̪̝͍M̲̖͊̒ͪͩͬ̚̚͜Ȇ̴̟̟͙̞ͩ͌͝S̨̥̫͎̭ͯ̿̔̀ͅ
Ẁ̰̲̞̠͇̯̯̟̠͍̻ͬ̐̀̓̓́͆̄̒̉́͢Ţ̐͆ͭ̈́̾͆͂ͧ̿͐͐ͤ̈́ͩͪͪ̒̄̚͟͜҉̫̞͓͙ͅF̶̛͕̫̦̩̳̈̈́̃̇̈ͫ̏̌ͧ͊ͪ̿̍́͜͞Ẁ̰̲̞̠͇̯̯̟̠͍̻ͬ̐̀̓̓́͆̄̒̉́͢Ţ̐͆ͭ̈́̾͆͂ͧ̿͐͐ͤ̈́ͩͪͪ̒̄̚͟͜҉̫̞͓͙ͅF̶̛͕̫̦̩̳̈̈́̃̇̈ͫ̏̌ͧ͊ͪ̿̍́͜͞Ẁ̰̲̞̠͇̯̯̟̠͍̻ͬ̐̀̓̓́͆̄̒̉́͢Ţ̐͆ͭ̈́̾͆͂ͧ̿͐͐ͤ̈́ͩͪͪ̒̄̚͟͜҉̫̞͓͙ͅF̶̛͕̫̦̩

In [81]:
df_interesting = df[df['filename'].isin(filenames)]
df.head()

Unnamed: 0,filename,label,pos,text,num_splits,text_processed
0,problem-1,True,"[0, 1204, 4225, 5431, None]",Definitely remind anyone looking at your forec...,3,Definitely remind anyone looking at your forec...
1,problem-10,True,"[0, 1198, 4386, None]",Assuming you'd like to be Scrum Guide complian...,2,Assuming you'd like to be Scrum Guide complian...
2,problem-100,False,[],It's all about finding the right balance. I'd ...,0,It's all about finding the right balance. I'd ...
3,problem-1000,True,"[0, 909, 3045, 4196, None]",The inbox also suffers from being an unrecogni...,3,The inbox also suffers from being an unrecogni...
4,problem-1001,True,"[0, 2197, 3464, None]","Second verse, same as the first? It still lack...",2,"Second verse, same as the first? It still lack..."


In [82]:
for text in df_interesting['text_processed']:
    print(text)
    print("***********************************")

First of all, DoTA is only about 20GB. Secondly, your download constantly restarts (well, backtracks; progress jumps) because when your computer power goes out (assuming it's the instant-powercut-and-everything-goes-black-and-the-system-suddenly-shuts-off type of power cut), Steam doesn't have a chance to properly finish finish downloading the file it is patching. Because of this, when the system restarts, Steam checks for the progress and will think the file it was patching is corrupted (not properly and fully downloaded). From there, it will restart downloading that file. Usually, Steam downloads several files at once, which explains why there are big jumps. 

Regardless of having a profile created or not, there'll still be a "Add Friend" button, followed by a message in red asking you to encourage the "friend" to create a profile, if one hasn't been set up yet. This also applies to Private accounts where instead of asking you to ask your friend to setup a profile, it simply gives a 

In [83]:
from nltk.corpus import words as corpus_words

def try_split_word(word):
    candidates = word.split('-')
    length = len(candidates)
    count = 0
    for candidate in candidates:
        if candidate in corpus_words.words():
            count = count + 1
    if count >= length / 2:
        return candidates
    else:
        return [word]

In [84]:
# Experiment with splitting words by -

count = 0
for idx, row in df.iterrows():
    text_words = word_tokenize(row['text_processed'])
    for single_word in text_words:
        if len(single_word) < 15:
            continue
        res = try_split_word(single_word)
        if len(res) > 2:
            print(single_word)
            count = count + 1
print(count)

thirty-two-fingered
not-so-technically-savvy
non-spam-offensive
black-and-white
non-spam-offensive
close-reopen-close-reopen
day-as-tracked-by-stackoverflow
non-spam-offensive
lots-of-deleted-answers
instant-powercut-and-everything-goes-black-and-the-system-suddenly-shuts-off
world-cup-2014-what-is-that-foaming-spray-used-by-referees
middle-of-the-way
teaching-for-a-living
married-filing-separately
married-filing-jointly
harder-to-forge
in-and-of-themselves
take-it-or-leave-it
winner-takes-all
non-academic-politologists
Sergeant-at-arms
first-path-the-post
winner-takes-all
outside-big-city
outside-big-city
winner-takes-all
first-past-the-post
first-path-the-post
winner-takes-all
outside-big-city
government-run-economy-rent-seeking
non-border-adjusted
Heavier-than-air
business-as-usual
take-it-or-leave-it
mem-initializer-list
move-data-from-rhs
append-array-to-container
forward_list-to-container
move-container-from-lhs
transfer-of-ownership
arcane-power-costing
multiple-read/single-writ

In [85]:
number_regex = re.compile('\d+')

# Find long numbers >= 5 digits (4 digits is year, could be relevant on its own)

for text in df['text_processed']:
    numbers = number_regex.findall(text)
    for number in numbers:
        if len(number) > 4:
            print(number)

10000
10000
20000
1367621987
9782908901016
20107
13769
13526
13653
13653
31071
10000000
10000000
10000000
100000
10646
00007
15283329
21539211
1000000
0000065432
10000
10300
6665734529976967438675338965633321266643790584532111111
6665734529976967438675338965633321266643790584532111111
13195
22013
10400
11000
314159
314159
11072
85409
57800
110000
11560
200000
11072
23124
314159
3688081075
064953
064953
064953
064953
064953
064953
064953
064953
01786
064953
064953
064953
064953
064953
064953
064953
064953
064953
18381
71111111
83838868
045287
33015679
69186759
12955919
642538
23022354
89211712
62778195
117393
12000
12000
12000
17000
110000
30000
85000
000202294
110000
30000
85000
30000
85000
110000
1712304
1712304
0000257
38915
00101
2118760
2118760
000256
00000044
001828
37395
000224
000224
77070501274892
3609190
2118760
2118760
0000046642
000224
21627112383415
19600
1094387755
19600
0084183673
063998282
1956521739
9545454545
100000000
1000000
10000
614130000
0084183673
1094387755
1178

In [86]:
def contains_alpha_numeric(str):
    for c in str:
        if c.isalnum():
            return True
    return False

In [87]:
# Find words not containing alpha numerics

count = 0
for idx, row in df.iterrows():
    text_words = word_tokenize(row['text_processed'])
    for single_word in text_words:
        if len(single_word) < 4:
            continue
        if not contains_alpha_numeric(single_word):
            print(single_word)
            count = count + 1
print(count)

****
====
______
______
_____
_____
________
__________
_____
_____
____________
_____________________________________________________
_________________________
____________
_____________________________________________________
_________________________
_____
⌐■-■
⌐■_■
24


In [95]:
count = 0

# Detect file paths

unix_path_regex = re.compile('^(?:/[^/]*)*$')
windows_path_regex = re.compile('^(?:[a-zA-Z]\:|\\\\[\w\.]+\\[\w.$]+)\\(?:[\w]+\\)*\w([\w.])+$')

for idx, row in df.iterrows():
    text_words = word_tokenize(row['text_processed'])
    for single_word in text_words:
        if len(single_word) < 10:
            continue
        if unix_path_regex.match(single_word):
            print(single_word)
            if not all(ord(char) < 128 for char in single_word):
                print("***")
            count = count + 1
        if windows_path_regex.match(single_word):
            print(single_word)
            count = count + 1
print(count)

/Users/Shared
/Users/Shared/Battle.Net/Client/Blizzard
/Application
/retailing
/d1reportSingleClassLayout
/home/someuser/public_html/subdomains/test/includes/classes/
/home/someuser/public_html/subdomains/test/includes/classes/
//21539211
/res/drawable-hdpi
/res/drawable-ldpi
/res/drawable-mdpi
/res/drawable-nodpi
/media/File
/replaceitem
/should-not
/media/File
/ˈhaɪdʒiːn/
***
/haɪˈdʒiːn/
***
/ˈhɪdʒɪiːn/
***
/ˈhɪdʒiːn/
***
/piːs/–French
***
/rəˈliːf/–French
***
/ˈhɪdʒɪiːn/
***
/ˈhaɪdʒɪiːn/
***
/ˈhaɪdʒɪiːn/
***
/ˈhɪdʒɪiːn/
***
/səˈriːnɪti/
***
/monomoraic
/monomoraic
/ˌɛtˈsɛtɹə/
***
/ˈhaɪdʒiːn/
***
/haɪˈdʒiːn/
***
/antéxomai
***
/self-publishing
/www.islamquest.net
35


In [96]:
# Count number of words with non-ascii characters

count = 0
for idx, row in df.iterrows():
    text_words = word_tokenize(row['text_processed'])
    for single_word in text_words:
        if not all(ord(char) < 128 for char in single_word):
            count = count + 1
print(count)

7277
