In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import glob
import sys
import os.path
import pathlib
import sklearn
import numpy
from collections import Counter
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold

In [None]:
## dataformat

def data_format(big_file, name_file):
    """Make one smaller txt file for each row in a bigger txt file.

    Row number i (counting from 1) of ``big_file`` is copied verbatim,
    line ending included, to ``name_file + str(i) + ".txt"``. Blank rows
    are not skipped; they get their own (blank) file.

    Exactly one file is written per row. An empty ``big_file`` therefore
    produces no output at all, and no empty file is left behind after the
    last row.

    input: big_file - path of a UTF-8 txt file, each row is a message
           name_file - path prefix of the output files; its folder is
               created if it does not exist yet
    output: multiple txt files, each with one message
    """
    out_dir = os.path.dirname(name_file)
    with open(big_file, 'r', encoding='utf-8') as text_file:
        # Stream the rows instead of loading the whole file into memory.
        for file_number, row in enumerate(text_file, start=1):
            # Create the target folder lazily, so that an empty input
            # leaves the file system untouched.
            if file_number == 1 and out_dir:
                os.makedirs(out_dir, exist_ok=True)
            small_file = name_file + str(file_number) + ".txt"
            with open(small_file, 'w', encoding='utf-8') as next_file:
                next_file.write(row)
                
                

# Split both category files into one-message-per-file folders.
# Each entry is (source file, prefix of the numbered output files).
split_jobs = (
    ('relation.txt', 'pos_sw_training/relation/relation'),
    ('no_relation.txt', 'pos_sw_training/no_relation/no_relation'),
)
for source_file, output_prefix in split_jobs:
    data_format(big_file=source_file, name_file=output_prefix)

In [None]:
## preprocess 

def pre_process(filepath, new_filepath, remove_stopwords=False):
    """Normalize and tokenize every file matching a glob pattern.

    Each matching file is read as UTF-8, lower-cased, split into tokens
    with ``word_tokenize`` and written to ``new_filepath`` under the same
    base name, every token followed by a single space. When
    ``new_filepath`` is the folder the inputs live in, the files are
    overwritten in place.

    input: filepath - glob pattern selecting the txt files to process
           new_filepath - folder receiving the processed files; created
               if it does not exist yet
           remove_stopwords - when True, tokens contained in
               ``stopwords.words()`` are dropped; the default (False)
               keeps every token
    output: one processed txt file per input file
    """
    paths = glob.glob(filepath)
    if paths and new_filepath:
        os.makedirs(new_filepath, exist_ok=True)

    # Build the lookup set once per call rather than once per token.
    # NOTE(review): words() is called without a language, which presumably
    # returns the stopword lists of every language NLTK ships - confirm
    # that this is the intended filter.
    blocked = set(stopwords.words()) if remove_stopwords else frozenset()

    for filename in paths:
        with open(filename, 'r', encoding='utf-8') as txtfile:
            content = txtfile.read()
        tokens = word_tokenize(content.lower())  # normalize, then tokenize
        if remove_stopwords:
            tokens = [token for token in tokens if token not in blocked]
        # The input is closed before the output is opened because both
        # paths are the same file when processing in place.
        target = os.path.join(new_filepath, os.path.basename(filename))
        with open(target, 'w', encoding='utf-8') as outfile:
            outfile.write(''.join('%s ' % token for token in tokens))
                

# Pre-process both category folders in place.
# Alternative corpus (swap the entries below to use it):
#   ("training/relation/*.txt", 'training/relation')
#   ("training/no_relation/*.txt", 'training/no_relation')
preprocess_jobs = (
    ("pos_sw_training/relation/*.txt", 'pos_sw_training/relation'),
    ("pos_sw_training/no_relation/*.txt", 'pos_sw_training/no_relation'),
)
for source_pattern, target_folder in preprocess_jobs:
    pre_process(filepath=source_pattern, new_filepath=target_folder)

In [None]:
## POS tagging

## skip this step for BOW

def pos_tagger(filepath, new_filepath):
    """Replace tokens with POS tags and save the result to a new file.

    Each file matching ``filepath`` is read as UTF-8 and split on
    whitespace. The tokens are tagged with ``nltk.pos_tag`` and every
    token is replaced by its tag, except the placeholders ``entity_1``
    and ``entity_2``, which are kept as they are. The resulting sequence
    is written to ``new_filepath`` under the same base name, every item
    followed by a single space.

    When ``new_filepath`` is the folder the inputs live in, the files are
    overwritten in place; running the function again on its own output
    would then tag the tags themselves.

    input: filepath - glob pattern selecting the tokenized txt files
           new_filepath - folder receiving the tagged files; created if
               it does not exist yet
    output: one tagged txt file per input file
    """
    kept_tokens = ('entity_1', 'entity_2')

    paths = glob.glob(filepath)
    if paths and new_filepath:
        os.makedirs(new_filepath, exist_ok=True)

    for filename in paths:
        with open(filename, 'r', encoding='utf-8') as txtfile:
            tokens = txtfile.read().split()
        # pos_tag yields (token, tag) pairs; keep the token itself only
        # for the entity placeholders.
        tags = [token if token in kept_tokens else tag
                for token, tag in nltk.pos_tag(tokens)]
        # The input is closed before the output is opened because both
        # paths are the same file when tagging in place.
        target = os.path.join(new_filepath, os.path.basename(filename))
        with open(target, 'w', encoding='utf-8') as outfile:
            outfile.write(''.join('%s ' % tag for tag in tags))

# Tag both category folders in place.
# Alternative corpus (swap the entries below to use it):
#   ("training/relation/*.txt", "pos_training/relation")
#   ("training/no_relation/*.txt", "pos_training/no_relation")
tagging_jobs = (
    ("pos_sw_training/relation/*.txt", "pos_sw_training/relation"),
    ("pos_sw_training/no_relation/*.txt", "pos_sw_training/no_relation"),
)
for tokenized_pattern, tagged_folder in tagging_jobs:
    pos_tagger(filepath=tokenized_pattern, new_filepath=tagged_folder)

In [None]:
## loading files

# Corpus folder to load; pick the one matching the feature set under test:
#   'training'        - for BOW
#   'pos_training'    - for pos
#   'pos_sw_training' - pos with stopwords
training_filepath = 'pos_sw_training'

# Resolve the folder against the current working directory and report
# whether it is actually there before trying to load it.
cwd = pathlib.Path.cwd()
training_folder = cwd / training_filepath
print('path:', training_folder)
folder_found = training_folder.exists()
print('this will print True if the folder exists:', folder_found)

# Load every file below the folder as training data.
training_train = load_files(str(training_folder))


In [None]:
## inspecting

# Total number of loaded documents.
print(len(training_train.data))

# Category names, generated from the subfolder names.
target_names = training_train.target_names

# Number of documents per category.
freqs = Counter(training_train.target)
for category in freqs:
    frequency = freqs[category]
    print(target_names[category], frequency)

In [None]:
## feature representation with CountVectorizer

# N-gram size used for the count features: (1,1) unigrams,
# (2,2) bigrams only, (3,3) trigrams only.
ngram_range = (1, 1)

# NOTE(review): apart from the two options below the vectorizer runs with
# its defaults, which (per the scikit-learn docs) lower-case the text and
# keep only tokens of two or more word characters - punctuation-only POS
# tags would be dropped. Confirm this is intended for the POS corpora.
vectorizer_options = {'ngram_range': ngram_range, 'encoding': 'utf-8'}
training_vec = CountVectorizer(**vectorizer_options)

# Learn the vocabulary and build the document-term count matrix.
training_counts = training_vec.fit_transform(training_train.data)


In [None]:
## training and testing

# Stratified 5-fold cross-validation; one report is printed per fold.
skf = StratifiedKFold(n_splits=5)
report_labels = ['no relation', 'relation']

for train_index, test_index in skf.split(training_counts, training_train.target):
    x_train = training_counts[train_index]
    x_test = training_counts[test_index]
    y_train = training_train.target[train_index]
    y_test = training_train.target[test_index]

    # Alternative classifier: clf = MultinomialNB().fit(x_train, y_train)
    clf = LogisticRegression(C=1e5).fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(classification_report(y_test, y_pred, target_names=report_labels))