In [None]:
import nltk
import re
import pprint
from nltk import word_tokenize
import pandas as pd
import numpy as np
import string
import pickle
import os
from sklearn.cross_validation import train_test_split
import tensorflow as tf
from global_module.settings_module.set_dir import Directory

In [None]:
# Quick visual check of what sits in the current working directory
# (os.listdir() with no argument lists '.')
os.listdir()

In [None]:
# Weak labelling heuristic.
# The trigger words below came from the pre-processing step that looked for the
# most common words in requests; a sentence containing any of them is simply
# called a "request".

request_words = ('please','request','thanks', 'followup')

def generate_weak_labels(sents):
    """Return one weak label per sentence in `sents`.

    A sentence gets 'Yes' when at least one of its whitespace-separated tokens
    is exactly equal to an entry of `request_words`, and 'No' otherwise.
    Matching is case-sensitive and tokens keep any attached punctuation.

    Args:
        sents: iterable of strings.

    Returns:
        List of 'Yes' / 'No' strings, in the same order as `sents`.
    """
    return [
        'Yes' if any(token in request_words for token in sent.split()) else 'No'
        for sent in sents
    ]



In [None]:
# Confidence/corrections (cnf) network: training sentences with gold and weak labels

mode = 'TR'
rel_dir = Directory(mode)
print(rel_dir.data_path)

# Every cnf file lives under <data_path>/cnf
cnf_dir = rel_dir.data_path + '/cnf'

raw_data_filename = cnf_dir + rel_dir.raw_data_filename
data_filename = cnf_dir + rel_dir.data_filename
gold_label_filename = cnf_dir + rel_dir.gold_label_filename
weak_label_filename = cnf_dir + rel_dir.weak_label_filename

# Tab-separated raw file: column 0 is used as the gold label, column 1 as the sentence
data = pd.read_csv(raw_data_filename, sep='\t')
labels, sents = data.iloc[:,0], data.iloc[:,1]
wlabels = generate_weak_labels(sents)

# One item per line; files are written in the order sentences, gold labels, weak labels
for out_path, items in ((data_filename, sents),
                        (gold_label_filename, labels),
                        (weak_label_filename, wlabels)):
    with open(out_path, 'w') as out_file:
        out_file.writelines(item + '\n' for item in items)
        



In [None]:
# Accuracy of the weak annotator on the training sentences:
# the fraction of sentences whose weak label equals the gold label.
# Turns out that it is just better than random.

label_pairs = list(zip(labels, wlabels))
total = len(label_pairs)
correct = sum(1 for gold, weak in label_pairs if gold == weak)

accuracy = correct/total
print(accuracy)
        

In [None]:
# Confidence (cnf) network: validation sentences with gold and weak labels

mode = 'VA'
rel_dir = Directory(mode)
print(rel_dir.data_path)

# Every cnf file lives under <data_path>/cnf
cnf_dir = rel_dir.data_path + '/cnf'

raw_data_filename = cnf_dir + rel_dir.raw_data_filename
data_filename = cnf_dir + rel_dir.data_filename
gold_label_filename = cnf_dir + rel_dir.gold_label_filename
weak_label_filename = cnf_dir + rel_dir.weak_label_filename

# Tab-separated raw file: column 0 is used as the gold label, column 1 as the sentence
data = pd.read_csv(raw_data_filename, sep='\t')
labels, sents = data.iloc[:,0], data.iloc[:,1]
wlabels = generate_weak_labels(sents)

# Halve the file into a validation part (xv, yv, yyv) and a test part (xt, yt, yyt),
# stratified on the gold labels. Only the validation part is written in this cell.
xv, xt, yv, yt, yyv, yyt = train_test_split(
    sents, labels, wlabels,
    test_size=.5, random_state=0, stratify=labels)

# One item per line; files are written in the order sentences, gold labels, weak labels
for out_path, items in ((data_filename, xv),
                        (gold_label_filename, yv),
                        (weak_label_filename, yyv)):
    with open(out_path, 'w') as out_file:
        out_file.writelines(item + '\n' for item in items)
        



In [None]:
# Weak-annotator statistics for the validation split (yv = gold labels, yyv = weak labels).
# We count the sentences the heuristic gets right and how often each labelling says "Yes".
# This cell used to measure the whole pre-split file (labels / wlabels), i.e. validation
# and test sentences together; that earlier measurement was noted as roughly 70% accuracy.

val_pairs = list(zip(yv, yyv))
total = len(val_pairs)

# Sentences where the weak label agrees with the gold label
correct = sum(1 for gold, weak in val_pairs if gold == weak)
# Number of "Yes" labels in the gold and in the weak labelling respectively
countgy = sum(1 for gold, _ in val_pairs if gold == "Yes")
countwy = sum(1 for _, weak in val_pairs if weak == "Yes")

accuracy = correct/total
print('accuracy: ', accuracy)
# The counts are "Yes" labels, not correct predictions, so the legend says so
print('Yes in gold, Yes in weak, ratio of Yes in gold, ratio of Yes in weak', countgy, countwy,
      countgy/total, countwy/total)
        

In [None]:
# Confidence (cnf) network: test sentences with gold and weak labels.
# Nothing is read from disk here; the cell writes the test part (xt, yt, yyt)
# that must already exist in the kernel.

mode = 'TE'
rel_dir = Directory(mode)
print(rel_dir.data_path)

# Every cnf file lives under <data_path>/cnf
cnf_dir = rel_dir.data_path + '/cnf'

data_filename = cnf_dir + rel_dir.data_filename
gold_label_filename = cnf_dir + rel_dir.gold_label_filename
weak_label_filename = cnf_dir + rel_dir.weak_label_filename

# One item per line; files are written in the order sentences, gold labels, weak labels
for out_path, items in ((data_filename, xt),
                        (gold_label_filename, yt),
                        (weak_label_filename, yyt)):
    with open(out_path, 'w') as out_file:
        out_file.writelines(item + '\n' for item in items)
        



In [None]:
# Accuracy of the weak annotator on the test split (yt = gold labels, yyt = weak labels):
# the fraction of test sentences whose weak label equals the gold label.
# This cell used to re-measure the whole pre-split file (labels / wlabels), which
# repeated the validation cell's computation instead of describing the test data.

test_pairs = list(zip(yt, yyt))
total = len(test_pairs)
correct = sum(1 for gold, weak in test_pairs if gold == weak)

accuracy = correct/total
print(accuracy)
        

In [None]:
# Unlabelled data
# Walk the hierarchy of email files and collect sentence-like fragments,
# one per line, into a single output file.
# Clean-up along the way: only letters/whitespace are kept, fragments starting
# with 'X' and fragments shorter than 8 words are dropped.

root = '/Users/gkhanna/w266nlp/2018-summer-main/project/maildirAll'
output_path = '/Users/gkhanna/w266nlp/2018-summer-main/project/outputFileLarge.txt'

# Fragments with fewer words than this are skipped
min_words = 8
# Characters that survive the clean-up (everything else, including digits and
# punctuation, is removed)
kept_chars = set(string.ascii_letters + string.whitespace)
# A line is broken into fragments at '.', '?' or '!'.
# Raw string: the previous non-raw "\.|\?|\!" relied on invalid escape sequences.
fragment_splitter = re.compile(r"[.?!]")

line_count = 0
# Context managers make sure both the output and every mail file are closed,
# even when reading or writing raises part-way through.
with open(output_path, mode='w') as out_file:
    for dirname, dirnames, filenames in os.walk(root):
        for filename in filenames:
            # Ignore hidden files (names starting with '.')
            if filename.startswith('.'):
                continue
            filepath = os.path.join(dirname, filename)
            # Non-ASCII bytes are not interesting for this problem: surrogateescape
            # lets them through the decoder and the character filter below drops them.
            # Read-only mode is enough; 'r+' would fail on files without write permission.
            with open(filepath, mode='r', encoding="ascii", errors="surrogateescape") as mail_file:
                for line in mail_file:
                    for fragment in fragment_splitter.split(line):
                        cleaned = "".join(c for c in fragment if c in kept_chars).strip()
                        # Drops fragments beginning with 'X'; the original notes say the
                        # clean-up removes most header items (presumably the X-... headers).
                        if cleaned.startswith('X'):
                            continue
                        if len(cleaned.split()) < min_words:
                            continue
                        out_file.write(cleaned + '\n')
                        line_count += 1
print('Number of lines in the file: ', line_count)

        

In [None]:


# Training data and labels for the intent (tar) network

mode = 'TR'
rel_dir = Directory(mode)
print(rel_dir.data_path)

# data directory for tar
tar_dir = rel_dir.data_path + '/tar'

raw_data_filename = tar_dir + rel_dir.raw_data_filename
data_filename = tar_dir + rel_dir.data_filename

# The gold label in this case is just the weak label expressed as a single value
gold_label_filename = tar_dir + rel_dir.gold_label_filename

# And this is the label we'll convert to the 2 class format later
weak_label_filename = tar_dir + rel_dir.weak_label_filename

# Read all raw lines; the context manager closes the file even if decoding fails
with open(raw_data_filename, encoding='ascii', mode='r') as f:
    data1 = f.readlines()

# Remove short lines. Found that they are not very useful for action.
# If I was doing feature engineering, length could be one of the features for intent extraction.
# NOTE(review): this keeps lines with MORE than 8 words, while the mail-extraction
# cell keeps fragments with 8 or more; confirm whether 8-word lines should be dropped.
data = []
for line in data1:
    if len(line.split()) > 8:
        kept_line = line.lstrip()
        # Sentences are written back verbatim, without adding a separator.
        # A last raw line with no trailing newline would merge with its neighbour
        # once the split below has shuffled it, misaligning sentences and labels.
        if not kept_line.endswith('\n'):
            kept_line += '\n'
        data.append(kept_line)

print('Number of lines read from the tar raw file: ', len(data))
wlabels = generate_weak_labels(data)

# Cannot just copy the raw file: it has to be broken into train, test and validation.

# First split: 80% train / 20% held out, stratified on the weak labels
xtr, xte, ytr, yte = train_test_split(
    data,
    wlabels,
    test_size=.2,
    random_state=0,
    stratify=wlabels)

# Write train data (each sentence already carries its own newline)
with open(data_filename, 'w') as fp:
    fp.writelines(xtr)

# The gold labels in this case are the same as the weak ones.
# We want them here to later calculate accuracy for the tar network.
for label_path in (weak_label_filename, gold_label_filename):
    with open(label_path, 'w') as fpw:
        fpw.writelines(item + '\n' for item in ytr)

# Second split: halve the held-out part into validation (xv, yv) and test (xte, yte).
# Neither half is written in this cell.
xv, xte, yv, yte = train_test_split(
    xte,
    yte,
    test_size=.5,
    random_state=0,
    stratify=yte)

In [None]:
# Share of "Yes" (request) labels among the training labels
countt = len(ytr)
county = sum(1 for label in ytr if label.strip() == "Yes")

print('Requests, total, ratio', county, countt, county/countt)
    


In [None]:
# Test data for the tar network: writes the test part (xte, yte) held in the kernel

mode = 'TE'
rel_dir = Directory(mode)
print(rel_dir.data_path)

# data directory for tar
tar_dir = rel_dir.data_path + '/tar'

data_filename = tar_dir + rel_dir.data_filename
weak_label_filename = tar_dir + rel_dir.weak_label_filename
gold_label_filename = tar_dir + rel_dir.gold_label_filename

# Sentences are written as they are, with no separator added
with open(data_filename, 'w') as sentence_file:
    sentence_file.writelines(xte)

# The same labels go, one per line, into the weak file and then into the gold file
for label_path in (weak_label_filename, gold_label_filename):
    with open(label_path, 'w') as label_file:
        label_file.writelines(label + '\n' for label in yte)


In [None]:
# Validation data for the tar network: writes the validation part (xv, yv) held in the kernel

mode = 'VA'
rel_dir = Directory(mode)
print(rel_dir.data_path)

# data directory for tar
tar_dir = rel_dir.data_path + '/tar'

data_filename = tar_dir + rel_dir.data_filename
weak_label_filename = tar_dir + rel_dir.weak_label_filename
gold_label_filename = tar_dir + rel_dir.gold_label_filename

# Sentences are written as they are, with no separator added
with open(data_filename, 'w') as sentence_file:
    sentence_file.writelines(xv)

# The same labels go, one per line, into the weak file and then into the gold file
for label_path in (weak_label_filename, gold_label_filename):
    with open(label_path, 'w') as label_file:
        label_file.writelines(label + '\n' for label in yv)



In [None]:
# Test code: count the lines of the tar data file of the most recently configured rel_dir
fileename = tar_dir + rel_dir.data_filename
with open(fileename, 'r') as dataa_file:
    count = sum(1 for _ in dataa_file)
print('Found count: \n', count)


In [1]:
# Making the dictionary from all the training data
# Instantiates the project's Dictionary class; later cells read its `word_dict`
# attribute. The recorded output of this cell shows the vocabulary being picked
# up from a word_vocab.pkl file.
from global_module.settings_module.set_dict import Dictionary
word_dict = Dictionary()


Picking the dictionary from:  /Users/gkhanna/w266nlp/2018-summer-main/project/learn-by-weak-supervision-master/global_module/utility_dir/folder2/vocab/word_vocab.pkl


In [None]:
# How many words in the dictionary
# NOTE(review): this prints whatever is stored under the empty-string key of
# word_dict.word_dict (None when the key is absent). Whether that value is the
# vocabulary size is not visible here; len(word_dict.word_dict) may be what was
# intended. Confirm.
print(word_dict.word_dict.get(''))

In [None]:

# Spot-check the dictionary: print the entry stored for a few request words
# (None is printed for a word that has no entry)
for probe_word in ('please', 'request', 'thanks'):
    print(word_dict.word_dict.get(probe_word))

In [None]:
# Run the project's training entry point (train.main from global_module.implementation_module).
# NOTE(review): what main() reads and writes is not visible in this notebook; presumably
# it consumes the files exported by the cells above. Confirm.
from global_module.implementation_module import train
train.main()


In [2]:
# Run the project's test pass with the dictionary built earlier in the notebook.
# The recorded output shows it reading the cnf and tar tokenized_test.txt files
# and initialising a TF graph before the test starts.
from global_module.implementation_module.train import Train
Train().run_test(word_dict)


Reading during train_run : /Users/gkhanna/w266nlp/2018-summer-main/project/learn-by-weak-supervision-master/global_module/utility_dir/folder2/data/cnf/tokenized_test.txt
Found count: 
 496
Reading during train_run : /Users/gkhanna/w266nlp/2018-summer-main/project/learn-by-weak-supervision-master/global_module/utility_dir/folder2/data/tar/tokenized_test.txt
Found count: 
 9269
INITIALIZING TF GRAPH 

INFO:tensorflow:Scale of 0 disables regularizer.
Extracted word embedding
INFO:tensorflow:Scale of 0 disables regularizer.
INFO:tensorflow:Scale of 0 disables regularizer.
INFO:tensorflow:Scale of 0 disables regularizer.
INFO:tensorflow:Scale of 0 disables regularizer.
INFO:tensorflow:Scale of 0 disables regularizer.
INFO:tensorflow:Scale of 0 disables regularizer.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.

TF GRAPH INITIALIZED
STARTING THE TES

In [None]:
# Prints an empty line; looks like a leftover scratch cell
print()