In [44]:
#UTF-8
import json
import pickle
from os import path
import os,re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import linear_model

# Everything this notebook reads or writes sits in the Frames dataset folder.
_DATASET_DIR = path.join("..", "frames-data-v1", "Frames-dataset")

# Input corpus, parsed with json.load by the cells below.
frames_file_path = path.join(_DATASET_DIR, "frames.json")

# Pickled corpus-wide word counts (a single file).
dict_file_path = path.join(_DATASET_DIR, "word_dict")
# Directory holding one pickled word-count dict per chat.
chat_file_path = path.join(_DATASET_DIR, "chats_dict")
# Directory holding one pickled count vector per chat.
chatvec_file_path = path.join(_DATASET_DIR, "chats_vecs")
# Prefix of the directory holding one plain-text file per chat
# (the trimming fraction is appended to this prefix when it is used).
chat_text_file_path = path.join(_DATASET_DIR, "chats_text")

In [28]:
def canonicalize_digits(word):
    """Replace every digit of a non-alphabetic token with the marker ``DG``.

    Args:
        word: the token to normalise.

    Returns:
        ``word`` unchanged if it contains at least one alphabetic character.
        Otherwise a copy in which each digit is replaced by ``"DG"``; when the
        result starts with ``"DG"`` all commas are removed as well, so
        ``"1,000"`` becomes ``"DGDGDGDG"``.
    """
    if any(c.isalpha() for c in word):
        return word
    # Raw string: "\d" in a plain literal is an invalid escape sequence that
    # newer Python versions warn about.
    word = re.sub(r"\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "")  # remove thousands separator
    return word

def canonicalize_word(word, wordset=None, digits=True):
    """Lower-case a token and optionally map it onto a known vocabulary.

    Args:
        word: the token to normalise.
        wordset: optional container of known tokens. When given, tokens that
            are not in it (before or after digit canonicalisation) are
            replaced by the unknown marker.
        digits: when true, digits are canonicalised with
            ``canonicalize_digits`` unless the lower-cased token is already
            in ``wordset``.

    Returns:
        The normalised token, or ``"UUUNKKK"`` if ``wordset`` is given and
        does not contain it.
    """
    word = word.lower()
    if digits:
        # Identity checks instead of "!= None": they never invoke the
        # container's own comparison operators.
        if wordset is not None and word in wordset:
            return word
        word = canonicalize_digits(word)  # try to canonicalize numbers
    if wordset is None or word in wordset:
        return word
    return "UUUNKKK"  # unknown token



In [17]:

def create_dicts():
    """Count canonicalised words per chat and over the whole corpus.

    Reads the chats from ``frames_file_path`` and writes:

    * one pickled ``{word: count}`` dict per chat to
      ``<chat_file_path>/<chat id>.dict``;
    * one pickled ``{word: count}`` dict for the whole corpus to
      ``dict_file_path``.

    Takes no arguments and returns nothing.
    """
    with open(frames_file_path, 'r') as f:
        chats = json.load(f)

    # The per-chat pickles live in a directory; create it on a first run.
    if not path.exists(chat_file_path):
        os.makedirs(chat_file_path)

    word_dict = {}
    for chat in chats:
        chat_word_dict = {}
        for turn in chat["turns"]:
            # Split on single spaces exactly like extract_text does; iterating
            # over turn["text"] directly would count characters, not words.
            for word in turn["text"].split(" "):
                word = canonicalize_word(word)
                word_dict[word] = word_dict.get(word, 0) + 1
                chat_word_dict[word] = chat_word_dict.get(word, 0) + 1
        # Pickle data is binary, so the file must be opened in binary mode.
        with open(path.join(chat_file_path, chat["id"] + ".dict"), "wb") as f:
            pickle.dump(chat_word_dict, f)

    with open(dict_file_path, "wb") as f:
        pickle.dump(word_dict, f)
        


In [18]:
def create_vectors():
    """Turn every per-chat word-count dict into a count vector.

    Loads the corpus-wide dict from ``dict_file_path``; its keys, in
    iteration order, define the vector layout. For every file in
    ``chat_file_path`` the pickled per-chat dict is loaded and a numpy array
    holding the chat's count for each vocabulary word (0 when absent) is
    pickled to ``<chatvec_file_path>/<file name up to the first dot>``.

    Takes no arguments and returns nothing.
    """
    # NOTE: pickle.load executes arbitrary code from the file; only run this
    # on pickles produced by this notebook.
    with open(dict_file_path, 'rb') as f:
        word_dict = pickle.load(f)
    vocabulary = list(word_dict)

    # Create the output directory on a first run.
    if not path.exists(chatvec_file_path):
        os.makedirs(chatvec_file_path)

    for file_name in os.listdir(chat_file_path):
        ind = file_name.split('.')[0]
        # Pickle data is binary, so read and write in binary mode.
        with open(path.join(chat_file_path, file_name), 'rb') as f:
            chat_dict = pickle.load(f)
        vec = np.array([chat_dict.get(word, 0) for word in vocabulary])
        with open(path.join(chatvec_file_path, ind), 'wb') as f:
            pickle.dump(vec, f)
        
       

In [85]:
def extract_text(prnct):
    """Write the middle part of every chat to its own UTF-8 text file.

    For each chat in ``frames_file_path`` the canonicalised words of all
    turns are concatenated, ``int(prnct * n_words)`` words are dropped from
    each end, and the rest is written, space separated, to
    ``<chat_text_file_path><prnct>/<chat id>.<booked>``. ``booked`` is
    ``"True"`` if any act argument of any turn has key ``'action'`` and
    value ``'book'``, otherwise ``"False"``.

    Args:
        prnct: fraction of the words to drop from each end of a chat
            (values of 0.5 or more leave nothing). Its string form is also
            the suffix of the output directory name.

    Side effects:
        Resets the module-level ``chat_text_file_path`` to the un-suffixed
        prefix and creates the output directory if needed.
    """
    global chat_text_file_path
    # Reset the prefix: another cell overwrites this global with the suffixed
    # directory name, and the suffix must not be appended twice.
    chat_text_file_path = path.join("..","frames-data-v1","Frames-dataset","chats_text")
    out_dir = chat_text_file_path + str(prnct)

    with open(frames_file_path, 'r') as f:
        chats = json.load(f)

    # One existence check up front instead of one per chat.
    if not path.exists(out_dir):
        os.makedirs(out_dir)

    for chat in chats:
        text = []
        booked = "False"
        for turn in chat["turns"]:
            for word in turn["text"].split(" "):
                text.append(canonicalize_word(word))
            # The acts belong to the turn, so scan them once per turn rather
            # than once per word.
            for act in turn['labels']['acts']:
                for d in act['args']:
                    if d['key'] == 'action' and d['val'] == 'book':
                        booked = "True"
        # Explicit end index: text[cut:-cut] would be empty when cut == 0
        # (short chats or prnct == 0) because -0 is 0.
        cut = int(prnct * len(text))
        text = text[cut:len(text) - cut]
        # The payload is UTF-8 encoded bytes, hence binary mode.
        with open(path.join(out_dir, chat["id"] + "." + booked), "wb") as f:
            f.write(u" ".join(text).encode('utf-8').strip())

# Fraction of words dropped from each end of every chat by extract_text.
# Its string form is also the suffix of the output directory name, and the
# cell that loads the text files reads `prc` to rebuild that name.
prc = 0.4
extract_text(prc)
    

In [89]:
# Directory written by extract_text(prc): one text file per chat, named
# "<chat id>.True" or "<chat id>.False".
chat_text_file_path= path.join("..","frames-data-v1","Frames-dataset","chats_text")+str(prc)

# os.listdir returns entries in arbitrary order; sorting keeps the seeded
# train/test split below identical across runs and machines. Entries that do
# not carry a label suffix are skipped instead of being read as negatives.
filenames = np.array(sorted(
    fn for fn in os.listdir(chat_text_file_path)
    if fn.endswith((".True", ".False"))))

filenames_with_path = [os.path.join(chat_text_file_path, fn) for fn in filenames]

# The label is encoded in the file extension: True when the chat contained
# a 'book' action.
booked = np.array([fn.endswith(".True") for fn in filenames])

# Number of chats and share of booked chats. Single-argument print(...) calls
# behave the same on Python 2 and Python 3.
print(len(booked))
print(booked.sum() / float(len(booked)))

# Bag-of-words counts read straight from the files.
vectorizer = CountVectorizer(input='filename', min_df=15, max_df=.95, stop_words='english', max_features=3000)

dtm = vectorizer.fit_transform(filenames_with_path).toarray()

print(dtm)

# Newer scikit-learn releases only provide get_feature_names_out(); fall back
# to the older get_feature_names() when it is not available.
_get_feature_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
vocab = np.array(_get_feature_names())

X_train, X_test, y_train, y_test = train_test_split(
    dtm, booked, test_size=0.33, random_state=42)

# C is the inverse regularisation strength, so C=1000000 means the model is
# almost unregularised.
logreg = linear_model.LogisticRegression(
            multi_class='multinomial', max_iter=128, solver='lbfgs', C=1000000, verbose=1)

logreg.fit(X_train, y_train)

prediction = logreg.predict(X_test)

# Per-sample hits, then the overall accuracy on the held-out third. The whole
# ratio sits inside the print call so it cannot be parsed as
# print(mask) followed by .sum() on the result.
print(y_test == prediction)
print((y_test == prediction).sum() / float(len(y_test)))



1369
0.611395178963
[[0 0 0 ..., 0 0 0]
 [1 2 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 1]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 1]]
[ True False  True False  True  True  True False  True  True False  True
  True  True  True False  True  True  True  True  True  True  True  True
  True  True  True False  True  True False False False  True  True False
  True False  True  True False  True False  True  True  True False  True
  True  True  True  True  True  True  True  True  True  True False False
 False  True  True  True  True False False  True False  True False  True
  True False False  True  True  True False False  True False  True  True
  True  True  True False  True  True  True  True  True  True False False
  True  True  True False  True False  True  True False False  True False
 False  True  True  True  True  True False  True False  True  True  True
  True  True  True  True  True  True  True False False  True  True  True
 False  True  True False False  True  True  True 

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
