In [4]:
from gensim.models import KeyedVectors

In [5]:
# Load the gensim KeyedVectors stored at this relative path; `model` is later
# used to hand a vocabulary to ScreenshotTaskExtractor.
model = KeyedVectors.load("../models/normalized.model")

In [14]:
import pickle
import pandas as pd
from datetime import datetime, date, time
from collections import defaultdict, Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import textrank
import os
import sys
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
from rake_nltk import Rake
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter, defaultdict
from math import log, floor
import numpy as np
import textrank
import random
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, recall_score, precision_score, precision_recall_fscore_support
import pickle
import os
from nltk.stem.snowball import SnowballStemmer

# English Snowball stemmer; only referenced by stemming steps that are
# currently commented out.
stemmer = SnowballStemmer("english")

# NLTK's English stop-word list plus two extra tokens that are dropped from
# phrase search terms.
stop_words = [*stopwords.words("english"), "chris", "satterfield"]

In [19]:
def valid(dateobj, start, end):
    """Return whether ``dateobj`` lies in the half-open interval ``[start, end)``."""
    not_before_start = dateobj >= start
    return not_before_start and dateobj < end

def rake(snapshot):
    """Extract ranked key phrases from ``snapshot`` with RAKE.

    A fresh ``Rake`` instance with default settings is created per call, fed
    the text, and the result of its ``get_ranked_phrases()`` is returned.
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(snapshot)
    ranked_phrases = extractor.get_ranked_phrases()
    return ranked_phrases


def tfidf(task_counters):
    """Re-weight per-task word counts by how many tasks share each word.

    For every counter, the count of each word is divided by
    ``df * (1 + |mean_df - df|)``, where ``df`` is the number of input
    counters containing the word and ``mean_df`` is the mean ``df`` over the
    words of that particular counter.

    Args:
        task_counters: iterable of ``Counter``/dict objects mapping
            word -> count, one per task.

    Returns:
        list of ``Counter``: one weighted counter per input counter, in the
        same order. An empty input counter yields an empty ``Counter``.
    """
    # Materialise once: the input is traversed twice below.
    task_counters = list(task_counters)

    # Document frequency: each counter contributes 1 per distinct word.
    # Passing the keys view (not the mapping) makes update() count keys
    # instead of adding the stored counts.
    document_frequency = Counter()
    for counter in task_counters:
        document_frequency.update(counter.keys())

    weighted_counters = []
    for counter in task_counters:
        if not counter:
            # Nothing to weight; also avoids np.mean([]) and its warning.
            weighted_counters.append(Counter())
            continue

        mean_frequency = np.mean([document_frequency[word] for word in counter])

        weighted = {}
        for word, count in counter.items():
            frequency = document_frequency[word]
            weighted[word] = count / (frequency * (1 + abs(mean_frequency - frequency)))
        weighted_counters.append(Counter(weighted))
    return weighted_counters


In [29]:
def shuffleDict(dictionary):
    """Return a new dict holding the items of ``dictionary`` in random key order.

    The key order is produced by ``random.shuffle``; the input dict itself is
    left untouched.
    """
    order = list(dictionary.keys())
    random.shuffle(order)
    return {key: dictionary[key] for key in order}

def equals(prediction, expected):
    """Return 1 when ``prediction == expected`` holds, otherwise 0."""
    return 1 if prediction == expected else 0

def get_prediction(scores):
    """Return the key of ``scores`` that carries the highest value.

    The dict is first re-ordered randomly via ``shuffleDict``. ``max`` keeps
    the first maximal key it meets, so equally scored keys are chosen in that
    random order rather than by insertion order.
    """
    candidates = shuffleDict(scores)
    return max(candidates, key=lambda label: candidates[label])

def normalize_score(scores):
    """Scale the values of ``scores`` in place to unit Euclidean (L2) norm.

    Args:
        scores: dict mapping a label to a numeric score. Modified in place.

    Returns:
        The same ``scores`` dict object. When the norm is zero (empty dict or
        every score equal to zero) the dict is returned unchanged, because
        dividing by the zero norm would turn every value into NaN.
    """
    norm = np.linalg.norm(list(scores.values()))
    if norm == 0:
        return scores
    # Only values are reassigned, so iterating the keys while writing is safe.
    for label in scores:
        scores[label] = scores[label] / norm
    return scores


class ScreenshotTaskExtractor(object):
    """Turns a participant's recorded screen text into per-task word counts.

    The text snapshots are read from ``fulltext.pkl`` and the task intervals
    from ``taskswitches_annotated.xlsx`` inside the participant's directory.
    """

    # Marker words counted by isProbablyEmail.
    _EMAIL_MARKERS = ('compose', 'gmail', 'inbox', 'google', 'starred', 'sent', 'mail',
                      'drafts', 'more', 'terms', 'privacy', 'program', 'policies')
    # A snapshot is treated as e-mail when MORE than this many markers occur.
    _EMAIL_MARKER_THRESHOLD = 4

    def __init__(self, vocab):
        """Store ``vocab``.

        Args:
            vocab: vocabulary container; it is kept on the instance but not
                read by any method of this class at the moment.
        """
        self.vocab = vocab

    def isProbablyEmail(self, task):
        """Return True when more than four e-mail marker words occur in ``task``.

        Membership is tested with ``in``, so for a string ``task`` every
        marker is a substring match (e.g. ``"gmail"`` also counts for
        ``"mail"``).
        """
        hits = sum(1 for word in self._EMAIL_MARKERS if word in task)
        return hits > self._EMAIL_MARKER_THRESHOLD

    @staticmethod
    def _as_timedelta(clock_time):
        """Convert a ``datetime.time`` into the ``timedelta`` elapsed since midnight."""
        return datetime.combine(date.min, clock_time) - datetime.min

    @staticmethod
    def _merge_by_task(task_counters, task_order):
        """Sum counters sharing a task label; returns ``(counters, labels)`` tuples.

        Labels keep the order of their first occurrence. Empty input gives two
        empty tuples instead of failing on the ``zip(*...)`` unpacking.
        """
        merged = defaultdict(Counter)
        for counter, task in zip(task_counters, task_order):
            merged[task] += counter
        if not merged:
            return (), ()
        labels, counters = zip(*merged.items())
        return counters, labels

    def get_tasks_for_participant(self, path_to_data, participant, without_emails=True, ungrouped=False, filter=lambda x: word_tokenize(x), using_tfidf=False):
        """Collect word counts for every annotated task of one participant.

        Args:
            path_to_data: directory that contains one sub-directory per participant.
            participant: name of the participant's sub-directory.
            without_emails: when true, snapshots for which ``isProbablyEmail``
                returns True are discarded.
            ungrouped: when true, one counter per annotated interval is
                returned; otherwise counters with the same task label are summed.
            filter: accepted for compatibility; currently not used.
            using_tfidf: accepted for compatibility; currently not used.

        Returns:
            ``(task_counters, task_order)``: word ``Counter`` objects and the
            task labels aligned with them. These are lists when ``ungrouped``
            is true and tuples otherwise. Intervals without any word longer
            than two characters are omitted.

        Notes:
            The pickle is indexed as a sequence of ``(timestamp, text)`` items.
            The spreadsheet is read through its ``task``, ``start`` and ``end``
            columns and must contain a row whose task is ``"offset"``.
            ``start``/``end`` are assumed to be ``datetime.time`` values
            (they are passed to ``datetime.combine``) -- TODO confirm.
        """
        # NOTE(security): pickle.load can execute arbitrary code while loading;
        # only point this at trusted data files.
        with open(f"{path_to_data}/{participant}/fulltext.pkl", "rb") as f:
            snapshotsWithDates = pickle.load(f)

        df = pd.read_excel(f"{path_to_data}/{participant}/taskswitches_annotated.xlsx")
        offset = df[df["task"] == "offset"]["end"].iloc[0]
        df = df[df["task"] != "offset"]

        # The first snapshot was taken `offset` into the study, so subtracting
        # the offset gives the wall-clock time the annotations are relative to.
        studyStartTime = snapshotsWithDates[0][0] - self._as_timedelta(offset)

        task_counters = []
        task_order = []
        for _, row in df.iterrows():
            start = studyStartTime + self._as_timedelta(row["start"])
            end = studyStartTime + self._as_timedelta(row["end"])

            snapshotsInTask = [x[1].lower() for x in snapshotsWithDates if valid(x[0], start, end)]
            if without_emails:
                snapshotsInTask = [x for x in snapshotsInTask if not self.isProbablyEmail(x)]

            # Keep tokens longer than two characters. No stop-word/vocabulary
            # filtering or stemming is applied at this stage.
            words = []
            for snapshot in snapshotsInTask:
                words.extend(token for token in word_tokenize(snapshot) if len(token) > 2)

            if words:
                task_counters.append(Counter(words))
                task_order.append(row["task"])

        if not ungrouped:
            task_counters, task_order = self._merge_by_task(task_counters, task_order)

        return task_counters, task_order
    
def sim(v1, v2):
    """Return the cosine similarity of ``v1`` and ``v2`` as a scalar.

    Both inputs are reshaped to a single row (so they must offer
    ``reshape``, e.g. NumPy arrays) before being passed to
    ``cosine_similarity``; the only entry of the resulting matrix is returned.
    """
    row_a = v1.reshape(1, -1)
    row_b = v2.reshape(1, -1)
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0]

def create_results(method, predicted, expected, author, participant):
    """Build one result record per (predicted, expected) pair.

    Each record is a dict with the keys ``method``, ``predicted``,
    ``expected``, ``author``, ``participant`` and ``correct``, where
    ``correct`` is the value of ``equals(predicted, expected)``. Pairs are
    formed with ``zip``, so surplus items of the longer input are ignored.
    """
    records = []
    for guess, truth in zip(predicted, expected):
        records.append({
            "method": method,
            "predicted": guess,
            "expected": truth,
            "author": author,
            "participant": participant,
            "correct": equals(guess, truth),
        })
    return records

def predict_simple(tasks, order, phrases, occurrence_weight=1, coverage_weight=0):
    """Predict a label for every task by matching phrase words against its word counts.

    For each phrase the search terms are its lower-cased tokens that are not
    in ``stop_words`` and are longer than two characters (duplicates removed).
    Per task, two scores are computed for every phrase label:

    * occurrence: the summed counts of the search terms found in the task;
    * coverage: the fraction of the phrase's search terms found in the task.

    Both score sets are passed through ``normalize_score`` and combined as
    ``occurrence_weight * occurrence + coverage_weight * coverage``; the label
    is then picked with ``get_prediction``.

    Args:
        tasks: iterable of mappings word -> count (e.g. ``Counter``), one per task.
        order: iterable of the true labels, aligned with ``tasks``.
        phrases: DataFrame with the columns ``"phrase"`` (text) and
            ``"expected"`` (label the phrase stands for). If a label occurs in
            several rows, the last row determines its scores.
        occurrence_weight: weight of the occurrence score (default 1).
        coverage_weight: weight of the coverage score (default 0, i.e. the
            coverage score does not influence the prediction).

    Returns:
        ``(predictions, expected)``: two lists of equal length holding the
        predicted and the true label of every task.
    """
    # The search terms depend only on the phrase, so build them once rather
    # than once per task.
    phrase_terms = []
    for _, row in phrases.iterrows():
        lowered = (token.lower() for token in word_tokenize(row["phrase"]))
        terms = {term for term in lowered if term not in stop_words and len(term) > 2}
        phrase_terms.append((row["expected"], terms))

    predictions = []
    expected = []
    for task, actual in zip(tasks, order):
        expected.append(actual)

        scores = {}
        cover_scores = {}
        for label, terms in phrase_terms:
            matched = [word for word in terms if word in task]
            scores[label] = sum(task[word] for word in matched)
            # Fraction of the phrase's terms present in the task, computed once
            # after matching; a phrase without usable terms covers nothing.
            cover_scores[label] = len(matched) / len(terms) if terms else 0

        # Normalise each score set on its own dict so they stay independent.
        scores = normalize_score(scores)
        cover_scores = normalize_score(cover_scores)

        combined = {
            label: scores[label] * occurrence_weight + cover_scores[label] * coverage_weight
            for label in scores
        }
        predictions.append(get_prediction(combined))

    return predictions, expected


In [30]:
# Maps participant id -> (tasks, order) as returned by
# get_tasks_for_participant; the evaluation loop fills it and reuses entries
# instead of extracting a participant's tasks again.
cache = {}

In [31]:
# Evaluate the "simple" matcher: for every participant and every phrase author,
# predict a label per task and collect one result record per prediction.
df = pd.read_excel("../phrases.xlsx")
results = []
participants = [
    "P01", "P02", "P03", "P04", "P05", "P06", "P07", "P08", "P11",
    "P12", "P13", "P14", "P15", "P16", "P17", "P18", "P19",
]
path_to_data = "../../archives/"
task_extractor = ScreenshotTaskExtractor(model.wv.vocab)

for participant in participants:
    print("START ----- ")
    print(participant)

    # Extract a participant's tasks only when they are not cached yet.
    if participant not in cache:
        cache[participant] = task_extractor.get_tasks_for_participant(
            path_to_data, participant, using_tfidf=True, ungrouped=True
        )
    tasks, order = cache[participant]

    for author in df["author"].unique():
        # Restrict the phrase table to the phrases written by this author.
        task_descriptions = df[df["author"] == author]
        print("Matching phrases by author: " + author)
        predicted, expected = predict_simple(tasks, order, task_descriptions)
        results += create_results("simple", predicted, expected, author, participant)
    print("END -----")

# Overall metrics across all participants and authors.
y_true = [entry["expected"] for entry in results]
y_pred = [entry["predicted"] for entry in results]

print(accuracy_score(y_true, y_pred))
print(precision_recall_fscore_support(y_true, y_pred))

    
    
    


START ----- 
P01
Matching phrases by author: thomas
Matching phrases by author: gail
END -----
START ----- 
P02
Matching phrases by author: thomas
Matching phrases by author: gail
END -----
START ----- 
P03




Matching phrases by author: thomas
Matching phrases by author: gail
END -----
START ----- 
P04
Matching phrases by author: thomas
Matching phrases by author: gail
END -----
START ----- 
P05
Matching phrases by author: thomas
Matching phrases by author: gail
END -----
START ----- 
P06
Matching phrases by author: thomas
Matching phrases by author: gail
END -----
START ----- 
P07
Matching phrases by author: thomas
Matching phrases by author: gail
END -----
START ----- 
P08
Matching phrases by author: thomas
Matching phrases by author: gail
END -----
START ----- 
P11
Matching phrases by author: thomas
Matching phrases by author: gail
END -----
START ----- 
P12
Matching phrases by author: thomas
Matching phrases by author: gail
END -----
START ----- 
P13
Matching phrases by author: thomas
Matching phrases by author: gail
END -----
START ----- 
P14
Matching phrases by author: thomas
Matching phrases by author: gail
END -----
START ----- 
P15
Matching phrases by author: thomas
Matching phrase

In [17]:
# Write the collected result records to a spreadsheet.
# NOTE: this rebinds `df`, the name the evaluation cell uses for the phrases
# table read from ../phrases.xlsx.
df = pd.DataFrame(results)
df.to_excel("matching_results_ungrouped.xlsx")
