In [1]:
import networkx as nx
import matplotlib.pyplot as plt
import time as tm
import datetime
import pandas as pd
import os
from datastructures import Action, Domain, CircularList
from utilities import combine_timeproposals, domain_suggestions, combine_suggestions
from traverse import breathtraverse
import sys
import os

In [4]:
# Variables
path = "./data"

In [16]:
def clean_file_row(input):
    """ Cleans the input string from double quotes, \n and whitespaces """
    input = input.rstrip()
    input = "".join(input.split())
    input = input.replace("\"", "")
    return input

In [47]:
def get_domain(link, domains):
    """ Get domain from link and add to the pandas domain list """
    domain_index = link.index('//') + 2
    domain = link[domain_index:link.index('/', domain_index)]
    domain = domain.replace("www.", "")  # [:domain.rindex('.')]
    if domain not in domains.keys():
        domains[domain] = 1
    else:
        domains[domain] = domains[domain] + 1
    return domains

In [74]:
def fillstructures(path):
    """ Read out all csv data files from a given directory """
    print("Reading all previous data files...")
    clicks = dict()
    count = 0
    domains = dict()
    for file in os.listdir(path):
        try:
            iterrows = iter(open(path + "/"+file))
            for row in iterrows:
                row = clean_file_row(row)
                # If an empty row (eg end of file) or JS link
                if not row and "javascript" not in row.lower():
                    continue
                    
                rowparts = row.split(",")
                act = rowparts[1]
                if act == "load":
                    domains = get_domain(rowparts[2], domains)
                if act not in ["load", "click"]:
                    continue
                if act == "click" and "//" not in rowparts[3]:
                    continue
                rowparts[0] = tm.mktime(tm.strptime(rowparts[0].split('.')[0], "%Y-%m-%dT%H:%M:%S"))
                clicks[len(clicks) + 1] = rowparts
            print("  - Read file: ", file)
        except:  # If an import still fails, skip file & keep count
            count += 1
            print("Skipped file:", file)
    print("Finished reading, skipped files:", count)
    df = pd.DataFrame.from_dict(clicks, orient='index')
    #df.columns = ['timestamp', 'action', 'previous', 'next', 'domain']
    df = df.reset_index()
    return df, domains

In [75]:
df, domains = fillstructures(path)

# Preview
df

Reading all previous data files...
  - Read file:  march_13.csv
  - Read file:  march_20.csv
  - Read file:  march_22.csv
Finished reading, skipped files: 0


Unnamed: 0,index,0,1,2,3,4
0,1,1457857876,load,https://www.google.be/?gws_rd=ssl,,
1,2,1457857882,click,https://www.google.be/?gws_rd=ssl#q=postsecret,https://www.google.be/url?sa=t&rct=j&q=&esrc=s...,d.bGQ
2,3,1457857884,load,http://postsecret.com/,,
3,4,1457858230,load,https://www.youtube.com/,,
4,5,1457858264,click,https://www.youtube.com/results?search_query=g...,https://www.youtube.com/user/rhettandlink2,
5,6,1457858319,click,https://www.youtube.com/user/rhettandlink2,https://www.youtube.com/playlist?list=PLJ49NV7...,
6,7,1457858320,load,https://www.youtube.com/playlist?list=PLJ49NV7...,,
7,8,1457858330,load,https://www.youtube.com/user/rhettandlink2,,
8,9,1457858335,click,https://www.youtube.com/user/rhettandlink2,https://www.youtube.com/watch?v=Vuk2VXWNvxA,
9,10,1457859019,click,https://www.youtube.com/watch?v=Vuk2VXWNvxA,https://www.youtube.com/channel/UC4PooiX37Pld1...,


In [76]:
df_load = df[df[1] == "load"]
df_load

Unnamed: 0,index,0,1,2,3,4
0,1,1457857876,load,https://www.google.be/?gws_rd=ssl,,
2,3,1457857884,load,http://postsecret.com/,,
3,4,1457858230,load,https://www.youtube.com/,,
6,7,1457858320,load,https://www.youtube.com/playlist?list=PLJ49NV7...,,
7,8,1457858330,load,https://www.youtube.com/user/rhettandlink2,,
10,11,1457859050,load,https://www.google.be/search?q=wait+but+why&ie...,,
12,13,1457859054,load,http://waitbutwhy.com/,,
14,15,1457859104,load,http://waitbutwhy.com/2015/11/the-cook-and-the...,,
15,16,1457859440,load,http://www.tijd.be/,,
17,18,1457859452,load,http://www.tijd.be/marktenlive,,


In [52]:
domains

{'1dayfly.com': 4,
 'aftleuven.be': 2,
 'cosic.esat.kuleuven.be': 4,
 'cygnus.cc.kuleuven.be': 2,
 'f-secure.com': 1,
 'google.be': 11,
 'ibood.com': 2,
 'idp.kuleuven.be': 13,
 'ipython.org': 1,
 'jupyter.org': 1,
 'kuleuven.be': 3,
 'labsblog.f-secure.com': 2,
 'linkedin.com': 6,
 'newsfromthelab.files.wordpress.com': 1,
 'noisli.com': 1,
 'onclickads.net': 1,
 'p.cygnus.cc.kuleuven.be': 25,
 'people.cs.kuleuven.be': 1,
 'postsecret.com': 2,
 'putlocker.is': 3,
 'standaard.be': 12,
 'tijd.be': 9,
 'toledo.kuleuven.be': 14,
 'tweakers.net': 1,
 'waitbutwhy.com': 2,
 'wet.kuleuven.be': 3,
 'wiki.associatie.kuleuven.be': 3,
 'youtube.com': 8}

In [None]:
# Text file parser
# Returns a dictionary with imageid - text in lowercase without stopwords or punctuation
def text_file_parser(filename, stopwords, model):
    corpus = dict()
    #corpus = pd.DataFrame(columns=('id', 'imageid', 'vec'))
    with open(filename) as f:
        for doc in f:
            # Split on spaces
            doc_parts = doc.split(" ", 1)
            # If first part is the ID (needed for the queries file)
            if(len(doc_parts[0]) < 6):
                doc_parts = doc.split(" ", 2)
                doc_parts.pop(0)
            # Clean the caption text (remove puctuation etc)
            doc_parts[1] = clean_input(doc_parts[1], stopwords)
            # Add the vector of the caption to the array
            doc_parts.append(sentence_to_vector(model, doc_parts[1]))
            # add the array (or row) to an array
            corpus[len(corpus) + 1] = doc_parts

    # Transform to dataframe
    df = pd.DataFrame.from_dict(corpus, orient='index')
    df = df.reset_index()
    df.columns = ['index', 'img_id', 'caption', 'vec']
    return df

In [12]:
frame = pd.DataFrame.from_csv("data/march_13.csv")

CParserError: Error tokenizing data. C error: Expected 4 fields in line 3, saw 5


In [6]:
list_ = []

for file in os.listdir(path):
    iterrows = iter(open(path + "/"+file))
    for row in iterrows:
        filename = path + "/"+file
        print(filename)
        frame = pd.read_csv(filename,index_col=None, header=0)
        list_.append(frame)
df = pd.concat(list_)

# preview
df

./data/march_13.csv


CParserError: Error tokenizing data. C error: Expected 4 fields in line 3, saw 5
