In [23]:
import glob
import json
from datetime import date, datetime
from itertools import groupby

import pandas as pd
from IPython.core.debugger import set_trace

In [277]:
def load_all_json(limit_file=150):
    """Load and concatenate the records of every JSON file in both data folders.

    Files matching ``data/original_run/*.json`` are read first, then those
    matching ``data/second_set/*.json``. Each file's top-level JSON value is
    iterated and its items are appended to one flat list.

    Args:
        limit_file: Accepted for backward compatibility but ignored; this
            function has never read it, so all matching files are loaded.

    Returns:
        A flat list with the items of every loaded file.
    """
    patterns = ("data/original_run/*.json", "data/second_set/*.json")
    entries = []
    for pattern in patterns:
        for filename in glob.glob(pattern):
            # Context manager so each handle is closed as soon as it is parsed.
            with open(filename) as handle:
                entries.extend(json.load(handle))
    return entries
     

In [278]:
RELEVANT_FIELDS = ["start_date","end_date","name","company","description","industry"]

# Format of the raw date strings accepted by prepare_entry.
_DATE_FORMAT = '%Y-%m-%d'


def _to_date(value):
    """Return `value` as a `date`: parse strings, pass existing dates through."""
    if isinstance(value, datetime):
        return value.date()
    if isinstance(value, date):
        # Already converted by an earlier run of the preparation cell.
        return value
    return datetime.strptime(value, _DATE_FORMAT).date()


def prepare_entry(entry):
    """Parse the dates of one raw entry and return its relevant fields.

    Args:
        entry: Dict describing one job record.

    Returns:
        A new dict restricted to RELEVANT_FIELDS with `start_date` and
        `end_date` as `date` objects, or None when either date is missing,
        empty, or cannot be parsed (the parse error is printed).

    Side effect:
        On success the parsed dates are also written back into `entry`,
        because prepare_data returns the original entry objects. On failure
        `entry` is left untouched.
    """
    start_raw = entry.get("start_date")
    end_raw = entry.get("end_date")
    if not (start_raw and end_raw):
        return None

    # Parse both dates before touching `entry` so a bad end date cannot leave
    # a half-converted record behind.
    try:
        start = _to_date(start_raw)
        end = _to_date(end_raw)
    except (TypeError, ValueError) as e:
        print(e)
        return None

    entry["start_date"] = start
    entry["end_date"] = end
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    return {k: v for k, v in entry.items() if k in RELEVANT_FIELDS}

def prepare_data(entries):
    """Return the entries that prepare_entry accepts, in their original order.

    The returned items are the original entry objects, not the dicts returned
    by prepare_entry; that return value is only used as a keep/drop flag.

    NOTE(review): the field-filtered dict from prepare_entry is discarded
    here; confirm whether returning the full entries is intended.
    """
    kept = []
    for entry in entries:
        if prepare_entry(entry):
            kept.append(entry)
    return kept
        
def group_by_name(entries):
    """Group entries by their "name" field.

    Args:
        entries: Iterable of dicts, each with a "name" key.

    Returns:
        A list of ``(name, entries_for_name)`` pairs ordered by name. Each
        group is a list that keeps the input order of its entries (the sort
        is stable). The pairs are materialised rather than returned as a lazy
        ``groupby`` object, so the result can be iterated more than once.
    """
    def by_name(entry):
        return entry["name"]

    # Sort the argument itself; groupby only merges adjacent equal keys.
    ordered = sorted(entries, key=by_name)
    return [(name, list(group)) for name, group in groupby(ordered, by_name)]

def _chronological_key(job):
    """Sort key: jobs with a real start date first (oldest first), others last."""
    start = job.get("start_date")
    if hasattr(start, "toordinal"):
        return (0, start.toordinal())
    # Missing or unparsed start date: keep these after all dated jobs.
    return (1, 0)


def group_job_by_industry(jobs):
    """Split one person's jobs into consecutive runs of the same industry.

    Jobs are ordered by `start_date` before grouping. Sorting the dicts
    themselves is a TypeError on Python 3 and not chronological on Python 2.

    Args:
        jobs: Iterable of job dicts, each with an "industry" key and
            normally a `date` under "start_date".

    Returns:
        A list of ``[industry, jobs_in_run]`` lists in chronological order.
        An empty input yields an empty list.
    """
    ordered = sorted(jobs, key=_chronological_key)
    return [
        [industry, list(run)]
        for industry, run in groupby(ordered, key=lambda job: job["industry"])
    ]

def add_job_durations(person):
    """Add a "duration" key (in days) to every job of one person, in place.

    Args:
        person: Iterable of job dicts with "start_date" and "end_date".

    Returns:
        The same `person` object that was passed in. A job whose dates are
        missing or cannot be subtracted gets the sentinel duration -1.
    """
    for job in person:
        try:
            duration = (job["end_date"] - job["start_date"]).days
        except (KeyError, TypeError, AttributeError):
            # Missing or unparsed dates. Catching only these lets
            # KeyboardInterrupt / SystemExit propagate.
            duration = -1

        job["duration"] = duration

    return person

def normalize_by_macrocategory(grouped_jobs):
    """Collapse intelligence-related industries into the macro-category "intel".

    Args:
        grouped_jobs: Sequence of ``(industry, job_list)`` pairs.

    Returns:
        A list of ``(industry, job_list)`` tuples where every industry listed
        in `intel_industries` is replaced by "intel". Adjacent groups that end
        up with the same industry are merged into one group, so a run such as
        Military -> Defense & Space becomes a single "intel" group.

    Side effect:
        Each job dict's "industry" value is rewritten in place to its
        normalized form. The input job lists are not modified.
    """
    intel_industries = ["Military","Security and Investigations","Information Technology and Services",
                        "Computer & Network Security","Defense & Space"
                       ]

    def transform(industry):
        return "intel" if industry in intel_industries else industry

    normalized = []
    for industry, job_list in grouped_jobs:
        macro = transform(industry)
        for job in job_list:
            job["industry"] = transform(job["industry"])

        if normalized and normalized[-1][0] == macro:
            # Same macro-category as the previous group: extend it.
            normalized[-1][1].extend(job_list)
        else:
            # Copy so that later merging never grows the caller's list.
            normalized.append((macro, list(job_list)))

    return normalized
        
        

def count_industry_flickering(grouped_jobs):
    """Count how often a person left an industry and then came back to it.

    The groups are first passed through normalize_by_macrocategory. An
    interior group counts as one flicker when the groups directly before and
    after it share an industry, its own industry differs from theirs, and the
    "duration" values of its jobs sum to more than zero.

    Args:
        grouped_jobs: Sequence of ``(industry, job_list)`` pairs.

    Returns:
        The number of flickers found.
    """
    groups = normalize_by_macrocategory(grouped_jobs)
    flickers = 0
    # Walk every (previous, current, next) window over the group list.
    for (before, _), (middle, middle_jobs), (after, _) in zip(groups, groups[1:], groups[2:]):
        if before == after and middle != before:
            if sum([job["duration"] for job in middle_jobs]) > 0:
                flickers += 1
    return flickers
              
def visualize_career(person):
    """Print a text summary of one processed person.

    Prints the name, the industry-change count, then one
    ``industry|company|duration`` line per job, followed by a separator.

    Args:
        person: Dict with "name", "industry_changes" and "grouped_jobs"
            (a list of job lists).
    """
    print("Name: ",person["name"])
    print("Industry changes: ",person["industry_changes"])

    job_lines = []
    for job_list in person["grouped_jobs"]:
        for job in job_list:
            fields = [job["industry"], job["company"], str(job["duration"])]
            job_lines.append("|".join(fields))

    print("\t\n".join(job_lines))
    print("-------------------_")
        
    

def process_persons(persons):
    """Build a per-person career summary.

    Args:
        persons: Iterable of ``(name, jobs)`` pairs, where `jobs` is an
            iterable of job dicts.

    Returns:
        A dict keyed by name. Each value holds "name", "grouped_jobs" (the
        job lists from group_job_by_industry, without the industry labels)
        and "industry_changes" (the count_industry_flickering result).
    """
    summaries = {}
    for name, jobs_iter in persons:
        jobs = list(jobs_iter)
        # Adds a "duration" key to every job dict in place.
        add_job_durations(jobs)
        by_industry = group_job_by_industry(jobs)

        summaries[name] = {
            "name": name,
            "grouped_jobs": [industry_jobs for _, industry_jobs in by_industry],
            "industry_changes": count_industry_flickering(by_industry),
        }

    return summaries
    
    

In [279]:
# Read every raw job record from the JSON files under data/.
data = load_all_json()


In [280]:
# Keep only the entries whose dates parse; the messages printed below are the
# date strings that were rejected.
prepared_data = prepare_data(data)

time data '20770-03-01' does not match format '%Y-%m-%d'
time data '-1995-06-08' does not match format '%Y-%m-%d'
time data '-2012-05-01' does not match format '%Y-%m-%d'
time data '-2011-08-03' does not match format '%Y-%m-%d'
time data '-2012-05-01' does not match format '%Y-%m-%d'
time data '-2011-08-03' does not match format '%Y-%m-%d'
time data '21117-03-24' does not match format '%Y-%m-%d'
time data '-1997-11-04' does not match format '%Y-%m-%d'
time data '-2010-01-07' does not match format '%Y-%m-%d'
time data '21117-03-24' does not match format '%Y-%m-%d'
time data '-1997-11-04' does not match format '%Y-%m-%d'
time data '20877-03-01' does not match format '%Y-%m-%d'
time data '-0006-08-04' does not match format '%Y-%m-%d'
time data '20877-03-01' does not match format '%Y-%m-%d'
time data '-2010-09-01' does not match format '%Y-%m-%d'
time data '-1997-11-04' does not match format '%Y-%m-%d'
time data '20770-03-01' does not match format '%Y-%m-%d'
time data '21117-03-24' does no

In [281]:
# Group the prepared entries (the variable, not the prepare_data function) by person.
grouped_data = group_by_name(prepared_data)

In [None]:
# Summarise each person's career: durations, industry runs, industry changes.
processed_persons = process_persons(grouped_data)

In [None]:
# The ten people with the most industry changes, highest first.
# (`[:10:-1]` returned everything except the first eleven items, reversed.)
top_per_change = sorted(
    processed_persons.items(),
    key=lambda item: item[1]["industry_changes"],
    reverse=True,
)[:10]

In [None]:
# Plain loop: visualize_career only prints, so a list comprehension would just
# echo a list of None as the cell output.
for _name, summary in top_per_change:
    visualize_career(summary)