In [1]:
import csv
import glob
import os
import pandas as pd
import re
from  dateutil.parser import parse

pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Parse relevent parts of mobtool

There are at least 3 variations of the first field in the header rows:

    1) header row = `ID` <br>
    2) header row = `TR ID#`<br>
    3) header row = `Legal First Name`<br>
    
Additionally, there are variations in which row is the header. Since I can't trust the consistancy, I will read each line until we find one of the variations of header rows. Once we have that row number, we can import into a pandas dataframe and skiplines to the header.

In [91]:
def load_data(file):
    """
    Loads a CSV file into a dataframe and drops NaN columns
    input: Path to a csv file
    output: Dataframe
    """
    # Find the header row
    num_rows = find_header_row(file)
    df = pd.read_csv(file, skiprows=num_rows)
    # Drop columns where all values are NaN
    df = df.dropna(axis='columns', how='all')
    return df


def find_header_row(file):
    """
    Read each line in a CSV file and return a row number for the header
    input: Path to a csv file
    output: integer (the row number)
    """
    with open(file,'r') as infile:
        reader = csv.reader(infile, delimiter=',', quotechar='"')
        row_num = 0
        for row in reader:
            if (row[0] == 'ID') | (row[0] == 'TR ID#') | (row[0] == 'Legal First Name'):
                break
            else:
                row_num += 1
                next
    return(row_num)


def write_csv(file, df):
    """
    Writes a CSV file to the "cleaned" directory
    input: a cleaned dataframe (after load_data())
    output: csv file
    """
    path_cleaned = 'cleaned/'
    os.makedirs(path_cleaned, exist_ok=True)
    path_cleaned_file = path_cleaned+file
    df.to_csv(path_cleaned_file, index=None)
    
    
def parse_op_name(filename):
    """
    Parses an operation name from a mobtool's file name
    """
    filename_parts = re.split(', | - ', filename)
    for part in filename_parts:
        part = part.upper()
        # Strip superfluous text, punctuation, and characters
        if "op".upper() in part:
            op_name = part.replace("PERSONNEL PLANNING TOOL", "").replace("MOBILIZATION TOOL","")
            op_name = op_name.replace("_", "").replace("'","")
            if "operation".upper() not in op_name:
                op_name= op_name.replace("op".upper(), "Operation".upper())            
    return op_name


def standardize_dates(df):
    mapper={}
    for col in df.columns:
        try:
            dt = parse(str(col))
            new_date = dt.strftime('%Y-%m-%d')
            mapper[col] = new_date 
        except:
            next
    df.rename(mapper, axis=1, inplace=True)
    return df


def get_start_end_dates(series):
    """
    Gets the dates a volunteers first and last deployed. Also returns total days deployed.
    This does not take into account that volunteers can deploy multiple times on an op and 
    does not count travel days.
    
    input: a Pandas series (dataframe row) 
    output: tuple, first date, last date of deployment, and total days deployed
    
    This requires the dates to be standardized and numercial
    """
    
    df = series.to_frame().transpose()
    
    dropcols=[]
    for col in df.columns:
        try: 
            int(col[0])        
        except:
            dropcols.append(col)
    df.drop(columns=dropcols, inplace=True)
    df.dropna(axis=1, inplace=True)
    
    dates=[]
    for col in df.columns:
        dt = parse(str(col))
        dates.append(dt.strftime('%Y-%m-%d'))
    if len(dates) > 0:
        start = min(dates)
        end = max(dates)
        total_days_deployed = len(df.columns)
        return start, end, total_days_deployed
    else:
        return None, None, None
    
def merge_dataframes(dfs):
    final_columns = [
        'TR ID#', 'OP_NAME', 'START_DATE', 'END_DATE', 'TOTAL_DAYS',
        'First Name', 'Last Name', 'Legal First Name', 'Email', 
        'Phone #', 'Position', 'Contact Number (ex:143.143.1234)']
    final_df = pd.concat(dfs, axis=0, join="outer")
    final_df = final_df[final_columns]
    
    # Drop rows where these columns have Null values
    drop_columns = ['TR ID#','First Name', 'Last Name', 'Legal First Name', 'Email', 
        'Phone #', 'Position', 'Contact Number (ex:143.143.1234)']
    
    final_df.dropna(subset=drop_columns, how='all', inplace=True)
    return final_df

In [93]:
def main(input_dir):
    if "/" not in input_dir:
        input_dir = input_dir + "/"
    input_files = [f.strip("input/") for f in glob.glob(input_dir+'/*.csv')]
    
    dfs=[]
    for file in input_files:
        # Load and clean
        df = load_data(input_dir+file)
        
        for col in df.columns:
            if col not in all_columns:
                all_columns.append(col)
           
        # Parse Op name and add to the dataframe
        op_name = parse_op_name(file)
        df['OP_NAME'] = op_name
        print(op_name)
        
        # Standardize the format of all the dates
        df = standardize_dates(df)

        # Get the dates
        for row in range(len(df)):
            series = df.iloc[row]
            start, end, total_days_deployed = get_start_end_dates(series)
            df.at[row,'START_DATE'] = str(start)
            df.at[row,'END_DATE'] = str(end)
            df.at[row,'TOTAL_DAYS'] = total_days_deployed
        
        # Add dataframes to a list
        dfs.append(df)        
        
        # Write to file
        write_csv(file, df)
    
    # Concat all dataframes
    final_df = merge_dataframes(dfs)
    
    # Write full dataset to file
    write_csv('final_df.csv', final_df)
        
    print("Done.")
    return dfs,all_columns,final_df

dfs,all_columns,final_df = main('input/')

OPERATION SKIPPING CHRISTMAS
OPERATION CRAZY TRAIN
OPERATION COAL MINERS DAUGHTER
OPERATION DOUBLE TROUBLE
OPERATION OLD PUT
OPERATION TWISTED TRUNK 
OPERATION HUCKLEBERRY HUSTLE
OPERATION PALMETTO PUNCH 
OPERATION RIGHT STUFF
OPERATION BARBED WIRE
OPERATION OLD ANCHOR
OPERATION BIG DIG
OPERATION BREDO RISING 
OPERATION SLEEPING BEAR
Done.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




In [90]:
merge_dataframes(dfs)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  from ipykernel import kernelapp as app


Unnamed: 0,TR ID#,OP_NAME,START_DATE,END_DATE,TOTAL_DAYS,First Name,Last Name,Legal First Name,Email,Phone #,Position,Contact Number (ex:143.143.1234)
0,,OPERATION SKIPPING CHRISTMAS,,,,Robert,Obernier,,obernier@teamrubiconusa.org,850-443-1060,IMT,
1,TR0013518,OPERATION SKIPPING CHRISTMAS,2015-12-26,2015-12-29,4.0,Lloyd,Weema,,llw5@case.edu,704.778.0399,VOL,
2,TR0010619,OPERATION SKIPPING CHRISTMAS,2015-12-25,2016-01-01,8.0,James,Threadgill,,jamesthreadgill@gmail.com,901-581-2889,PSC,
3,TR0001322,OPERATION SKIPPING CHRISTMAS,2015-12-25,2016-01-01,8.0,Conrad,McCloskey,,conrad.mccloskey@teamrubicon.org,907.244.4952,Sawyer,
4,TR0016326,OPERATION SKIPPING CHRISTMAS,2015-12-25,2015-12-28,4.0,Jacob,Nilz,,j_nilz@yahoo.com,501.944.3784,VOL,
5,TR0011539,OPERATION SKIPPING CHRISTMAS,2015-12-26,2016-01-01,7.0,Chad,Hargon,,chad.hargon@teamrubiconusa.org,318.732.7319,VOL,
6,TR500188,OPERATION SKIPPING CHRISTMAS,2015-12-29,2016-01-01,4.0,Matthew,Watson,,Mattheww7@gmail.com,254-338-6706,,
7,TR402433,OPERATION SKIPPING CHRISTMAS,2015-12-25,2016-01-01,8.0,James,Price,,Jimprice1@charter.net,256-625-0086,VOL,
8,TR0005126,OPERATION SKIPPING CHRISTMAS,2015-12-26,2016-01-01,7.0,James,Laman,,james.laman@teamrubiconusa.org,864.344.2985,LSC,
9,TR400240,OPERATION SKIPPING CHRISTMAS,2015-12-26,2015-12-31,6.0,Travis,Tanguay,,travis_tanguay@yahoo.com,803.760.6424,VOL,
