In [None]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings("ignore")
from datetime import date, datetime, timedelta

import altair as alt
# for the notebook only (not for JupyterLab) run this command once per session
alt.renderers.enable('notebook')
# max_rows=None removes the row cap of the default data transformer so charts
# can be built from the full dataset.
alt.data_transformers.enable('default', max_rows=None)

In [None]:
# Load the source dataset; every analysis cell below starts from this frame.
df = pd.read_csv('./data/VietnamConflict.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Column dtypes and non-null counts of the raw data.
df.info()

In [None]:
# NOTE: Altair only works if the column is converted to datetime with pd.to_datetime.
# errors='coerce' turns unparseable values into NaT, so the column is always
# datetime64. The previous errors='ignore' is deprecated in pandas and silently
# returned the column unconverted when any value failed to parse, which broke
# the `.dt` accessor used further down.
df['FATALITY_DATE'] = pd.to_datetime(df['FATALITY_DATE'], format='%Y%m%d', errors='coerce')

In [None]:
# Keep only the two ground-combat branches. `.copy()` gives df1 its own data so
# adding the 'category' column below neither triggers SettingWithCopyWarning
# nor depends on whether pandas returned a view of `df`.
df1 = df[df['BRANCH'].isin(['ARMY', 'MARINE CORPS'])].copy()

# POSITION values that are treated as infantry roles.
infantry_MOS = [
    'INFANTRY OPERATIONS AND INTELLIGENCE SPECIALIST',
    'INDIRECT FIRE INFANTRYMAN',
    'INFANTRY UNIT LEADER',
    'PARACHUTIST, INFANTRY UNIT COMMANDER',
    'INFANTRYMAN',
    'RIFLEMAN',
    'MACHINEGUNNER',
    'INFANTRY OFFICER (I)',
    'ASSAULTMAN',
    'HEAVY ANTI-ARMOR WEAPONS INFANTRYMAN',
    'MORTARMAN',
    'INFANTRY UNIT COMMANDER',
    'BASIC INFANTRY OFFICER',
    'RANGER, OPERATIONS AND TRAINING STAFF OFFICER (G3,A3,S3)',
    'INFANTRY SENIOR SERGEANT',
    'BASIC INFANTRYMAN',
    'RANGER, UNIT OFFICER, TRAINING CENTER',
    'RANGER, INFANTRY UNIT COMMANDER',
    'RANGER',
    'INFANTRY UNIT COMMANDER, (MECHANIZED)',
    'LAV ASSAULTMAN',
    'SCOUT-SNIPER',
]

# Assign category to soldier
df1['category'] = np.where(df1['POSITION'].isin(infantry_MOS), 'infantry', 'non-infantry')

In [None]:
def plot_yearly_deaths(df, category, sort_order, text_align, dx, title):
    """Build one side of the yearly-deaths comparison chart for one category.

    Parameters
    ----------
    df : pandas.DataFrame
        Fatality records with at least the ``category`` and ``FATALITY_YEAR``
        columns. The frame is not modified.
    category : str
        Value of the ``category`` column to keep.
    sort_order : str
        Passed to ``alt.SortOrder`` for the x axis ('ascending' or
        'descending'); controls the direction in which the bars grow.
    text_align : str
        Horizontal alignment of the per-bar count labels.
    dx : int
        Horizontal pixel offset of the per-bar count labels.
    title : str
        Title of the bar chart.

    Returns
    -------
    tuple
        ``(chart, middle, text)``: the bar chart, the column of year labels
        meant to sit between two bar charts, and the count labels layer.
    """
    # Data transformation: one row per death, left-joined onto the full
    # 1960-1975 range so that years without deaths still appear with a count
    # of 0.
    years = pd.DataFrame({'FATALITY_YEAR': range(1960, 1976)})
    deaths = df[df['category'] == category].assign(count=1)
    yearly = pd.merge(years, deaths, on='FATALITY_YEAR', how='left')
    # Plain assignment instead of a chained `fillna(inplace=True)`, which does
    # not update the frame when pandas Copy-on-Write is active.
    yearly['count'] = yearly['count'].fillna(0)

    # Create bar chart of fatality count vs fatality year
    chart = alt.Chart(yearly).mark_bar().encode(
                y=alt.Y('FATALITY_YEAR:O', axis=None, title=""),
                x=alt.X('sum(count)', sort=alt.SortOrder(sort_order), scale=alt.Scale(domain=[0, 11000]), title='Count of deaths')
            ).properties(width=400, height=300, title=title)

    # Create text labels for each bar
    text = chart.mark_text(
               align=text_align,
               baseline='middle',
               dx=dx
           ).encode(
               text='sum(count)'
           )

    # Create shared 'spine' of range of years (one label per year)
    middle = alt.Chart(yearly.drop_duplicates('FATALITY_YEAR', keep='first')).encode(
                y=alt.Y('FATALITY_YEAR:O', axis=None),
                text=alt.Text('FATALITY_YEAR:O'),
            ).mark_text(
                align='center',
                baseline='middle',
                dx=0,
                dy=0
            ).properties(width=20, height=300)

    return chart, middle, text

In [None]:
# Ad-hoc check: run the function once for the infantry category and display
# whatever it returns.
plot_yearly_deaths(df1, 'infantry', 'descending', 'right', -3, 'Infantry (Army & Marine Corps)')

In [None]:
# Infantry bars are laid out with a descending x axis and right-aligned labels,
# non-infantry bars with an ascending axis and left-aligned labels.
I, years_axisI, textI = plot_yearly_deaths(
    df1, 'infantry', 'descending', 'right', -3, 'Infantry (Army & Marine Corps)'
)
NI, years_axisNI, textNI = plot_yearly_deaths(
    df1, 'non-infantry', 'ascending', 'left', 3, 'Non-Infantry (Army & Marine Corps)'
)

# Each side is bars + labels layered together; the year column sits between
# the two sides. Grid lines and the view border are switched off.
YEARLY_DEATHS = (
    alt.vconcat((I + textI) | years_axisI | (NI + textNI))
    .configure_axis(grid=False)
    .configure_view(strokeOpacity=0)
)

In [None]:
# Render the combined yearly-deaths chart.
YEARLY_DEATHS

In [None]:
def plot_yearly_deaths_cat(df, category, hostility_cond, sort_order, text_align, dx, axis_title, axis, title):
    """Build a yearly-deaths bar chart for one category and one hostility condition.

    Parameters
    ----------
    df : pandas.DataFrame
        Fatality records with at least the ``category``,
        ``HOSTILITY_CONDITIONS`` and ``FATALITY_YEAR`` columns. The frame is
        not modified.
    category : str
        Value of the ``category`` column to keep.
    hostility_cond : str
        Value of ``HOSTILITY_CONDITIONS`` to keep; the colour scale is defined
        for 'H' and 'NH'.
    sort_order : str
        Passed to ``alt.SortOrder`` for the x axis.
    text_align : str
        Horizontal alignment of the per-bar count labels.
    dx : int
        Horizontal pixel offset of the per-bar count labels.
    axis_title : str
        Title of the x axis.
    axis : alt.Axis or None
        Axis definition for the x axis; ``None`` hides it.
    title : str
        Title of the chart.

    Returns
    -------
    tuple
        ``(chart, text)``: the bar chart and its count labels layer.
    """
    color_scale = alt.Scale(
                    domain=['H','NH'],
                    range=["#ff0092", "#1f9dc4"]
                )

    # Data transformation: one row per matching death, left-joined onto the
    # full 1960-1975 range so that empty years still appear with a count of 0.
    years = pd.DataFrame({'FATALITY_YEAR': range(1960, 1976)})
    selected = df[(df['category'] == category) & (df['HOSTILITY_CONDITIONS'] == hostility_cond)]
    selected = selected.assign(count=1)
    yearly = pd.merge(years, selected, on='FATALITY_YEAR', how='left')
    # Plain assignment instead of a chained `fillna(inplace=True)`, which does
    # not update the frame when pandas Copy-on-Write is active.
    yearly['count'] = yearly['count'].fillna(0)
    # Padded (zero-death) years belong to the same hostility condition as the
    # real rows, so give them the same colour key instead of a null one.
    yearly['HOSTILITY_CONDITIONS'] = yearly['HOSTILITY_CONDITIONS'].fillna(hostility_cond)

    chart = alt.Chart(yearly).mark_bar().encode(
                y=alt.Y('FATALITY_YEAR:O', axis=None, title=''),
                x=alt.X('sum(count)', axis=axis, sort=alt.SortOrder(sort_order), scale=alt.Scale(domain=[0, 11000]), title=axis_title),
                color=alt.Color(
                    'HOSTILITY_CONDITIONS',
                    legend=alt.Legend(title='Hostility Conditions'),
                    scale=color_scale
                )
            ).properties(width=400, height=300, title=title)

    # Count labels drawn next to each bar
    text = chart.mark_text(
               align=text_align,
               baseline='middle',
               dx=dx
           ).encode(
               text='sum(count)'
           )

    return chart, text

In [None]:
# Top row: deaths under hostility condition 'H' (x axis hidden, titled).
I_H, text_I_H = plot_yearly_deaths_cat(
    df1, 'infantry', 'H', 'descending', 'right', -3, '', None, 'Infantry (Army & Marine Corps)'
)
NI_H, text_NI_H = plot_yearly_deaths_cat(
    df1, 'non-infantry', 'H', 'ascending', 'left', 3, '', None, 'Non-Infantry (Army & Marine Corps)'
)

# Bottom row: deaths under hostility condition 'NH' (default x axis, untitled).
I_NH, text_I_NH = plot_yearly_deaths_cat(
    df1, 'infantry', 'NH', 'descending', 'right', -3, '', alt.Axis(), ''
)
NI_NH, text_NI_NH = plot_yearly_deaths_cat(
    df1, 'non-infantry', 'NH', 'ascending', 'left', 3, '', alt.Axis(), ''
)

# Both rows reuse years_axisI (defined in the yearly-deaths cell) as the
# column of year labels between the infantry and non-infantry sides.
MONTHLY_MSK_CAT = (
    alt.vconcat(
        (I_H + text_I_H) | years_axisI | (NI_H + text_NI_H),
        (I_NH + text_I_NH) | years_axisI | (NI_NH + text_NI_NH),
    )
    .configure_axis(grid=False)
    .configure_view(strokeOpacity=0)
)

In [None]:
def plot_fatality_reason(df, category, sort_order, text_align, dx, title):
    """Build one side of the deaths-by-fatality-reason comparison chart.

    Parameters
    ----------
    df : pandas.DataFrame
        Fatality records with at least the ``FATALITY_2``, ``SERVICE_TYPE``
        and ``category`` columns. The frame is not modified.
    category : str
        Value of the ``category`` column to plot.
    sort_order : str
        Passed to ``alt.SortOrder`` for the x axis.
    text_align : str
        Horizontal alignment of the per-bar count labels.
    dx : int
        Horizontal pixel offset of the per-bar count labels.
    title : str
        Title of the bar chart.

    Returns
    -------
    tuple
        ``(chart, middle, text)``: the bar chart, the column of reason labels
        meant to sit between two bar charts, and the count labels layer.
    """
    # Fine-grained reasons are folded into broader groups before counting.
    recategorize = {'HELICOPTER CRASH -- PASSENGER (MC)': 'HELICOPTER CRASH',
                    'HELICOPTER CRASH -- CREW (MC)': 'HELICOPTER CRASH',
                    'AIRCRAFT CRASH -- PASSENGER (MC)': 'AIRCRAFT CRASH',
                    'AIRCRAFT CRASH -- CREW (MC)': 'AIRCRAFT CRASH',
                    'SHIP/SUBMARINE ACCIDENT AT SEA (MC)': 'ACCIDENT AT SEA',
                    'VEHICLE CRASH (MC)': 'VEHICLE CRASH',
                    'HEART RELATED': 'HEALTH RELATED',
                    'CANCER': 'HEALTH RELATED',
                    'STROKE': 'HEALTH RELATED',
                    'GUNSHOT': 'OTHER WEAPONS',
                    'BLUNT FORCE': 'OTHER WEAPONS',
                    'LAND MINE': 'OTHER WEAPONS',
                    'MISSILE': 'OTHER WEAPONS'}

    # Data transformation on a private copy, so the caller's frame stays
    # untouched and every call starts from the same input.
    cleaned = df.copy()
    cleaned['FATALITY_2'] = cleaned['FATALITY_2'].replace(recategorize).fillna('UNKNOWN')

    # Rank the *recategorized* reasons by record count over all categories, so
    # the y-scale domain matches the values that are plotted and is identical
    # for both sides of the comparison. 'UNKNOWN' always goes last.
    ranked = cleaned.groupby('FATALITY_2')['SERVICE_TYPE'].count().sort_values(ascending=False)
    sortby_list = [reason for reason in ranked.index if reason != 'UNKNOWN']
    if 'UNKNOWN' in ranked.index:
        sortby_list.append('UNKNOWN')

    fatality_reason = pd.DataFrame({'FATALITY_2': cleaned['FATALITY_2'].unique()})

    # One row per death in the requested category, left-joined onto every
    # known reason so that reasons without deaths appear with a count of 0.
    selected = cleaned[cleaned['category'] == category].assign(count=1)
    counted = pd.merge(fatality_reason, selected, on='FATALITY_2', how='left')
    counted['count'] = counted['count'].fillna(0)

    # Create bar chart of fatality count vs fatality reason
    chart = alt.Chart(counted).mark_bar().encode(
                y=alt.Y('FATALITY_2:O', title="", axis=None, scale=alt.Scale(domain=sortby_list)),
                x=alt.X('sum(count)', sort=alt.SortOrder(sort_order), scale=alt.Scale(domain=[0, 15000]), title='Count of deaths')
            ).properties(width=400, height=400, title=title)

    # Create text labels for each bar
    text = chart.mark_text(
               align=text_align,
               baseline='middle',
               dx=dx
           ).encode(
               text='sum(count)'
           )

    # Create shared 'spine' of ranked fatality reasons
    middle = alt.Chart(fatality_reason).encode(
                y=alt.Y('FATALITY_2:O', axis=None, scale=alt.Scale(domain=sortby_list)),
             ).mark_text(
                align='center',
                baseline='middle',
                dx=0,
                dy=0
             ).encode(
                text='FATALITY_2:O'
             ).properties(width=20, height=400)

    return chart, middle, text

In [None]:
# Infantry on the left (descending axis, right-aligned labels), non-infantry on
# the right (ascending axis, left-aligned labels).
I2, years_axisI2, textI2 = plot_fatality_reason(
    df1, 'infantry', 'descending', 'right', -3, 'Infantry (Army & Marine Corps)'
)
NI2, years_axisNI2, textNI2 = plot_fatality_reason(
    df1, 'non-infantry', 'ascending', 'left', 3, 'Non-Infantry (Army & Marine Corps)'
)

# Bars + labels are layered per side; the reason labels from the infantry call
# sit between the two sides. Grid lines and the view border are switched off.
FATALITY_REASON = (
    alt.hconcat((I2 + textI2) | years_axisI2 | (NI2 + textNI2))
    .configure_axis(grid=False)
    .configure_view(strokeOpacity=0)
)

In [None]:
# Rename the provinces: map the names found in the data to replacement labels.
# NOTE(review): several targets hold two names separated by '|' — presumably
# alternative spellings consumed elsewhere; confirm. `.str.title()` runs after
# the replacement, so it also re-cases the replacement labels.
recategorize = {
    'Thua Thien': 'Thua Thien - Hue',
    'Kontum': 'Kon Tum',
    'Darlac': 'Dak Lak|Dac Lac',
    'Gia Dinh': 'Ho Chi Minh City|Ho Chi Minh',
    'Phuoc Tuy': 'Ba Ria - VTau|Ba Ria-Vung Tau',
    'Pleiku': 'Gia Lai',
    'Bien Hoa': 'Dong Nai',
    'Quang Tin': 'Quang Nam',
}

df['DEPLOYMENT_PROVINCE'] = df['DEPLOYMENT_PROVINCE'].replace(recategorize)
df['DEPLOYMENT_PROVINCE'] = df['DEPLOYMENT_PROVINCE'].str.title()

# Share of all records per province (records with a FATALITY_DATE, divided by
# the total number of rows), largest first. The value is a fraction even though
# the column is called 'percentage'.
province_counts = df.groupby('DEPLOYMENT_PROVINCE')['FATALITY_DATE'].count()
death_perc = (
    (province_counts.sort_values(ascending=False) / len(df))
    .rename('percentage')
    .reset_index()
)

# Names of the ten provinces with the largest share.
death_perc_trunc = death_perc['DEPLOYMENT_PROVINCE'].head(10).tolist()

In [None]:
# Show the ten provinces with the largest share of records.
death_perc_trunc

In [None]:
# Hand-written list of the provinces that keep their own name (and colour) in
# plot_monthly_deaths; all other provinces are grouped as 'Other provinces'.
# NOTE(review): presumably a manual selection/reordering of death_perc_trunc —
# the ordering criterion is not shown in this notebook; confirm.
death_perc_trunc_sorted = ['Quang Tri', 'Thua Thien - Hue', 'Quang Nam', 'Quang Ngai',\
                           'Kon Tum', 'Binh Dinh', 'Binh Duong', 'Tay Ninh']

In [None]:
def plot_monthly_deaths(df, sortby_list):
    """Build the stacked monthly-deaths bar chart, coloured by province.

    Parameters
    ----------
    df : pandas.DataFrame
        Fatality records with at least ``FATALITY_YEAR``, a datetime
        ``FATALITY_DATE`` and ``DEPLOYMENT_PROVINCE``. The frame is not
        modified.
    sortby_list : list of str
        Provinces that keep their own name and colour, in colour-scale order.
        Every other province is grouped under 'Other provinces'. The list is
        not modified.

    Returns
    -------
    alt.Chart
        Bar chart of death counts per month from March 1965 to March 1973.

    Side effects
    ------------
    Writes the aggregated table to
    ``./data/deathPercentages_byProvince_byTime.csv`` and ``.json``.
    """
    other_label = 'Other provinces'
    # Build the province list locally instead of extending the caller's list:
    # the old in-place extend made the list grow on every re-run of the cell.
    # Dropping a pre-existing 'Other provinces' entry keeps the result stable
    # even if the list was already extended by an earlier run.
    provinces = [name for name in sortby_list if name != other_label]
    color_domain = provinces + [other_label]

    year_months = pd.DataFrame({
        'FATALITY_DATE': pd.date_range(start='1965 March', end='1973 March', freq='MS')
                           .strftime("%Y %b").tolist()
    })

    # Keep only records whose FATALITY_YEAR agrees with the year of
    # FATALITY_DATE. Comparing against `.dt.year` also drops rows with a
    # missing date, where the old string/int round trip raised.
    aligned = df[df['FATALITY_YEAR'] == df['FATALITY_DATE'].dt.year].copy()

    aligned['DEPLOYMENT_PROVINCE_1'] = np.where(
        aligned['DEPLOYMENT_PROVINCE'].isin(provinces),
        aligned['DEPLOYMENT_PROVINCE'],
        other_label,
    )
    month_key = aligned['FATALITY_DATE'].dt.strftime("%Y %b")
    counts = (
        aligned.groupby([month_key, 'DEPLOYMENT_PROVINCE_1'])['FATALITY_YEAR']
        .count()
        .sort_values()
        .reset_index()
        .rename(columns={'FATALITY_YEAR': 'count'})
    )

    # Left join onto the month range: months without deaths get a count of 0,
    # and deaths outside the range are dropped.
    monthly = pd.merge(year_months, counts, on='FATALITY_DATE', how='left')
    monthly['FATALITY_DATE'] = pd.to_datetime(monthly['FATALITY_DATE'], format='%Y %b')
    # Plain assignment instead of a chained `fillna(inplace=True)`, which does
    # not update the frame when pandas Copy-on-Write is active.
    monthly['count'] = monthly['count'].fillna(0)

    monthly.to_csv("./data/deathPercentages_byProvince_byTime.csv", index=False)
    monthly.to_json('./data/deathPercentages_byProvince_byTime.json', orient='records')

    # Nine colours: one per listed province plus grey for 'Other provinces'.
    color_scale = alt.Scale(
                domain=color_domain,
                range=['#B2172A', '#E3191C','#FD8D3B', '#FED975', \
                       '#31A353', '#74C476', '#9ECAE0', '#2066AC', '#BDBDBD']
              )

    return alt.Chart(monthly).mark_bar(size=10).encode(
            x=alt.X('yearmonth(FATALITY_DATE):T', title=""),
            y=alt.Y('count:Q', sort=alt.SortOrder('ascending'), scale=alt.Scale(domain=[0, 3000]), title='Count of deaths'),
            color=alt.Color(
                'DEPLOYMENT_PROVINCE_1',
                scale=color_scale
            )
        ).properties(width=1200, height=250)

In [None]:
# Monthly deaths chart for the full dataset, coloured by the hand-picked provinces.
MONTHLY_DEATHS = plot_monthly_deaths(df, death_perc_trunc_sorted)

In [None]:
# Load the timeline events and export them unchanged as a JSON array of row objects.
# Note: plot_timeline later writes to the same JSON path.
df_timeline = pd.read_csv('./data/VietnamWarTimeline.csv')
df_timeline.to_json('./data/VietnamWarTimeline.json', orient='records')

In [None]:
def plot_timeline(df):
    """Build the event-timeline layer: one tick per month plus rotated event labels.

    ``df`` needs a ``Date`` column whose values match the "%Y %b" month keys
    generated below, and an ``Event`` column. The input frame is left as is;
    the month-padded table is written to ``./data/VietnamWarTimeline.json``.
    Returns the tick chart layered with the text chart.
    """
    # Months carrying an event are drawn in black, all others in white.
    tick_colors = alt.Scale(
        domain=['Yes', 'No'],
        range=["#000000", "#ffffff"]
    )

    month_labels = pd.date_range(start='1965 March', end='1973 March', freq='MS').strftime("%Y %b").tolist()
    year_months = pd.DataFrame({'FATALITY_DATE': month_labels})

    # Flag every supplied event, then pad to the full month range; months
    # without an event get plot='No' and an empty Event label.
    events = df.rename(columns={'Date': 'FATALITY_DATE'})
    events['plot'] = 'Yes'
    timeline = (
        pd.merge(year_months, events, on='FATALITY_DATE', how='left')
        .assign(
            FATALITY_DATE=lambda d: pd.to_datetime(d['FATALITY_DATE'], format='%Y %b'),
            plot=lambda d: d['plot'].fillna('No'),
            y=3000,  # constant height at which ticks and labels are drawn
        )
        .sort_values('FATALITY_DATE', ascending=True)
        .assign(Event=lambda d: d['Event'].fillna(''))
    )
    timeline.to_json('./data/VietnamWarTimeline.json', orient='records')

    def hidden_axes():
        # Fresh x/y encodings (both without axes) for each layer.
        return dict(
            x=alt.X('yearmonth(FATALITY_DATE):T', axis=None, title=None),
            y=alt.Y('y:Q', axis=None, title=None),
        )

    ticks = alt.Chart(timeline).mark_tick(dx=0, dy=0).encode(
        color=alt.Color('plot', scale=tick_colors, legend=None),
        **hidden_axes()
    ).properties(width=1200, height=250)

    # One label per distinct event text.
    labels = alt.Chart(timeline.drop_duplicates('Event', keep='first')).mark_text(
        angle=90, dx=10, dy=0, baseline='middle', align='left',
    ).encode(
        text='Event:O',
        **hidden_axes()
    ).properties(width=1200, height=250)

    return ticks + labels

In [None]:
 timeline = plot_timeline(df_timeline)

In [None]:
# Overlay the timeline on the monthly bars; colour and y scales are resolved
# independently so each layer keeps its own.
stacked = alt.layer(MONTHLY_DEATHS, timeline).resolve_scale(color='independent', y='independent')

In [None]:
# Render the monthly deaths chart with the timeline overlay.
stacked

In [None]:
# Render the timeline layer on its own.
timeline

In [23]:
# Convert two CSV files from the parent directory into JSON arrays of row objects.
# NOTE(review): results.csv / races.csv are not used anywhere else in this
# notebook — this cell looks like a leftover from another project; confirm
# before keeping it.
df_results = pd.read_csv('../results.csv')
df_results.to_json('../results.json', orient='records')

df_races = pd.read_csv('../races.csv')
df_races.to_json('../races.json', orient='records')