# Van's RV Mean and Median Flight Hours

## Statistics help from:
https://www.dummies.com/education/math/statistics/how-to-compare-two-population-proportions/

## Imports

In [4]:
import math
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind

plt.style.use('seaborn')

## Load aircraft data

In [5]:
all_df = pd.read_csv('../data/aircraft.csv')
exp_df = all_df.dropna(subset=['acft_model'])  # [['']]

## Load pilot data

In [6]:
pilot_df = pd.read_csv('../data/flight_time.csv')
pilot_df = pilot_df[pilot_df['flight_type'].str.lower().str.strip().str.contains(pat='totl')]
pilot_df = pilot_df[pilot_df['flight_craft'].str.lower().str.strip().str.contains(pat='all')]
pilot_df = pilot_df[pilot_df['crew_no'] == 1]
pilot_df = pilot_df[['ev_id', 'flight_hours']]

## Load findings data

In [7]:
findings_df = pd.read_csv('../data/Findings.csv')

## Join the RV and pilot dataframes

In [8]:
vans_df = exp_df[exp_df['acft_model'].str.lower().str.strip().str.contains(pat='rv')]
vans_df = vans_df.merge(pilot_df, on='ev_id')[['acft_model', 'flight_hours', 'ev_id']]

## Find the incidents per model

In [9]:
rv6 = vans_df[vans_df['acft_model'].str.contains(pat='6')]
rv6a = rv6[rv6['acft_model'].str.upper().str.contains(pat='6A')]
rv6 = rv6[~rv6['acft_model'].str.upper().str.contains(pat='6A')]

## Find incidents involving the matter of "control"

In [10]:
control_df = findings_df[findings_df['finding_description'].str.lower().str.strip().str.contains(pat='control')][['ev_id']]
control_df = control_df['ev_id']
control_df = control_df.drop_duplicates()

## Distinguish control events between models

In [11]:
rv6_control = rv6.merge(control_df, on='ev_id')[['acft_model', 'flight_hours', 'ev_id']]
rv6a_control = rv6a.merge(control_df, on='ev_id')[['acft_model', 'flight_hours', 'ev_id']]

## Null Hypothesis:
RV-6 pilot miscontrol rate is **equal** than the RV-6A pilot miscontrol rate.

## Alternative Hyptothesis:
RV-6 pilot miscontrol rate is **greater** than the RV-6A pilot miscontrol rate.

## Calculating the proportions

### rho_1 is the RV-6

In [12]:
p_1 = len(rv6_control.index)
n_1 = len(rv6.index)
rho_hat_1 = p_1 / n_1
rho_hat_1

0.22807017543859648

### rho_2 is the RV-6A

In [13]:
p_2 = len(rv6a_control.index)
n_2 = len(rv6a.index)
rho_hat_2 = p_2 / n_2
rho_hat_2

0.2283464566929134

## Caclulating the standard deviation

In [14]:
rho_hat = (p_1 + p_2) / (n_1 + n_2)
std_err = math.sqrt(rho_hat * (1 - rho_hat) * ((1 / n_1) + (1 / n_2)))
z = (p_1 - p_2) / std_err

In [15]:
z

203.4181373892845

## Reject the null hypothesis!