In [14]:
import numpy as np
import pandas as pd

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

from scipy import stats
import matplotlib.pyplot as plt

rng = np.random.default_rng(1)

In [15]:
df = pd.read_excel("Unit_three_takehome_data.xlsx")

Part A

In [16]:
arr = np.arange(1000)
rng.shuffle(arr)

a = arr[:500]
b = arr[500:]

adf = df.iloc[a].reset_index(drop = True)[["A improvement"]]
bdf = df.iloc[b].reset_index(drop = True)[["B improvement"]]

In [17]:
fig = make_subplots(rows=1, cols=2)
fig.add_trace(go.Histogram(x=adf["A improvement"], name = "Test Group A"), row=1, col=1)
fig.add_trace(go.Histogram(x=bdf["B improvement"], name = "Test Group B"), row=1, col=2)

fig.update_xaxes(title="Drug A Improvements", row=1, col=1)
fig.update_xaxes(title="Drug B Improvements", row=1, col=2)

fig.update_layout(title_text="Histogram Comparison of Cholestrol Improvement with Drugs A and B")

fig.show()

In [18]:
fig = make_subplots(rows=1, cols=1)
fig.add_trace(go.Box(x = adf["A improvement"], name = "Test Group A"), row=1, col=1)
fig.add_trace(go.Box(x = bdf["B improvement"], name = "Test Group B"), row=1, col=1)
fig.update_xaxes(title = "Cholestrol Improvement")
fig.update_layout(title = "Boxplot Comparison of Cholestrol Improvement with Drugs A and B")
fig.show()

In [19]:
print("Group A:")
print("Minimum: " + str(min(adf["A improvement"])))
print("Q1: " + str(np.quantile(adf["A improvement"], 0.25)))
print("Median: " + str(np.median(adf["A improvement"])))
print("Q3: " + str(np.quantile(adf["A improvement"], 0.75)))
print("Maximum: " + str(max(adf["A improvement"])))
print("IQR: " + str(np.quantile(adf["A improvement"], 0.75) - np.quantile(adf["A improvement"], 0.25)))
print()
print("Mean: " + str(round(np.mean(adf["A improvement"]),2)))
print("Standard Deviation: " + str(round(np.std(adf["A improvement"]),2)))
print("Variance: " + str(round(np.std(adf["A improvement"])**2,2)))
print("\n\n")

print("Group B:")
print("Minimum: " + str(min(bdf["B improvement"])))
print("Q1: " + str(np.quantile(bdf["B improvement"], 0.25)))
print("Median: " + str(np.median(bdf["B improvement"])))
print("Q3: " + str(np.quantile(bdf["B improvement"], 0.75)))
print("Maximum: " + str(max(bdf["B improvement"])))
print("IQR: " + str(np.quantile(bdf["B improvement"], 0.75) - np.quantile(bdf["B improvement"], 0.25)))
print()
print("Mean: " + str(round(np.mean(bdf["B improvement"]),2)))
print("Standard Deviation: " + str(round(np.std(bdf["B improvement"]),2)))
print("Variance: " + str(round(np.std(bdf["B improvement"])**2,2)))

Group A:
Minimum: -1.6249241394593525
Q1: 4.408448566934062
Median: 6.7455562487558325
Q3: 10.160069841006155
Maximum: 21.867855727877725
IQR: 5.751621274072093

Mean: 7.37
Standard Deviation: 4.1
Variance: 16.78



Group B:
Minimum: -3.7498423382376984
Q1: 4.427593002129434
Median: 7.07243069670217
Q3: 9.98439529585675
Maximum: 20.535538779810736
IQR: 5.556802293727316

Mean: 7.48
Standard Deviation: 4.19
Variance: 17.53


Part B

Female

In [20]:
fdf = df[df["Gender"] == "Female"].reset_index(drop = True)

farr = np.arange(len(fdf))
rng.shuffle(farr)

fa = farr[:int(len(fdf)//2)]
fb = farr[int(len(fdf)//2):]

fadf = fdf.iloc[fa].reset_index(drop = True)[["A improvement"]]
fbdf = fdf.iloc[fb].reset_index(drop = True)[["B improvement"]]

In [21]:
fig = make_subplots(rows=1, cols=2)
fig.add_trace(go.Histogram(x=fadf["A improvement"], name = "Test Group A"), row=1, col=1)
fig.add_trace(go.Histogram(x=fbdf["B improvement"], name = "Test Group B"), row=1, col=2)

fig.update_xaxes(title="Drug A Improvements", row=1, col=1)
fig.update_xaxes(title="Drug B Improvements", row=1, col=2)

fig.update_layout(title_text="Histogram Comparison of Cholestrol Improvement with Drugs A and B")

fig.show()

In [22]:
fig = make_subplots(rows=1, cols=1)
fig.add_trace(go.Box(x = fadf["A improvement"], name = "Test Group A"), row=1, col=1)
fig.add_trace(go.Box(x = fbdf["B improvement"], name = "Test Group B"), row=1, col=1)
fig.update_xaxes(title = "Cholestrol Improvement")
fig.update_layout(title = "Boxplot Comparison of Cholestrol Improvement with Drugs A and B")
fig.show()

In [23]:
print("Group A:")
print("Minimum: " + str(min(fadf["A improvement"])))
print("Q1: " + str(np.quantile(fadf["A improvement"], 0.25)))
print("Median: " + str(np.median(fadf["A improvement"])))
print("Q3: " + str(np.quantile(fadf["A improvement"], 0.75)))
print("Maximum: " + str(max(fadf["A improvement"])))
print("IQR: " + str(np.quantile(fadf["A improvement"], 0.75) - np.quantile(fadf["A improvement"], 0.25)))
print()
print("Mean: " + str(round(np.mean(fadf["A improvement"]),2)))
print("Standard Deviation: " + str(round(np.std(fadf["A improvement"]),2)))
print("Variance: " + str(round(np.std(fadf["A improvement"])**2,2)))
print("\n\n")

print("Group B:")
print("Minimum: " + str(min(fbdf["B improvement"])))
print("Q1: " + str(np.quantile(fbdf["B improvement"], 0.25)))
print("Median: " + str(np.median(fbdf["B improvement"])))
print("Q3: " + str(np.quantile(fbdf["B improvement"], 0.75)))
print("Maximum: " + str(max(fbdf["B improvement"])))
print("IQR: " + str(np.quantile(fbdf["B improvement"], 0.75) - np.quantile(fbdf["B improvement"], 0.25)))
print()
print("Mean: " + str(round(np.mean(fbdf["B improvement"]),2)))
print("Standard Deviation: " + str(round(np.std(fbdf["B improvement"]),2)))
print("Variance: " + str(round(np.std(fbdf["B improvement"])**2,2)))

Group A:
Minimum: -2.5773483516058207
Q1: 3.033267173944118
Median: 4.878781854425796
Q3: 6.517208576415108
Maximum: 12.433705432895607
IQR: 3.4839414024709896

Mean: 4.92
Standard Deviation: 2.59
Variance: 6.69



Group B:
Minimum: -2.605114919477842
Q1: 7.257268080014654
Median: 10.220852900297459
Q3: 13.211960257628192
Maximum: 19.955063769242827
IQR: 5.954692177613538

Mean: 10.26
Standard Deviation: 4.22
Variance: 17.83


In [24]:
mdf = df[df["Gender"] == "Male"].reset_index(drop = True)

marr = np.arange(len(mdf))
rng.shuffle(marr)

ma = marr[:int(len(mdf)//2)]
mb = marr[int(len(mdf)//2):]

madf = fdf.iloc[ma].reset_index(drop = True)[["A improvement"]]
mbdf = fdf.iloc[mb].reset_index(drop = True)[["B improvement"]]

In [25]:
fig = make_subplots(rows=1, cols=2)
fig.add_trace(go.Histogram(x=madf["A improvement"], name = "Test Group A"), row=1, col=1)
fig.add_trace(go.Histogram(x=mbdf["B improvement"], name = "Test Group B"), row=1, col=2)

fig.update_xaxes(title="Drug A Improvements", row=1, col=1)
fig.update_xaxes(title="Drug B Improvements", row=1, col=2)

fig.update_layout(title_text="Histogram Comparison of Cholestrol Improvement with Drugs A and B")

fig.show()

In [26]:
fig = make_subplots(rows=1, cols=1)
fig.add_trace(go.Box(x = madf["A improvement"], name = "Test Group A"), row=1, col=1)
fig.add_trace(go.Box(x = mbdf["B improvement"], name = "Test Group B"), row=1, col=1)
fig.update_xaxes(title = "Cholestrol Improvement")
fig.update_layout(title = "Boxplot Comparison of Cholestrol Improvement with Drugs A and B")
fig.show()

In [27]:
print("Group A:")
print("Minimum: " + str(min(madf["A improvement"])))
print("Q1: " + str(np.quantile(madf["A improvement"], 0.25)))
print("Median: " + str(np.median(madf["A improvement"])))
print("Q3: " + str(np.quantile(madf["A improvement"], 0.75)))
print("Maximum: " + str(max(madf["A improvement"])))
print("IQR: " + str(np.quantile(madf["A improvement"], 0.75) - np.quantile(madf["A improvement"], 0.25)))
print()
print("Mean: " + str(round(np.mean(madf["A improvement"]),2)))
print("Standard Deviation: " + str(round(np.std(madf["A improvement"]),2)))
print("Variance: " + str(round(np.std(madf["A improvement"])**2,2)))
print("\n\n")

print("Group B:")
print("Minimum: " + str(min(mbdf["B improvement"])))
print("Q1: " + str(np.quantile(mbdf["B improvement"], 0.25)))
print("Median: " + str(np.median(mbdf["B improvement"])))
print("Q3: " + str(np.quantile(mbdf["B improvement"], 0.75)))
print("Maximum: " + str(max(mbdf["B improvement"])))
print("IQR: " + str(np.quantile(mbdf["B improvement"], 0.75) - np.quantile(mbdf["B improvement"], 0.25)))
print()
print("Mean: " + str(round(np.mean(mbdf["B improvement"]),2)))
print("Standard Deviation: " + str(round(np.std(mbdf["B improvement"]),2)))
print("Variance: " + str(round(np.std(mbdf["B improvement"])**2,2)))

Group A:
Minimum: -2.5773483516058207
Q1: 3.3346605039934616
Median: 5.1079498755343025
Q3: 6.60849519554889
Maximum: 12.321611645889362
IQR: 3.273834691555429

Mean: 5.0
Standard Deviation: 2.36
Variance: 5.58



Group B:
Minimum: -2.605114919477842
Q1: 7.466144450665297
Median: 9.992595711220545
Q3: 12.686172934930653
Maximum: 19.535436302974887
IQR: 5.220028484265356

Mean: 10.06
Standard Deviation: 4.04
Variance: 16.36


In [28]:
rng2 = np.random.default_rng(2)
arr = np.arange(1000)
rng2.shuffle(arr)

a = arr[:500]
b = arr[500:]

adfd = df.iloc[a].reset_index(drop = True)[["A improvement"]]
bdfd = df.iloc[b].reset_index(drop = True)[["B improvement"]]

adfd.to_excel("1_a.xlsx")
bdfd.to_excel("1_b.xlsx")

fdfd = df[df["Gender"] == "Female"].reset_index(drop = True)

farrd = np.arange(len(fdfd))
rng2.shuffle(farrd)

fad = farrd[:int(len(fdfd)//2)]
fbd = farrd[int(len(fdfd)//2):]

fadfd = fdfd.iloc[fad].reset_index(drop = True)[["A improvement"]]
fbdfd = fdfd.iloc[fbd].reset_index(drop = True)[["B improvement"]]

fadfd.to_excel("f_a.xlsx")
fbdfd.to_excel("f_b.xlsx")

mdfd = df[df["Gender"] == "Male"].reset_index(drop = True)

marrd = np.arange(len(mdfd))
rng2.shuffle(marrd)

mad = marrd[:int(len(mdfd)//2)]
mbd = marrd[int(len(mdfd)//2):]

madfd = fdfd.iloc[mad].reset_index(drop = True)[["A improvement"]]
mbdfd = fdfd.iloc[mbd].reset_index(drop = True)[["B improvement"]]

madfd.to_excel("m_a.xlsx")
mbdfd.to_excel("m_b.xlsx")