## Observations and Insights

## Dependencies and starter code

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import matplotlib as mpl
import numpy as np

#------- Plot styling: use a larger default font size on every figure
mpl.rcParams.update({'font.size': 14})

#------- Show floats with thousands separators and two decimals
pd.options.display.float_format = '{:,.2f}'.format

# Read the mouse data and the study results straight from their CSV files
mouse_metadata = pd.read_csv("data/Mouse_metadata.csv")
study_results = pd.read_csv("data/Study_results.csv")

# Combine both tables into a single dataset, matching rows on 'Mouse ID'.
# The suffixes only apply to column names present in both tables.
dataset = pd.merge(
    mouse_metadata,
    study_results,
    on='Mouse ID',
    suffixes=('_metadata', '_result'),
)

# Preview the first rows of the combined data
display(dataset.head())

## Summary statistics

In [None]:
# Summary statistics of the tumor volume for each drug regimen:
# mean, median, variance, standard deviation and SEM.
#
# Only the tumor-volume column is aggregated. The combined frame also holds
# label columns such as 'Mouse ID' and 'Sex' (presumably text -- confirm
# against the CSVs); recent pandas versions raise instead of silently
# dropping such columns when asked for a mean/median/var/std.
# The double brackets keep every result a one-column DataFrame.
tumor_by_regimen = dataset.groupby('Drug Regimen')[['Tumor Volume (mm3)']]

stats = tumor_by_regimen.describe()
mean = tumor_by_regimen.mean()
median = tumor_by_regimen.median()
variance = tumor_by_regimen.var()
stdeviation = tumor_by_regimen.std()
sem = tumor_by_regimen.sem()

display(mean)
display(median)
display(variance)
display(stdeviation)
display(sem)

## Bar plots

In [None]:
# Bar plot (pandas plotting API) of the number of data points per regimen

# Count the dataset rows that belong to each drug regimen
regimen = dataset.groupby('Drug Regimen')['Drug Regimen'].count()

# Draw the bars and keep the axes so it can be formatted directly
bar_ax = regimen.plot(kind='bar', rot=45, color='#D8BFD8')

#------ Formatting ------
bar_ax.set_title('Mice used in each Drug Regimen')
bar_ax.set_ylabel('Number of Mice')
plt.show()


In [None]:
# Bar plot (pyplot API) of the number of data points per regimen

# Turn the per-regimen counts into a two-column frame:
# 'Drug Regimen' (from the index) and 'Number of Mice' (the counts)
py_drug = regimen.to_frame(name='Number of Mice').reset_index()

plt.bar(
    py_drug['Drug Regimen'],
    py_drug['Number of Mice'],
    color='#87CEFA',
)

#------ Formatting ------
plt.title('Mice used in each Drug Regimen')
plt.xlabel('Drug Regimen')
plt.ylabel('Number of Mice')
plt.xticks(rotation=45)
plt.show()

## Pie plots

In [None]:
# Pie plot (pandas plotting API) of the distribution of female versus male mice

#------ Variables for the pie chart
# The colours are a list, not a set: a set has no defined order, so which
# wedge received which colour could change from one run to the next.
colors = ['#99ff99', '#ffcc99']
# Pull the first wedge slightly out of the pie
explode = (0.05, 0)

#------ Plotting: one wedge per value of 'Sex', sized by its row count
dataset['Sex'].value_counts().plot.pie(autopct='%1.1f%%', figsize=(5, 5),
                                       colors=colors, startangle=80,
                                       explode=explode, shadow=True)
#------ Formatting
plt.axis('off')
plt.title('Distribution of the Mice sex')
plt.tight_layout()
plt.show()

In [None]:
# Pie plot (pyplot API) of the distribution of female versus male mice

#------ Creating a DataFrame with the row count for each sex
# The frame is built with explicit column names. The column names produced
# by value_counts().reset_index() differ between pandas versions, so
# looking up 'index' / 'Sex' on that result is not portable.
sex_counts = dataset['Sex'].value_counts()
gender = pd.DataFrame({'Sex': sex_counts.index, 'Count': sex_counts.values})

#------ Creating the parameters for the pie chart
labels = gender['Sex']
sizes = gender['Count']
# A list keeps the colour order fixed; a set literal has no defined order
colors = ['#ff9999', '#66b3ff']
# Pull the first wedge slightly out of the pie
explode = (0.05, 0)

#------ Plotting
plt.pie(sizes, labels=labels, autopct='%1.1f%%', colors=colors, startangle=80,
        explode=explode, shadow=True)

#---- Formatting
plt.title('Distribution of the Mice sex')
plt.show()


## Quartiles, outliers and boxplots

To find the final tumor volume for each mouse in the four most promising treatments, we first need to identify which treatments those are:

--> First, the total tumor volume is computed for each Drug Regimen.
--> From these totals, the mean and the standard deviation are calculated.
--> These two values define a range (mean ± one standard deviation) in which the most promising treatments are located;
    the treatments inside that range are then sorted in ascending order.
    
   

In [None]:
# Select the regimens whose total tumor volume lies strictly within one
# standard deviation of the mean total, sorted from smallest to largest.

#------ total tumor volume per regimen, as a two-column frame
finalvol = (dataset.groupby('Drug Regimen')['Tumor Volume (mm3)']
            .sum()
            .reset_index())
totals = finalvol['Tumor Volume (mm3)']

#------ mean of the totals
meanfinalvol = totals.mean()
#------ standard deviation of the totals
stdfinalvol = totals.std()
#------ upper limit: mean plus one standard deviation
st1 = meanfinalvol + stdfinalvol
#------ lower limit: mean minus one standard deviation
st2 = meanfinalvol - stdfinalvol

#------ keep only the regimens strictly between both limits
inside_range = (totals > st2) & (totals < st1)
promising = finalvol[inside_range].copy()

#------ index by regimen and sort by total volume (ascending)
promising = (promising.set_index('Drug Regimen')
             .sort_values('Tumor Volume (mm3)'))

display(promising)

With the four most promising treatments known, the total tumor volume is calculated for each mouse used in those treatments.

In [None]:
#------ Names of the first four regimens of the sorted "promising" table
four = promising.index[:4].tolist()

#------ Total tumor volume for every (regimen, mouse) pair
mice = (dataset.groupby(['Drug Regimen', 'Mouse ID'])['Tumor Volume (mm3)']
        .sum()
        .to_frame())

#------ Keep only the rows whose regimen is one of the four selected
regimen_level = mice.index.get_level_values('Drug Regimen')
total_mouse = mice[regimen_level.isin(four)]

display(total_mouse)


### Determine the quartiles and IQR, and quantitatively determine whether there are any potential outliers across all four treatment regimens.

In [None]:
#---- Ramicane drug: quartiles, IQR and outlier bounds of the tumor volume
# The rows come from the first regimen in `four`; the printed text assumes
# that regimen is Ramicane.
ramicane = dataset.loc[dataset['Drug Regimen'] == four[0]]

rquartiles = ramicane['Tumor Volume (mm3)'].quantile([0.25, 0.5, 0.75])
rlowerq = round(rquartiles.loc[0.25], 2)
rupperq = round(rquartiles.loc[0.75], 2)
rmedian = round(rquartiles.loc[0.5], 2)
# The IQR is derived from the already-rounded quartiles
riqr = round(rupperq - rlowerq, 2)

# Values beyond 1.5 * IQR from the quartiles are flagged as possible outliers
rlower_bound = round(rlowerq - (1.5 * riqr), 2)
rupper_bound = round(rupperq + (1.5 * riqr), 2)

ramicane_report = [
    "Ramicane",
    "--------------------------------------------------------------",
    f"The lower quartile of Ramicane is: {rlowerq}",
    f"The upper quartile of Ramicane is: {rupperq}",
    f"The interquartile range of Ramicane is: {riqr}",
    f"The the median of Ramicane is: {rmedian} ",
    f"Values below {rlower_bound} could be outliers.",
    f"Values above {rupper_bound} could be outliers.",
]
print("\n".join(ramicane_report))


In [None]:
#---- Capomulin drug: quartiles, IQR and outlier bounds of the tumor volume
# The rows come from the second regimen in `four`; the printed text assumes
# that regimen is Capomulin.
capomulin = dataset[dataset['Drug Regimen']==four[1]]

caquartiles = capomulin['Tumor Volume (mm3)'].quantile([0.25,0.5,0.75])
calowerq = round(caquartiles[0.25],2)
caupperq = round(caquartiles[0.75],2)
# The IQR is derived from the already-rounded quartiles
caiqr = round(caupperq - calowerq,2)

print(f"The lower quartile of Capomulin is: {calowerq}")
print(f"The upper quartile of Capomulin is: {caupperq}")
print(f"The interquartile range of Capomulin is: {caiqr}")
print(f"The the median of Capomulin is: {round(caquartiles[0.5],2)} ")

# The bounds must use Capomulin's own IQR (caiqr). Using another regimen's
# IQR here would make this cell depend on a different cell and give wrong
# bounds.
calower_bound = round(calowerq - (1.5*caiqr),2)
caupper_bound = round(caupperq + (1.5*caiqr),2)
print(f"\nValues below {calower_bound} could be outliers.")
print(f"Values above {caupper_bound} could be outliers.")


In [None]:
#---- Ceftamin drug: quartiles, IQR and outlier bounds of the tumor volume
# The rows come from the third regimen in `four`; the printed text assumes
# that regimen is Ceftamin.
ceftamin = dataset.loc[dataset['Drug Regimen'] == four[2]]

cequartiles = ceftamin['Tumor Volume (mm3)'].quantile([0.25, 0.5, 0.75])
celowerq = round(cequartiles.loc[0.25], 2)
ceupperq = round(cequartiles.loc[0.75], 2)
cemedian = round(cequartiles.loc[0.5], 2)
# The IQR is derived from the already-rounded quartiles
ceiqr = round(ceupperq - celowerq, 2)

# Values beyond 1.5 * IQR from the quartiles are flagged as possible outliers
celower_bound = round(celowerq - (1.5 * ceiqr), 2)
ceupper_bound = round(ceupperq + (1.5 * ceiqr), 2)

ceftamin_report = [
    f"The lower quartile of Ceftamin is: {celowerq}",
    f"The upper quartile of Ceftamin is: {ceupperq}",
    f"The interquartile range of Ceftamin is: {ceiqr}",
    f"The the median of Ceftamin is: {cemedian} ",
    f"\nValues below {celower_bound} could be outliers.",
    f"Values above {ceupper_bound} could be outliers.",
]
print("\n".join(ceftamin_report))

In [None]:
#---- Infubinol drug: quartiles, IQR and outlier bounds of the tumor volume
# The rows come from the fourth regimen in `four`; the printed text assumes
# that regimen is Infubinol.
infubinol = dataset[dataset['Drug Regimen']==four[3]]

inquartiles = infubinol['Tumor Volume (mm3)'].quantile([0.25,0.5,0.75])
inlowerq = round(inquartiles[0.25],2)
inupperq = round(inquartiles[0.75],2)
# The IQR is derived from the already-rounded quartiles
iniqr = round(inupperq - inlowerq,2)

# The report names Infubinol, the regimen this cell analyses. Labelling
# these figures with another regimen's name would misattribute them.
print(f"The lower quartile of Infubinol is: {inlowerq}")
print(f"The upper quartile of Infubinol is: {inupperq}")
print(f"The interquartile range of Infubinol is: {iniqr}")
print(f"The the median of Infubinol is: {round(inquartiles[0.5],2)} ")

# Values beyond 1.5 * IQR from the quartiles are flagged as possible outliers
inlower_bound = round(inlowerq - (1.5*iniqr),2)
inupper_bound = round(inupperq + (1.5*iniqr),2)
print(f"\nValues below {inlower_bound} could be outliers.")
print(f"Values above {inupper_bound} could be outliers.")

### Box and whisker plot of the final tumor volume for all four treatment regimens

In [None]:
#------ Defining the variables to use in the boxplot:
#------ the tumor volume column of each of the four regimen subsets
rami = ramicane['Tumor Volume (mm3)']
capo = capomulin['Tumor Volume (mm3)']
cefta = ceftamin['Tumor Volume (mm3)']
infu = infubinol['Tumor Volume (mm3)']

#------ Figure and axes
fig1, ax5 = plt.subplots()
ax5.set_title('Drug Treatment')
ax5.set_ylabel('Volumen')

#------ Marker style for the outlier points
flierprops = dict(marker='o', markerfacecolor='#A30E64', markersize=8,
                  linestyle='none', markeredgecolor='#F7EEF9')

# Draw the boxes first so the x axis has one tick per box, then name those
# ticks. Setting the labels before the boxes exist attaches them to the
# default ticks of the empty axes, which matplotlib may warn about and which
# only lines up with the boxes by relying on boxplot internals.
# The box order matches the order of `four` (indexes 0..3).
ax5.boxplot([rami,capo,cefta,infu], flierprops=flierprops, notch=True)
ax5.set_xticklabels(four)


plt.show()

## Line and scatter plots

In [None]:
# Line plot of time point versus tumor volume for one mouse treated with Capomulin

#------ IDs of the mice in the Capomulin data, in order of first appearance
cap_mouse = capomulin['Mouse ID'].unique().tolist()

#------ Choose the mouse to plot by its position in cap_mouse;
#------ change the index to inspect a different mouse
mouse_plot = cap_mouse[1]

#------ Rows that belong to the chosen mouse
cap_data = capomulin[capomulin['Mouse ID'] == mouse_plot]

#------ Recorded time points, used as the x-axis tick positions
x_axis = cap_data['Timepoint'].tolist()

#------ Plot time versus tumor volume for the chosen mouse
line_ax = cap_data.plot.line(
    x='Timepoint',
    y='Tumor Volume (mm3)',
    legend=False,
    color='#660033',
)

#------ Formatting the plot
line_ax.set_ylabel('Tumor Volume (mm3)')
line_ax.set_xticks(x_axis)
line_ax.legend(['Tumor Volumen change in time'], loc='best')
line_ax.set_title(f'Timepoint vs Tumor Volume for mouse: {mouse_plot}')

plt.show()


In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

# Average tumor volume at each recorded weight, as a two-column frame
w_t = (pd.DataFrame(capomulin.groupby('Weight (g)')['Tumor Volume (mm3)'].mean())).reset_index()
X = w_t['Weight (g)']

# Fixing random state for reproducibility: the point colours come from a
# locally seeded generator, so the figure looks the same on every run and
# the global NumPy random state is left untouched.
color_rng = np.random.RandomState(19680801)
w_t.plot('Weight (g)', 'Tumor Volume (mm3)', kind = 'scatter', s=40,
         c=color_rng.rand(len(X),3), alpha = 0.65)
plt.show()


In [None]:
# Calculate the correlation coefficient and linear regression model 
#for mouse weight and average tumor volume for the Capomulin regimen