## Observations and Insights 

### Note: in several places the comments in this notebook depart from the directions in the README.

Where the two conflict, I generally follow the comments in this notebook.

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# region
# Raise pandas' display limits (rows, columns, line width) for this notebook.
for option_name, option_value in (
        ('display.max_rows', 1000),
        ('display.max_columns', 500),
        ('display.width', 1000),
):
    pd.set_option(option_name, option_value)


def spaceless_lowers(dataframe):
    """
    Normalise a dataframe's column labels in place.

    :param dataframe: a dataframe with columns that have spaces and uppercase letters
    :return: the same dataframe object, with the spaces in its column labels replaced
             with _ and all caps made lowercase. Labels that are not strings are left
             unchanged.
    """
    # The previous `except NameError` handler could never fire: an undefined
    # dataframe raises at the call site, before this function body runs.
    dataframe.columns = [
        col.replace(' ', '_').lower() if isinstance(col, str) else col
        for col in dataframe.columns
    ]

    return dataframe


# endregion

mouse_metadata_path = 'pymaceuticals/data/Mouse_metadata.csv'
study_results_path = 'pymaceuticals/data/Study_results.csv'

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset.
# No `on=` is given, so pandas joins on the column names the two frames share.
combined_df = pd.merge(mouse_metadata, study_results)
# region
# Keep the original column labels around before they are normalised below.
mouse_cols_original = mouse_metadata.columns
study_cols_original = study_results.columns
combined_cols_original = combined_df.columns

df_list = [mouse_metadata, study_results, combined_df]

# spaceless_lowers renames the columns in place, so its return value is not needed.
for df in df_list:
    spaceless_lowers(df)

# Encode sex as 0 (Female) / 1 (Male).
# The result is assigned back to the column instead of calling
# replace(inplace=True) on `combined_df.sex`: an in-place change made through an
# attribute-accessed column does not reach the frame under pandas Copy-on-Write.
combined_df['sex'] = combined_df['sex'].replace(
        {
                'Female': 0,
                'Male': 1
        }
)

# endregion
# Display the data table for preview
combined_df.head()

In [2]:
# Checking the number of mice.
mouse_count = combined_df['mouse_id'].nunique()
print(mouse_count)

In [3]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.
is_repeated_row = combined_df.duplicated(subset=['mouse_id', 'timepoint'])
mice = combined_df.loc[is_repeated_row, 'mouse_id'].unique()

# Select the duplicated mice's rows directly with a boolean mask rather than
# dropping the index labels of every other row.
print(combined_df[combined_df['mouse_id'].isin(mice)].head(10))

In [4]:
# Optional: Get all the data for the duplicate mouse ID. 
# Interpreted as: every row belonging to a mouse that has a repeated
# (mouse_id, timepoint) pair.
optional_df = combined_df[combined_df['mouse_id'].isin(mice)]
print(optional_df)

In [5]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# Every row of a mouse listed in `mice` is excluded, not only the repeated rows.
# .copy() gives an independent frame, as DataFrame.drop did.
cleaned_df = combined_df[~combined_df['mouse_id'].isin(mice)].copy()

In [6]:
# Checking the number of mice in the clean DataFrame.
clean_mouse_count = cleaned_df['mouse_id'].nunique()
print(clean_mouse_count)

## Summary Statistics

In [7]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe.
# Names of the aggregations to compute, in the order they appear in the summary table.
stats_to_get = ['mean', 'median', 'var', 'std', 'sem']


# region
def the_mean(gbo):
    """
    Return the result of ``gbo.mean()``.

    :param gbo: any object with a ``mean`` method; in this notebook it is called
                with a pandas GroupBy object.
    :return: whatever ``gbo.mean()`` returns.
    """
    return gbo.mean()


def the_median(gbo):
    """
    Return the result of ``gbo.median()``.

    :param gbo: any object with a ``median`` method; in this notebook it is called
                with a pandas GroupBy object.
    :return: whatever ``gbo.median()`` returns.
    """
    return gbo.median()


def the_var(gbo):
    """
    Return the result of ``gbo.var()``.

    :param gbo: any object with a ``var`` method; in this notebook it is called
                with a pandas GroupBy object.
    :return: whatever ``gbo.var()`` returns.
    """
    return gbo.var()


def the_std(gbo):
    """
    Return the result of ``gbo.std()``.

    :param gbo: any object with a ``std`` method; in this notebook it is called
                with a pandas GroupBy object.
    :return: whatever ``gbo.std()`` returns.
    """
    return gbo.std()


def the_sem(gbo):
    """
    Return the result of ``gbo.sem()``.

    :param gbo: any object with a ``sem`` method; in this notebook it is called
                with a pandas GroupBy object.
    :return: whatever ``gbo.sem()`` returns.
    """
    return gbo.sem()


# endregion

# Group once by regimen; `stats_table` stays a whole-frame GroupBy because the
# aggregation cell below reuses it.
stats_table = cleaned_df.groupby('drug_regimen')

fcns = [the_mean, the_median, the_var, the_std, the_sem]
names = ['mean', 'median', 'var', 'stdev', 'sem']
names = [name.title() if name != 'sem' else name.upper() for name in names]
fcn_names = dict(zip(names, fcns))

# Pick the tumor-volume column BEFORE computing each statistic. Computing the
# statistic over every column and selecting afterwards raises TypeError on
# pandas >= 2.0 as soon as any non-key column is non-numeric, and is wasted work
# even where it succeeds.
tumor_volume_by_regimen = stats_table['tumor_volume_(mm3)']

the_long_way = {
    stat_name: stat_fcn(tumor_volume_by_regimen)
    for stat_name, stat_fcn in fcn_names.items()
}

# is rounding ok?
# The index is labelled with the second original metadata column name.
df_long_way = pd.DataFrame(the_long_way).rename_axis(mouse_cols_original[1]).round(3)
print(df_long_way)

Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen.

Using the aggregation method, produce the same summary statistics in a single line.


In [None]:
# Select the tumor-volume column first, then aggregate: aggregating every column
# and selecting afterwards raises TypeError on pandas >= 2.0 when any non-key
# column is non-numeric.
why_couldnt_i_do_this_earlier = stats_table['tumor_volume_(mm3)'].agg(stats_to_get)

# Title-case the column labels, except 'sem', which is upper-cased.
why_couldnt_i_do_this_earlier.columns = [
    col.title() if col != 'sem' else col.upper()
    for col in why_couldnt_i_do_this_earlier.columns
]
# Assemble the resulting series into a single summary dataframe.
print(why_couldnt_i_do_this_earlier)

## Bar and Pie Charts

In [9]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.
# the default backend is plt, so these next things are essentially the same.
# pd.options.plotting.backend = 'plotly'

# Built from cleaned_df so the mouse removed during cleaning is not counted.
gbo_drugs = cleaned_df.groupby('drug_regimen')['mouse_id']

# count() gives one value per regimen: its number of rows, i.e. recorded
# timepoints. The y-axis is labelled accordingly.
gbo_drugs.count().plot(
        kind='bar',
        title='Total Timepoints for all Mice in each Regimen',
        figsize=(10, 9),
        xlabel='Drug Used',
        ylabel='Number of Timepoints',
        rot=45
)

# Fit the rotated tick labels inside the figure before it is written to disk.
plt.tight_layout()

plt.savefig('gbo_drugs_pd.png')

plt.show()

In [10]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.
# Take the labels and the heights from the same Series so they cannot fall out
# of step with each other.
timepoint_counts = gbo_drugs.count()
x, y = timepoint_counts.index, timepoint_counts

fig, ax = plt.subplots(figsize=(10, 9))

plot1 = ax.bar(
        x,
        height=y,
        label='Drug Regimen'
)

ax.tick_params(axis='x', labelrotation=45)

ax.set_xlabel('Drug Regimen')
# Each bar is a row (timepoint) count, so label it as such.
ax.set_ylabel('Number of Timepoints')
ax.set_title('Total Timepoints for all Mice in each Regimen')

fig.tight_layout()

fig.savefig('gbo_drugs_plt.png')
plt.show()

In [11]:
# Group the cleaned data by sex (encoded 0 = Female, 1 = Male when the data was
# loaded) so the mouse removed during cleaning is not counted.
gbo_sex = cleaned_df.groupby('sex')
fig, ax = plt.subplots(figsize=(6, 6))
# Generate a pie plot showing the distribution of female versus male mice using Pandas
# Each wedge is the number of rows recorded for that sex.
gbo_sex.count().plot(
        kind='pie',
        y='mouse_id',
        title='Male vs Female Mice',
        ax=ax,
        ylabel='Count of Sex',
        labels=['Female', 'Male'],
        autopct='%2.1f%%'
)

ax.legend(['Female', 'Male'])
plt.tight_layout()
plt.savefig('gbo_sex_pd.png')
plt.show()



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
labels = ['Female', 'Male']
# One row count per sex group.
sizes = gbo_sex['sex'].count()

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
        sizes,
        labels=labels,
        autopct='%3.1f%%',
        textprops={'fontsize': 14},
)
ax.legend()

fig.tight_layout()
fig.savefig('gbo_sex_plt.png')
plt.show()


## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

best_drugs = 'Capomulin Ramicane Infubinol Ceftamin'.split()

# Start by getting the last (greatest) timepoint for each mouse
# (taken from cleaned_df so the mouse removed during cleaning stays excluded).
last_timepoints = (
    cleaned_df[cleaned_df['drug_regimen'].isin(best_drugs)]
    .groupby(['drug_regimen', 'mouse_id'], as_index=False)['timepoint']
    .max()
)

# Merge this group df with the original dataframe to get the tumor volume at the last timepoint
# A key-based merge is used rather than pd.concat: concatenating a three-column
# per-mouse frame onto cleaned_df only appends rows whose other columns are NaN,
# which dropna then throws away again, leaving every timepoint in the statistics
# instead of just the final one.
concat_df = pd.merge(
    last_timepoints,
    cleaned_df,
    on=['drug_regimen', 'mouse_id', 'timepoint'],
    how='left',
)
cconc = concat_df.dropna()

# Final tumor volume per (regimen, mouse), kept under the names this cell has
# always exported.
gbo_best_drugs = cconc.set_index(['drug_regimen', 'mouse_id'])['tumor_volume_(mm3)']
best_drugs_df = gbo_best_drugs.reset_index()

tvol_gbo = cconc.groupby('drug_regimen')

# Lower and upper quartile of the final tumor volume for each regimen.
quants = tvol_gbo['tumor_volume_(mm3)'].quantile([0.25, 0.75])

In [14]:
# Put treatments into a list for for loop (and later for plot labels)
# Create empty list to fill with tumor vol data (for plotting)
# Calculate the IQR and quantitatively determine if there are any potential outliers.
# Locate the rows which contain mice on each drug and get the tumor volumes

# One list of tumor volumes per regimen, indexed by regimen name.
list_tvol = cconc.groupby('drug_regimen')['tumor_volume_(mm3)'].apply(list)

# `quants` holds the 0.25 and 0.75 quantiles per regimen, in that order, so
# last - first is the interquartile range.
quant_gbo = quants.groupby('drug_regimen')
iqrs = quant_gbo.last() - quant_gbo.first()
print(iqrs)

# One randomly drawn mouse id per regimen. Stored under its own name: this line
# used to rebind `mice`, which the duplicate-handling cells above rely on, so
# re-running those cells afterwards operated on the wrong ids.
sampled_mice = tvol_gbo.mouse_id.sample(1)

# Determine outliers using upper and lower bounds
d = {}
for drug in iqrs.index:
    drug_quants = quants.loc[drug]
    lbound = drug_quants.loc[0.25] - 1.5 * iqrs[drug]
    ubound = drug_quants.loc[0.75] + 1.5 * iqrs[drug]

    d[drug] = [lbound, ubound]

    # Volumes outside [lbound, ubound] are the potential outliers.
    outliers = [vol for vol in list_tvol[drug] if vol < lbound or vol > ubound]

    print(
            f'{drug}:\n'
            f'Lowerbound: {round(d[drug][0], 3)}\n'
            f'Upperbound: {round(d[drug][1], 3)}\n'
            f'Potential outliers: {outliers}\n\n'
    )

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
fig, ax = plt.subplots()

# list_tvol is ordered by its sorted regimen index, which is not the order of
# best_drugs. Look each regimen up by name so every box sits under its own label.
box_data = [list_tvol[drug] for drug in best_drugs]

ax.boxplot(box_data)
# Label the ticks afterwards rather than through boxplot's `labels` keyword,
# which newer matplotlib releases rename to `tick_labels`.
ax.set_xticklabels(best_drugs)

ax.set_title('Tumor Volume Based on Drug Regimen')
ax.set_xlabel('Drug Regimen')
ax.set_ylabel('Tumor Volume (mm³)')

plt.savefig('boxplot.png')
plt.show()

## Line and Scatter Plots

In [16]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
