In [None]:
#Import the relevant libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm 
import statsmodels.formula.api as smf


In [None]:
# Load the clinical trial data from the CSV file in the working directory.
df = pd.read_csv('clinical_data.csv')

# Quick overview of the raw data: first rows, dimensions, column dtypes /
# non-null counts, and summary statistics.
print(df.head())
print(f"Dataframe shape: {df.shape}")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None".
df.info()
print(df.describe())

In [None]:
# Tidy the dataset by removing duplicate rows and rows with missing values.
# The counts are reported first so the effect of the clean-up is visible.
print(f"Missing values: {df.isnull().sum()}")
print(f"Duplicate rows: {df.duplicated().sum()}")
# NOTE(review): dropna() discards a row when ANY column is empty. If some
# column is blank by design for part of the rows, those rows are lost as
# well -- confirm against the CSV before relying on the cleaned frame.
df = df.drop_duplicates().dropna()

In [None]:
# Encode the study arm as a 0/1 indicator column and check the result.
df['group'] = df['group'].astype('category')
# drop_first=True drops the first category level as the reference, leaving one
# indicator column per remaining level. dtype=int forces 0/1 integers instead
# of the boolean columns that newer pandas versions produce by default.
df = pd.get_dummies(df, columns=['group'], drop_first=True, dtype=int)
# The rest of the notebook reads the indicator as 'group_treatment'; fail here
# with a clear message if the level names in the data produce another name.
assert 'group_treatment' in df.columns, (
    f"expected a 'group_treatment' column, got: {list(df.columns)}"
)
print(df.head())
print(df.columns)

In [None]:
# Make sure every column used in modelling has a numeric dtype.
# errors='coerce' turns entries that cannot be parsed into NaN rather than
# raising.
df = df.assign(
    **{
        column: pd.to_numeric(df[column], errors='coerce')
        for column in ('group_treatment', 'hba1c', 'time')
    }
)

In [None]:
# Mean Hba1c per study arm (group_treatment: 1 = treatment, 0 = reference arm).
avg_hba1c = df.loc[df['group_treatment'] == 1, 'hba1c'].mean()
avg_placebo = df.loc[df['group_treatment'] == 0, 'hba1c'].mean()
print(f"Average Hba1c for treatment group: {avg_hba1c}")
print(f"Average Hba1c for placebo group: {avg_placebo}")

# Change in Hba1c relative to each subject's baseline reading.
# The baseline is the reading at the subject's earliest time point; rows are
# ordered by time first so the result does not depend on the row order of the
# file. The stable sort keeps file order among rows with equal time.
baseline_hba1c = (
    df.sort_values('time', kind='stable')
      .groupby('subject_id')['hba1c']
      .transform(lambda readings: readings.iloc[0])
)
# Subtraction aligns on the row index, so df keeps its original row order.
df['hba1c_change'] = df['hba1c'] - baseline_hba1c

In [None]:
# Tally how often each type of adverse event occurs (most frequent first).
event_column = df['adverse_event']
adverse_events = event_column.value_counts()
print(adverse_events)

In [None]:
# Visualise Hba1c over time, one line per value of group_treatment.
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=df, x='time', y='hba1c', hue='group_treatment', ax=ax)
ax.set_title('Hba1c Levels over Time')
ax.set_xlabel('Time (weeks)')
plt.show()

In [None]:
# Box plot of final Hba1c levels: treatment vs placebo.
# "Final" means each subject's last recorded time point, so the frame is
# restricted to those rows; plotting every row would mix all visits together.
last_visit = df.groupby('subject_id')['time'].transform('max')
final_df = df[df['time'] == last_visit]

plt.figure(figsize=(8,6))
sns.boxplot(x='group_treatment', y='hba1c', hue='group_treatment', data=final_df)
plt.title('Final Hba1c Levels: Treatment vs Placebo')
plt.xlabel('Group')
plt.ylabel('Hba1c Level')
plt.show()

In [None]:
# Bar chart of the adverse event counts computed earlier in the notebook.
fig, ax = plt.subplots(figsize=(12, 6))
adverse_events.plot(kind='bar', ax=ax)
ax.set_title('Most Common Adverse Events')
ax.set_xlabel('Adverse Event')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Scatter plot of initial Hba1c against the change in Hba1c, coloured by
# group_treatment.
# NOTE(review): 'initial_hba1c' is not created anywhere in the visible cells;
# presumably it is a column of the CSV -- confirm.
plt.figure(figsize=(10,6))
sns.scatterplot(x='initial_hba1c', y='hba1c_change', hue='group_treatment', data=df)
plt.title('Initial Hba1c vs Change in Hba1c')
plt.xlabel('Initial Hba1c')
plt.ylabel('Change in Hba1c')
plt.show()

In [None]:
# Independent two-sample t-test on Hba1c (treatment rows vs placebo rows),
# followed by a printout of the statistic and p-value.
# Missing readings are dropped before testing: the numeric coercion earlier in
# the notebook can introduce NaN, and ttest_ind propagates NaN by default,
# which would turn both the statistic and the p-value into nan.
# NOTE(review): every row of each arm is used, so a subject measured at
# several time points contributes several observations -- confirm intended.
treatment_hba1c = df.loc[df['group_treatment'] == 1, 'hba1c'].dropna()
placebo_hba1c = df.loc[df['group_treatment'] == 0, 'hba1c'].dropna()
t_stat, p_value = stats.ttest_ind(treatment_hba1c, placebo_hba1c)
print(f"T-test result: t-stat = {t_stat}, p-value = {p_value}")
                                

In [None]:
# ANOVA via the statsmodels formula API: fit an ordinary least squares model
# of hba1c on time and treatment indicator, then print the type-2 ANOVA table.
anova_formula = 'hba1c ~ time + group_treatment'
model = smf.ols(formula=anova_formula, data=df).fit()
aov_table = sm.stats.anova_lm(model, typ=2)
print(aov_table)

In [None]:
# Build a contingency table of adverse events by treatment group
# (rows: group_treatment, columns: adverse_event).
adverse_event_table = pd.crosstab(
    index=df['group_treatment'],
    columns=df['adverse_event'],
)

# Chi-squared test of independence on that table, comparing adverse event
# rates between the groups. correction=False disables the Yates continuity
# correction.
chi2, p, dof, expected = stats.chi2_contingency(
    adverse_event_table,
    correction=False,
)
print(f"Chi_squared test result:  chi2= {chi2}, p-value = {p}")

In [None]:
# Derive an age_group column from age unless the dataset already provides one.
if 'age_group' not in df.columns:
    # Right-closed bins: (0, 30] -> Young, (30, 60] -> Middle-aged,
    # (60, inf) -> Old. The top bin is open-ended so that participants older
    # than 90 are labelled 'Old' instead of silently becoming NaN.
    df['age_group'] = pd.cut(
        df['age'],
        bins=[0, 30, 60, float('inf')],
        labels=['Young', 'Middle-aged', 'Old'],
    )

In [None]:
# Box plot of Hba1c by age group, split by treatment group.
# The title and y-label describe what is actually drawn: raw hba1c values
# (not a reduction), grouped by treatment indicator (not gender).
sns.boxplot(x='age_group', y='hba1c', hue='group_treatment', data=df)
plt.title('Hba1c by Age Group and Treatment Group')
plt.xlabel('Age Group')
plt.ylabel('Hba1c Level')
plt.show()


In [67]:
# Correlation between duration of diabetes and the change in Hba1c
# (Series.corr, default method).
duration = df['duration_diabetes']
change = df['hba1c_change']
correlation = duration.corr(change)
print(f"Correlation between Duration of Diabetes and Hba1c reduction: {correlation}")

Correlation between Duration of Diabetes and Hba1c reduction: -0.01094921262555449
