Some typical NumPy 'random' functions:

In [None]:
import numpy as np

# Quick tour of the legacy ``np.random`` convenience functions.

# Uniform random floats in the half-open interval [0, 1): 0 can occur, 1 cannot
print(np.random.rand(3, 2))  # Creates a 3 x 2 matrix of such values

# Random floats from the standard normal distribution (mean = 0 and std = 1)
print(np.random.randn(5))  # Generates 5 values

# Random integers in [low, high): the lower bound is inclusive, the upper bound is exclusive
print(np.random.randint(1, 100, 10))  # Generates 10 integers from 1 to 99

# Randomly select elements from a given list (sampling is with replacement by default)
print(np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9], size=3))  # Three values are picked from the list

# Random sample from a normal distribution with a specified mean and standard deviation
print(np.random.normal(loc=0, scale=1, size=10))  # 10 values; loc is the mean, scale is the standard deviation

# Seed the generator so that the random numbers drawn AFTER this call are the
# same on every run. The seed value 42 is arbitrary. seed() returns None, so
# there is nothing to print; it only affects draws made after it is called.
np.random.seed(42)

AIM #1: Generate a very large dataset
1. Generate a dataset of 1 million random data items between 1 and 100 using only pandas
2. Generate a dataset of 1 million random data items between 1 and 100 using only NumPy
3. Calculate the time it takes for both the above operations. 
    3.1. Import the 'time' module, and use the time() function to calculate current time
    3.2. Which one is faster and why?

In [None]:
import pandas as pd
import numpy as np
import time

# Benchmark: build one million random integers in [1, 100] as a pandas
# DataFrame and as a plain NumPy array, and compare the elapsed time.
#
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock meant for measuring short intervals, whereas the wall
# clock can be coarse or be adjusted while the measurement is running.

# The values are drawn with NumPy and then wrapped in a DataFrame, so the
# pandas timing is "NumPy generation + DataFrame construction".
start_time_pandas = time.perf_counter()
pandas_data = pd.DataFrame(np.random.randint(1, 101, size=(1000000, 1)), columns=['value'])
end_time_pandas = time.perf_counter()
pandas_time = end_time_pandas - start_time_pandas

# randint's upper bound is exclusive, hence 101 to include 100.
start_time_numpy = time.perf_counter()
numpy_data = np.random.randint(1, 101, size=(1000000, 1))
end_time_numpy = time.perf_counter()
numpy_time = end_time_numpy - start_time_numpy

print("Time taken to generate a dataset using pandas:", pandas_time, "seconds")
print("Time taken to generate a dataset using numpy:", numpy_time, "seconds")

# A single run is noisy; the branch below reports whatever this run measured.
if numpy_time < pandas_time:
    print("numpy is faster because it is a library specifically for numerical computing and avoids some of the additional data structure and indexing processing overheads that may be involved in pandas when generating an array of random numbers.")
else:
    print("pandas is faster, which may be related to the specific running environment and the underlying optimization of data processing.")

AIM #2: Basic statistics
For the given dataset on sleep health and lifestyle, do the following
1. Using only pandas, load the dataset, calculate mean 'Sleep Duration', 'Systolic Blood Pressure', 'Diastolic Blood Pressure', 'Heart Rate' and 'Daily Steps'.
2. Do the same as in Step 1 using only NumPy
3. Using only pandas, first calculate correlation (across only the numerical variables), and then separate correlation between...
    Sleep duration and Age
    Sleep duration and Heart rate
    Sleep duration and Daily steps
4. Using only NumPy, do the same as Step 3
5. Using pandas only, calculate standard deviation for 'Sleep Duration'. 
6. Using NumPy only, calculate standard deviation for 'Sleep Duration'. 
7. Calculate the time difference between using pandas and NumPy, right from the step of loading the dataset to the final standard deviation step. 
    7.1. Which one is faster and why?

In [None]:
import pandas as pd
import numpy as np
import time

# Compare pandas and NumPy on the same workload: load the CSV, compute column
# means, correlations with 'Sleep Duration', and its standard deviation.
# Intervals are measured with time.perf_counter(), a monotonic high-resolution
# clock intended for timing, rather than the adjustable wall clock.

# ---- pandas ----
start_time_pandas = time.perf_counter()
data_pandas = pd.read_csv('sleep_health.csv')  # Assume the data is stored in the sleep_health.csv file
mean_sleep_duration_pandas = data_pandas['Sleep Duration'].mean()
# NOTE(review): the task text spells these two columns 'Systolic Blood Pressure'
# and 'Diastolic Blood Pressure' -- confirm the capitalisation against the CSV header.
mean_systolic_bp_pandas = data_pandas['Systolic blood pressure'].mean()
mean_diastolic_bp_pandas = data_pandas['Diastolic blood pressure'].mean()
mean_heart_rate_pandas = data_pandas['Heart Rate'].mean()
mean_daily_steps_pandas = data_pandas['Daily Steps'].mean()

# Correlate numeric columns only. Calling corr() on a frame that still holds
# text columns raises on pandas >= 2.0; select_dtypes works on every version.
correlation_matrix_pandas = data_pandas.select_dtypes(include='number').corr()
correlation_sleep_age_pandas = correlation_matrix_pandas.loc['Sleep Duration', 'Age']
correlation_sleep_heart_rate_pandas = correlation_matrix_pandas.loc['Sleep Duration', 'Heart Rate']
correlation_sleep_daily_steps_pandas = correlation_matrix_pandas.loc['Sleep Duration', 'Daily Steps']

std_sleep_duration_pandas = data_pandas['Sleep Duration'].std()  # sample std (ddof=1)

end_time_pandas = time.perf_counter()
time_taken_pandas = end_time_pandas - start_time_pandas

# ---- NumPy ----
start_time_numpy = time.perf_counter()
# genfromtxt parses every field as float here, so text columns come back as NaN.
data_numpy = np.genfromtxt('sleep_health.csv', delimiter=',', skip_header=1)

# Column positions used throughout this notebook -- assumed layout, confirm
# against the CSV header: 2 = Age, 4 = Sleep Duration, 5 = Quality of Sleep,
# 6 = Physical Activity Level, 7 = Stress Level, 9 = Systolic BP,
# 10 = Diastolic BP, 11 = Heart Rate, 12 = Daily Steps.
sleep_duration_column = data_numpy[:, 4]
systolic_bp_column = data_numpy[:, 9]
diastolic_bp_column = data_numpy[:, 10]
heart_rate_column = data_numpy[:, 11]
daily_steps_column = data_numpy[:, 12]

mean_sleep_duration_numpy = np.mean(sleep_duration_column)
mean_systolic_bp_numpy = np.mean(systolic_bp_column)
mean_diastolic_bp_numpy = np.mean(diastolic_bp_column)
mean_heart_rate_numpy = np.mean(heart_rate_column)
mean_daily_steps_numpy = np.mean(daily_steps_column)

# Build the matrix over the numeric columns only, INCLUDING Age (column 2).
# The previous slice data_numpy[:, 4:] did not contain Age at all, so the
# "sleep vs age" entry was really sleep vs column 5.
numeric_columns = [2, 4, 5, 6, 7, 9, 10, 11, 12]
position = {column: index for index, column in enumerate(numeric_columns)}
correlation_matrix_numpy = np.corrcoef(data_numpy[:, numeric_columns], rowvar=False)
correlation_sleep_age_numpy = correlation_matrix_numpy[position[4], position[2]]
correlation_sleep_heart_rate_numpy = correlation_matrix_numpy[position[4], position[11]]
correlation_sleep_daily_steps_numpy = correlation_matrix_numpy[position[4], position[12]]

# ddof=1 gives the sample standard deviation, matching pandas' default;
# NumPy's own default (ddof=0) would print a slightly different number.
std_sleep_duration_numpy = np.std(sleep_duration_column, ddof=1)

end_time_numpy = time.perf_counter()
time_taken_numpy = end_time_numpy - start_time_numpy

print("Results calculated using pandas:")
print("Mean sleep duration:", mean_sleep_duration_pandas)
print("Mean systolic blood pressure:", mean_systolic_bp_pandas)
print("Mean diastolic blood pressure:", mean_diastolic_bp_pandas)
print("Mean heart rate:", mean_heart_rate_pandas)
print("Mean daily steps:", mean_daily_steps_pandas)
print("Correlation between sleep duration and age:", correlation_sleep_age_pandas)
print("Correlation between sleep duration and heart rate:", correlation_sleep_heart_rate_pandas)
print("Correlation between sleep duration and daily steps:", correlation_sleep_daily_steps_pandas)
print("Standard deviation of sleep duration:", std_sleep_duration_pandas)
print("Total time taken using pandas:", time_taken_pandas, "seconds")

print("Results calculated using NumPy:")
print("Mean sleep duration:", mean_sleep_duration_numpy)
print("Mean systolic blood pressure:", mean_systolic_bp_numpy)
print("Mean diastolic blood pressure:", mean_diastolic_bp_numpy)
print("Mean heart rate:", mean_heart_rate_numpy)
print("Mean daily steps:", mean_daily_steps_numpy)
print("Correlation between sleep duration and age:", correlation_sleep_age_numpy)
print("Correlation between sleep duration and heart rate:", correlation_sleep_heart_rate_numpy)
print("Correlation between sleep duration and daily steps:", correlation_sleep_daily_steps_numpy)
print("Standard deviation of sleep duration:", std_sleep_duration_numpy)
print("Total time taken using NumPy:", time_taken_numpy, "seconds")

# A single run is noisy; the branch below reports whatever this run measured.
if time_taken_numpy < time_taken_pandas:
    print("NumPy is faster because it is a library specifically for numerical computing and avoids some additional overheads in pandas during data processing, such as the creation of data structures and indexing operations.")
else:
    print("Pandas is faster, which may be related to the specific structure and processing method of the data, or the underlying optimization of pandas in this particular environment is more suitable for this dataset.")

AIM #3: Use suitable plots to visualize the data

1. Using only pandas (and matplotlib/seaborn if necessary) plot the distribution for
    1.1. Age
    1.2. Sleep Duration
    1.3. Quality of Sleep
    1.4. Physical Activity Level
    1.5. Stress Level
    1.6. Heart Rate
2. Using only NumPy, do the same as Step 1. You will need matplotlib for this
3. Using only pandas, use the appropriate plot to
    3.1. See the distribution of 'Sleep Duration' based on 'Quality of Sleep'
    3.2. See the distribution of 'Sleep Duration' based on 'Stress Level'
    3.3. See the distribution of 'Sleep Duration' based on 'Physical Activity Level'
    3.4. See the distribution of 'Sleep Duration' based on 'Occupation'
    3.5. See the distribution of 'Sleep Duration' based on 'BMI'
4. Using only NumPy, do the same as Step 3. You will need matplotlib for this
5. Using only pandas, use a suitable plot to see the relation between
    5.1. Age and Sleep Duration
    5.2. Sleep Duration and Heart Rate
    5.3. Heart Rate and Daily Steps
    5.4. Sleep Duration and Daily Steps
6. Using only NumPy, do the same as Step 5. You will need matplotlib for this 
7. Find the time difference between plotting using only pandas, and plotting using NumPy

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Draw the same three figures twice -- once from a pandas DataFrame (with
# seaborn) and once from NumPy arrays (with plain matplotlib) -- and time each.
# The timings cover loading the CSV and building the figures; nothing here
# calls plt.show(), so rendering is left to the notebook.
# time.perf_counter() is used because it is a monotonic high-resolution clock.

# ---- pandas ----
start_time_pandas = time.perf_counter()
data_pandas = pd.read_csv('sleep_health.csv')

# 1. Plot the distribution of each column
distribution_columns = [
    'Age',
    'Sleep Duration',
    'Quality of Sleep',
    'Physical Activity Level',
    'Stress Level',
    'Heart Rate',
]
plt.figure(figsize=(12, 8))
for panel, column in enumerate(distribution_columns, start=1):
    plt.subplot(3, 2, panel)
    sns.histplot(data_pandas[column], kde=True)
    plt.title(f'{column} Distribution')
plt.tight_layout()

# 3. Plot the distribution of Sleep Duration based on different conditions
#    (DataFrame column to group by, name used in the panel title)
grouping_columns = [
    ('Quality of Sleep', 'Quality of Sleep'),
    ('Stress Level', 'Stress Level'),
    ('Physical Activity Level', 'Physical Activity Level'),
    ('Occupation', 'Occupation'),
    ('BMI Category', 'BMI'),
]
plt.figure(figsize=(12, 8))
for panel, (column, title_name) in enumerate(grouping_columns, start=1):
    plt.subplot(3, 2, panel)
    sns.boxplot(x=column, y='Sleep Duration', data=data_pandas)
    plt.title(f'Sleep Duration by {title_name}')
plt.tight_layout()  # keep titles and tick labels of neighbouring panels apart

# 5. Plot the relationship between variables
relation_pairs = [
    ('Age', 'Sleep Duration'),
    ('Sleep Duration', 'Heart Rate'),
    ('Heart Rate', 'Daily Steps'),
    ('Sleep Duration', 'Daily Steps'),
]
plt.figure(figsize=(12, 8))
for panel, (x_column, y_column) in enumerate(relation_pairs, start=1):
    plt.subplot(2, 2, panel)
    sns.scatterplot(x=x_column, y=y_column, data=data_pandas)
    plt.title(f'{x_column} vs {y_column}')
plt.tight_layout()

end_time_pandas = time.perf_counter()
time_taken_pandas = end_time_pandas - start_time_pandas

# ---- NumPy ----
start_time_numpy = time.perf_counter()
# Numeric view: every field is parsed as float, text columns become NaN.
data_numpy = np.genfromtxt('sleep_health.csv', delimiter=',', skip_header=1)
# Text view of the two categorical columns. Read as float they are all NaN,
# and since NaN != NaN the grouping masks below would select no rows at all.
# Assumes column 3 = Occupation and column 8 = BMI Category, and that no field
# contains a quoted comma -- confirm against the CSV.
text_numpy = np.genfromtxt('sleep_health.csv', delimiter=',', skip_header=1,
                           usecols=(3, 8), dtype=str)
occupation_column = text_numpy[:, 0]
bmi_column = text_numpy[:, 1]
sleep_duration_column = data_numpy[:, 4]

# 1. Plot the distribution of each column
#    (column index in the CSV, panel title)
distribution_indices = [
    (2, 'Age Distribution'),
    (4, 'Sleep Duration Distribution'),
    (5, 'Quality of Sleep Distribution'),
    (6, 'Physical Activity Level Distribution'),
    (7, 'Stress Level Distribution'),
    (11, 'Heart Rate Distribution'),
]
plt.figure(figsize=(12, 8))
for panel, (column_index, title) in enumerate(distribution_indices, start=1):
    plt.subplot(3, 2, panel)
    plt.hist(data_numpy[:, column_index], bins=20, density=True)
    plt.title(title)
plt.tight_layout()

# 3. Plot the distribution of Sleep Duration based on different conditions:
#    one semi-transparent histogram per distinct value of the grouping column.
#    (grouping values, legend prefix, panel title)
grouping_arrays = [
    (data_numpy[:, 5], 'Quality', 'Sleep Duration by Quality of Sleep'),
    (data_numpy[:, 7], 'Stress', 'Sleep Duration by Stress Level'),
    (data_numpy[:, 6], 'Activity', 'Sleep Duration by Physical Activity Level'),
    (occupation_column, 'Occupation', 'Sleep Duration by Occupation'),
    (bmi_column, 'BMI', 'Sleep Duration by BMI'),
]
plt.figure(figsize=(12, 8))
for panel, (group_values, prefix, title) in enumerate(grouping_arrays, start=1):
    plt.subplot(3, 2, panel)
    for level in np.unique(group_values):
        mask = group_values == level
        plt.hist(sleep_duration_column[mask], bins=10, alpha=0.4, label=f'{prefix} {level}')
    plt.title(title)
    plt.legend()
plt.tight_layout()

# 5. Plot the relationship between variables
#    (x column index, y column index, panel title)
relation_indices = [
    (2, 4, 'Age vs Sleep Duration'),
    (4, 11, 'Sleep Duration vs Heart Rate'),
    (11, 12, 'Heart Rate vs Daily Steps'),
    (4, 12, 'Sleep Duration vs Daily Steps'),
]
plt.figure(figsize=(12, 8))
for panel, (x_index, y_index, title) in enumerate(relation_indices, start=1):
    plt.subplot(2, 2, panel)
    plt.scatter(data_numpy[:, x_index], data_numpy[:, y_index])
    plt.title(title)
plt.tight_layout()

end_time_numpy = time.perf_counter()
time_taken_numpy = end_time_numpy - start_time_numpy

print("Time taken for plotting using pandas:", time_taken_pandas, "seconds")
print("Time taken for plotting using NumPy:", time_taken_numpy, "seconds")


AIM #4: Other possible plotting

1. Think of other possible plots to show some interesting distribution and relations. Do this using both pandas and NumPy

