In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

: 

In [2]:
# relative path to the raw lettuce dataset CSV
filepath = "./lettuce_dataset.csv"

In [3]:
# load the CSV file into a DataFrame, decoding it as Latin-1
df = pd.read_csv(
    filepath,
    encoding="latin1",
)

In [4]:
# parse the 'Date' column into pandas datetime values and store it back
date_values = pd.to_datetime(df["Date"])
df["Date"] = date_values

In [None]:
# temperature and humidity drawn as two lines on one shared axis
fig, ax = plt.subplots(figsize=(10, 5))
for column, line_color in (('Temperature (°C)', 'red'), ('Humidity (%)', 'blue')):
    # each legend label is the column name itself
    ax.plot(df['Date'], df[column], label=column, color=line_color)
ax.set_xlabel('Date')
ax.set_ylabel('Value')
ax.set_title('Temperature and Humidity Over Time')
ax.legend()
ax.grid()
plt.show()

In [None]:
# scatter plot: TDS on the x-axis against growth days on the y-axis
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df['TDS Value (ppm)'], df['Growth Days'], color='green', alpha=0.6)
ax.set(xlabel='TDS Value (ppm)', ylabel='Growth Days', title='TDS vs. Growth Days')
ax.grid()
plt.show()

In [None]:
# box plot showing the spread of the pH readings
ph_axes = df['pH Level'].plot.box(title='pH Level Distribution', grid=True, color='blue')
ph_axes.set_ylabel('pH Level')
plt.show()

In [None]:
# compute the pairwise correlation matrix of the numeric measurements
# (DataFrame.corr() with default arguments)
numeric_columns = ['Temperature (°C)', 'Humidity (%)', 'TDS Value (ppm)', 'pH Level', 'Growth Days']
correlation_matrix = df[numeric_columns].corr()

# plot a heatmap
# Pin the colour scale to the full correlation range [-1, 1]. Without vmin/vmax
# imshow stretches the colour map over the data's own min/max, so the diverging
# 'coolwarm' map would not be centred on zero and its colours would not be
# comparable from one dataset to the next.
plt.imshow(correlation_matrix, cmap='coolwarm', interpolation='nearest', vmin=-1, vmax=1)
plt.colorbar(label='Correlation coefficient')
plt.title('Correlation Heatmap')
tick_positions = range(len(correlation_matrix.columns))
# right-align the rotated labels so each one ends under its own column
plt.xticks(tick_positions, correlation_matrix.columns, rotation=45, ha='right')
plt.yticks(tick_positions, correlation_matrix.columns)
plt.show()

In [None]:
# Summary statistics for numerical columns, computed with DataFrame.describe()
# using its default arguments. The result is kept in `summary_stats` and
# printed as plain text.
summary_stats = df.describe()
print(summary_stats)

# Solutions to the challenges from Google's basic analysis course
1. Converting Celsius to Fahrenheit
2. Converting days to weeks
3. Modifying the humidity column 
4. Creating a new spreadsheet to perform exploratory data analysis

In [10]:
# Fahrenheit from Celsius: F = C * 9/5 + 32
celsius = df['Temperature (°C)']
df['Temperature (°F)'] = celsius * 9 / 5 + 32

# growth duration expressed in weeks, rounded to two decimals
df['Growth Weeks'] = df['Growth Days'].div(7).round(2)

# min-max normalization of humidity: (value - min) / (max - min)
humidity_pct = df['Humidity (%)']
humidity_min = humidity_pct.min()
humidity_max = humidity_pct.max()
df['Normalized Humidity'] = (humidity_pct - humidity_min) / (humidity_max - humidity_min)

# categorize humidity levels
def categorize_humidity(humidity):
    """Map a single humidity reading to a coarse category label.

    Parameters
    ----------
    humidity : number
        One humidity value, on the same scale as the 30 / 60 thresholds below.

    Returns
    -------
    str or None
        'Low' for values below 30, 'Moderate' for values from 30 to 60
        inclusive, 'High' for values above 60, and None when the reading
        is missing (NaN / None / pd.NA).
    """
    # NaN compares False against every threshold, so without this guard a
    # missing reading would fall through all checks and be labelled 'High'.
    if pd.isna(humidity):
        return None
    if humidity < 30:
        return 'Low'
    # humidity >= 30 is already guaranteed here, only the upper bound matters
    if humidity <= 60:
        return 'Moderate'
    return 'High'

# label every humidity reading with its category and store it as a new column
humidity_labels = df['Humidity (%)'].apply(categorize_humidity)
df['Humidity Level'] = humidity_labels

In [11]:
# write the extended DataFrame to a new CSV file, leaving out the row index
df.to_csv(path_or_buf="modified_lettuce_data.csv", index=False)

In [None]:
# reload modified dataset
# CSV stores 'Date' as text, so parse it on load; otherwise the column comes
# back as plain strings instead of the datetime values it held before saving.
modified_df = pd.read_csv('modified_lettuce_data.csv', parse_dates=['Date'])

# quick summary
print(modified_df.describe())

# count how many rows fall into each 'Humidity Level' category
print(modified_df['Humidity Level'].value_counts())