In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

In [5]:
# Load the data
df = pd.read_csv('../data/weather.csv', parse_dates=['date'])
df.head()

Unnamed: 0,date,temperature,max_temp,min_temp,humidity,precipitation,wind_speed
0,2024-01-01,15.5,20.1,10.0,85,5.2,12.3
1,2024-01-02,14.2,19.5,9.5,80,0.0,8.5
2,2024-01-03,,21.0,11.0,78,3.0,10.2
3,2024-01-04,17.8,22.5,13.0,82,,11.1
4,2024-01-05,16.4,21.8,11.2,75,0.0,9.8


In [6]:
# Data info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           10 non-null     datetime64[ns]
 1   temperature    8 non-null      float64       
 2   max_temp       10 non-null     float64       
 3   min_temp       10 non-null     float64       
 4   humidity       10 non-null     int64         
 5   precipitation  9 non-null      float64       
 6   wind_speed     10 non-null     float64       
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 692.0 bytes


In [7]:
# Check for missing values
df.isnull().sum()

date             0
temperature      2
max_temp         0
min_temp         0
humidity         0
precipitation    1
wind_speed       0
dtype: int64

In [8]:
# Basic statistics
df.describe()

Unnamed: 0,date,temperature,max_temp,min_temp,humidity,precipitation,wind_speed
count,10,8.0,10.0,10.0,10.0,9.0,10.0
mean,2024-01-05 12:00:00,15.7,20.63,10.52,79.5,1.866667,10.0
min,2024-01-01 00:00:00,14.2,19.5,9.5,75.0,0.0,8.5
25%,2024-01-03 06:00:00,15.05,19.85,9.85,78.0,0.0,9.275
50%,2024-01-05 12:00:00,15.6,20.5,10.25,79.5,1.2,9.9
75%,2024-01-07 18:00:00,16.1,21.15,10.875,80.75,3.0,10.425
max,2024-01-10 00:00:00,17.8,22.5,13.0,85.0,5.2,12.3
std,,1.087592,1.017677,1.033656,2.798809,1.981792,1.11455


In [9]:
# Fill missing temperature with mean
df['temperature'].fillna(df['temperature'].mean(), inplace=True)

# Fill missing precipitation with 0 (assumption: no rainfall)
df['precipitation'].fillna(0, inplace=True)

# Check again
df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['temperature'].fillna(df['temperature'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['precipitation'].fillna(0, inplace=True)


date             0
temperature      0
max_temp         0
min_temp         0
humidity         0
precipitation    0
wind_speed       0
dtype: int64

In [None]:
# Drop duplicate rows
df.drop_duplicates(inplace=True)

# Check for invalid values (e.g., negative precipitation)
df = df[df['precipitation'] >= 0]


In [None]:
# Temperature range
df['temperature_range'] = df['max_temp'] - df['min_temp']

# Is rainy day?
df['is_rainy_day'] = df['precipitation'] > 0

# Day of week
df['day_of_week'] = df['date'].dt.day_name()

In [None]:
# Summary
print("\nSummary Statistics:")
print(df.describe())

# Rainy vs non-rainy
df['is_rainy_day'].value_counts()

# Average temperature per day of week
df.groupby('day_of_week')['temperature'].mean().sort_values()

In [None]:
# Boxplot for temperature
sns.boxplot(x=df['temperature'])
plt.title("Temperature Outliers")
plt.show()

# Remove extreme outliers (IQR method)
Q1 = df['temperature'].quantile(0.25)
Q3 = df['temperature'].quantile(0.75)
IQR = Q3 - Q1

# Define bounds
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

# Keep only non-outliers
df = df[(df['temperature'] >= lower) & (df['temperature'] <= upper)]

In [None]:
# Log transform skewed column (precipitation)
df['precipitation_log'] = np.log1p(df['precipitation'])

In [None]:
# Histogram of temperature
sns.histplot(df['temperature'], kde=True)
plt.title("Temperature Distribution")
plt.show()

# Line plot: temperature over time
plt.figure(figsize=(10,4))
plt.plot(df['date'], df['temperature'], marker='o')
plt.title("Temperature Over Time")
plt.xlabel("Date")
plt.ylabel("Temperature")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Boxplot of humidity by day of week
plt.figure(figsize=(8,4))
sns.boxplot(x='day_of_week', y='humidity', data=df)
plt.title("Humidity by Day of Week")
plt.xticks(rotation=45)
plt.show()

# Heatmap of correlation matrix
plt.figure(figsize=(8,6))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm')
plt.title("Feature Correlations")
plt.show()

## Conclusion
print("\nReview 1 Complete: Preprocessing, Summary Statistics, Visuals Ready.")
