In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the weatherAUS.csv file
df = pd.read_csv('data/weatherAUS.csv')

# Step 1: Drop rows where RainTomorrow is NaN
print(f"Original shape: {df.shape}")
print(f"NaN count in RainTomorrow: {df['RainTomorrow'].isna().sum()}")
df = df.dropna(subset=['RainTomorrow'])
print(f"Shape after dropping NaN targets: {df.shape}")
print(f"NaN count in RainTomorrow after dropping: {df['RainTomorrow'].isna().sum()}")

# Step 2: Manually map the target and VERIFY it's properly converted
# Create a new target column to be certain
df['RainTomorrow_numeric'] = np.where(df['RainTomorrow'] == 'Yes', 1, 0)
print("\nTarget mapping verification:")
print(df[['RainTomorrow', 'RainTomorrow_numeric']].head(10))
print(f"Unique values in RainTomorrow: {df['RainTomorrow'].unique()}")
print(f"Unique values in RainTomorrow_numeric: {df['RainTomorrow_numeric'].unique()}")
print(f"NaN in RainTomorrow_numeric: {df['RainTomorrow_numeric'].isna().sum()}")

# Step 3: Handle all other missing values (complete case approach)
df_complete = df.dropna()
print(f"\nShape after dropping all NaNs: {df_complete.shape}")
print(f"Total NaNs remaining: {df_complete.isna().sum().sum()}")

# Step 4: Use this clean dataframe for train/test split
train_df, test_df = train_test_split(
    df_complete,
    test_size=0.2,
    random_state=42,
    stratify=df_complete['RainTomorrow_numeric']
)

# Step 5: Replace the original RainTomorrow with our numeric version
train_df['RainTomorrow'] = train_df['RainTomorrow_numeric']
test_df['RainTomorrow'] = test_df['RainTomorrow_numeric']

# Step 6: Drop the temporary column
train_df = train_df.drop('RainTomorrow_numeric', axis=1)
test_df = test_df.drop('RainTomorrow_numeric', axis=1)

# Step 7: Verify our train and test datasets
print("\nVerification of prepared datasets:")
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Train target values: {train_df['RainTomorrow'].value_counts().to_dict()}")
print(f"Test target values: {test_df['RainTomorrow'].value_counts().to_dict()}")
print(f"Train NaNs: {train_df.isna().sum().sum()}")
print(f"Test NaNs: {test_df.isna().sum().sum()}")

# Step 8: Save the files
os.makedirs('data', exist_ok=True)
train_df.to_csv('data/train.csv', index=False)
test_df.to_csv('data/test.csv', index=False)

# Step 9: One final verification - this is crucial
# Specifically test the _read_data function logic from problem.py
def test_read_function():
    train_path = os.path.join('data', 'test.csv')
    df = pd.read_csv(train_path)
    
    # Process target column (don't map - it's already numeric)
    y_array = df['RainTomorrow'].values
    X_df = df.drop('RainTomorrow', axis=1)
    
    print("\nFinal verification:")
    print(f"X shape: {X_df.shape}")
    print(f"y shape: {y_array.shape}")
    print(f"y contains NaN: {np.isnan(y_array).any()}")
    print(f"y unique values: {np.unique(y_array)}")
    
    return X_df, y_array

test_read_function()

Original shape: (145460, 23)
NaN count in RainTomorrow: 3267
Shape after dropping NaN targets: (142193, 23)
NaN count in RainTomorrow after dropping: 0

Target mapping verification:
  RainTomorrow  RainTomorrow_numeric
0           No                     0
1           No                     0
2           No                     0
3           No                     0
4           No                     0
5           No                     0
6           No                     0
7           No                     0
8          Yes                     1
9           No                     0
Unique values in RainTomorrow: ['No' 'Yes']
Unique values in RainTomorrow_numeric: [0 1]
NaN in RainTomorrow_numeric: 0

Shape after dropping all NaNs: (56420, 24)
Total NaNs remaining: 0

Verification of prepared datasets:
Train shape: (45136, 23)
Test shape: (11284, 23)
Train target values: {0: 35194, 1: 9942}
Test target values: {0: 8799, 1: 2485}
Train NaNs: 0
Test NaNs: 0

Final verification:
X shape: (

(             Date      Location  MinTemp  MaxTemp  Rainfall  Evaporation  \
 0      2012-04-23         Perth     11.3     23.1       0.0          5.0   
 1      2013-04-13      Portland     13.5     19.0       0.2          4.6   
 2      2011-05-27        Cairns     13.2     25.4       0.0          6.0   
 3      2017-06-15       Mildura      6.2     18.6       0.0          6.6   
 4      2011-02-16         Moree     19.9     32.3       1.0          4.8   
 ...           ...           ...      ...      ...       ...          ...   
 11279  2012-08-10        Sydney      7.6     14.9       0.0          5.0   
 11280  2010-05-27  AliceSprings      6.8     21.7       0.0          4.0   
 11281  2013-03-26          Sale     11.5     29.1       0.0          5.0   
 11282  2010-01-09       Mildura     18.0     42.0       0.0         12.0   
 11283  2009-06-11          Sale      3.9     12.2       0.0          2.2   
 
        Sunshine WindGustDir  WindGustSpeed WindDir9am  ... WindSpeed3pm  