In [1]:
import pandas as pd

# Load the dataset
url = "https://raw.githubusercontent.com/dsrscientist/dataset3/main/weatherAUS.csv"
data = pd.read_csv(url)

# Display the first few rows of the dataset
print(data.head())

# Information about the dataset
print(data.info())

# Summary statistics
print(data.describe())

# Check for missing values
print(data.isnull().sum())

# Data preprocessing
# Drop irrelevant columns for Problem 1a
data.drop(columns=['Date', 'Location', 'RainTomorrow'], inplace=True)

# Handling missing values
# For numerical columns, fill missing values with mean
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns
data[numerical_columns] = data[numerical_columns].fillna(data[numerical_columns].mean())

# For categorical columns, fill missing values with mode
categorical_columns = data.select_dtypes(include=['object']).columns
data[categorical_columns] = data[categorical_columns].fillna(data[categorical_columns].mode().iloc[0])

# For Problem 1b, drop rows where Rainfall is missing
problem_1b_data = data.dropna(subset=['Rainfall'])

# Encoding categorical variables for Problem 1a
data = pd.get_dummies(data, drop_first=True)

# Separate features and target variable for Problem 1a
X1a = data.drop(columns=['RainToday_Yes'])
y1a = data['RainToday_Yes']

# Separate features and target variable for Problem 1b
X1b = problem_1b_data.drop(columns=['Rainfall'])
y1b = problem_1b_data['Rainfall']

# Split the dataset into training and testing sets for both problems
from sklearn.model_selection import train_test_split
X1a_train, X1a_test, y1a_train, y1a_test = train_test_split(X1a, y1a, test_size=0.2, random_state=42)
X1b_train, X1b_test, y1b_train, y1b_test = train_test_split(X1b, y1b, test_size=0.2, random_state=42)

# Train machine learning models for both problems
# For Problem 1a, use classification algorithms like Logistic Regression, Random Forest, etc.
from sklearn.ensemble import RandomForestClassifier
model_1a = RandomForestClassifier()
model_1a.fit(X1a_train, y1a_train)

# For Problem 1b, use regression algorithms like Linear Regression, Random Forest Regressor, etc.
from sklearn.ensemble import RandomForestRegressor
model_1b = RandomForestRegressor()
model_1b.fit(X1b_train, y1b_train)

# Evaluate models
# For Problem 1a
accuracy_1a = model_1a.score(X1a_test, y1a_test)
print("Accuracy for Problem 1a:", accuracy_1a)

# For Problem 1b
score_1b = model_1b.score(X1b_test, y1b_test)
print("Score (R^2) for Problem 1b:", score_1b)


         Date Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  \
0  2008-12-01   Albury     13.4     22.9       0.6          NaN       NaN   
1  2008-12-02   Albury      7.4     25.1       0.0          NaN       NaN   
2  2008-12-03   Albury     12.9     25.7       0.0          NaN       NaN   
3  2008-12-04   Albury      9.2     28.0       0.0          NaN       NaN   
4  2008-12-05   Albury     17.5     32.3       1.0          NaN       NaN   

  WindGustDir  WindGustSpeed WindDir9am  ... Humidity9am  Humidity3pm  \
0           W           44.0          W  ...        71.0         22.0   
1         WNW           44.0        NNW  ...        44.0         25.0   
2         WSW           46.0          W  ...        38.0         30.0   
3          NE           24.0         SE  ...        45.0         16.0   
4           W           41.0        ENE  ...        82.0         33.0   

   Pressure9am  Pressure3pm  Cloud9am  Cloud3pm  Temp9am  Temp3pm  RainToday  \
0       1007.7    

ValueError: could not convert string to float: 'NE'