In [1]:
#Import the Libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore") 

In [2]:
# Load Data

df = pd.read_csv("Rainfall.csv")

In [3]:
# Explore the dataframe; the first 10 rows

df.head(10)

Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
0,1,1025.9,19.9,18.3,16.8,13.1,72,49,yes,9.3,80.0,26.3
1,2,1022.0,21.7,18.9,17.2,15.6,81,83,yes,0.6,50.0,15.3
2,3,1019.7,20.3,19.3,18.0,18.4,95,91,yes,0.0,40.0,14.2
3,4,1018.9,22.3,20.6,19.1,18.8,90,88,yes,1.0,50.0,16.9
4,5,1015.9,21.3,20.7,20.2,19.9,95,81,yes,0.0,40.0,13.7
5,6,1018.8,24.3,20.9,19.2,18.0,84,51,yes,7.7,20.0,14.5
6,7,1021.8,21.4,18.8,17.0,15.0,79,56,no,3.4,30.0,21.5
7,8,1020.8,21.0,18.4,16.5,14.4,78,28,no,7.7,60.0,14.3
8,9,1020.6,18.9,18.1,17.1,14.3,78,79,no,3.3,70.0,39.3
9,10,1017.5,18.5,18.0,17.2,15.5,85,91,yes,0.0,70.0,37.7


In [4]:
# An overview of the dataframe:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   day                     366 non-null    int64  
 1   pressure                366 non-null    float64
 2   maxtemp                 366 non-null    float64
 3   temparature             366 non-null    float64
 4   mintemp                 366 non-null    float64
 5   dewpoint                366 non-null    float64
 6   humidity                366 non-null    int64  
 7   cloud                   366 non-null    int64  
 8   rainfall                366 non-null    object 
 9   sunshine                366 non-null    float64
 10           winddirection  365 non-null    float64
 11  windspeed               365 non-null    float64
dtypes: float64(8), int64(3), object(1)
memory usage: 34.4+ KB


In [5]:
# Remove the unwanted column:

df.drop(columns = ["day", "pressure ", "maxtemp", "mintemp", "dewpoint"], inplace = True)

In [6]:
df.head(10)

Unnamed: 0,temparature,humidity,cloud,rainfall,sunshine,winddirection,windspeed
0,18.3,72,49,yes,9.3,80.0,26.3
1,18.9,81,83,yes,0.6,50.0,15.3
2,19.3,95,91,yes,0.0,40.0,14.2
3,20.6,90,88,yes,1.0,50.0,16.9
4,20.7,95,81,yes,0.0,40.0,13.7
5,20.9,84,51,yes,7.7,20.0,14.5
6,18.8,79,56,no,3.4,30.0,21.5
7,18.4,78,28,no,7.7,60.0,14.3
8,18.1,78,79,no,3.3,70.0,39.3
9,18.0,85,91,yes,0.0,70.0,37.7


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   temparature             366 non-null    float64
 1   humidity                366 non-null    int64  
 2   cloud                   366 non-null    int64  
 3   rainfall                366 non-null    object 
 4   sunshine                366 non-null    float64
 5            winddirection  365 non-null    float64
 6   windspeed               365 non-null    float64
dtypes: float64(4), int64(2), object(1)
memory usage: 20.1+ KB


In [8]:
#To Display in Proper Order for Better Underatanding of Data

display = df[['temparature', 'humidity ', 'cloud ', 'sunshine', 'windspeed', 'rainfall']]
display

Unnamed: 0,temparature,humidity,cloud,sunshine,windspeed,rainfall
0,18.3,72,49,9.3,26.3,yes
1,18.9,81,83,0.6,15.3,yes
2,19.3,95,91,0.0,14.2,yes
3,20.6,90,88,1.0,16.9,yes
4,20.7,95,81,0.0,13.7,yes
...,...,...,...,...,...,...
361,17.7,84,90,0.0,18.4,yes
362,17.3,75,85,1.0,25.9,yes
363,17.7,75,78,4.6,33.4,yes
364,17.3,78,86,1.2,20.9,yes


In [9]:
# Convert categorical rainfall labels as needed

df['rainfall'] = df['rainfall'].map({'no': 0, 'yes': 1 })

In [10]:
display2 = df['rainfall']
display2

0      1
1      1
2      1
3      1
4      1
      ..
361    1
362    1
363    1
364    1
365    0
Name: rainfall, Length: 366, dtype: int64

In [11]:
# Features & Label

X = df[['temparature', 'humidity ', 'cloud ', 'sunshine']]
y = df['rainfall']

In [12]:
# Dataset summary:

print(X.shape, y.shape)

(366, 4) (366,)


In [13]:
# Split into train and test sets:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print("input train: ", X.shape)
print("\ninput test: ", X.shape)
print("\noutput train: ", y.shape)
print("\noutput test: ", y.shape)

input train:  (366, 4)

input test:  (366, 4)

output train:  (366,)

output test:  (366,)


In [14]:
# Standardize the training and test set:

sc = StandardScaler()
input_train = sc.fit_transform(X_train)
input_test = sc.transform(X_test)

In [15]:
# Print the first 10 rows of the 'input_train' 2D array:

print(input_train[:10])

[[-1.23507605 -0.83026135  0.7225914  -0.95330155]
 [ 1.06778283 -0.43135339 -0.94412028  1.03735294]
 [-1.59773099  0.96482449  1.35323907 -1.18008497]
 [-1.88785494 -0.53108038  0.81268393 -1.12968866]
 [-1.18067781 -0.53108038  0.81268393 -1.18008497]
 [-1.14441231  0.96482449  0.40726757 -0.77691444]
 [-0.79989012  1.06455148  0.27212879 -1.18008497]
 [ 0.88645536 -1.92725826 -1.79999925  1.61691058]
 [-1.03561583 -0.23189941 -1.89009177  0.7601732 ]
 [ 0.14301273  0.7653705   0.49736009 -1.1044905 ]]


In [16]:
# Make a Random Forest Regressor model and train it:

model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(input_train, y_train)

# Make predictions:

predictions_forest = model.predict(input_test)

In [17]:
print("Accuracy:", accuracy_score(y_test, predictions_forest))
print(classification_report(y_test, predictions_forest))

Accuracy: 0.7432432432432432
              precision    recall  f1-score   support

           0       0.62      0.43      0.51        23
           1       0.78      0.88      0.83        51

    accuracy                           0.74        74
   macro avg       0.70      0.66      0.67        74
weighted avg       0.73      0.74      0.73        74



In [18]:
# Test Sample expected "Yes"
# Values taken similar to rows with rainfall = "yes"

new_data = np.array([[20.0, 95, 81, 0.0]])  # [temparature, humidity, cloud, sunshine]

In [19]:
#Test Sample

new_data = np.array([[20.0, 95, 81, 0.0]])
new_data = sc.transform(new_data) 
prediction = model.predict(new_data)[0]

print("Will it rain tomorrow? :", "Yes" if prediction == 1 else "No")

Will it rain tomorrow? : Yes


In [20]:
#Test Sample 2

new_data = np.array([[19, 70, 40, 8]])
new_data = sc.transform(new_data) 
prediction = model.predict(new_data)[0]

print("Will it rain tomorrow? :", "Yes" if prediction == 1 else "No")

Will it rain tomorrow? : No


In [21]:
#Test Sample 3

new_data = np.array([[18.0, 70, 30, 8.0]])
new_data = sc.transform(new_data) 
prediction = model.predict(new_data)[0]

print("Will it rain tomorrow? :", "Yes" if prediction == 1 else "No")

Will it rain tomorrow? : No


In [22]:
#Test Sample 4

new_data = np.array([[19.0, 99, 70, 1.0]])
new_data = sc.transform(new_data) 
prediction = model.predict(new_data)[0]

print("Will it rain tomorrow? :", "Yes" if prediction == 1 else "No")

Will it rain tomorrow? : Yes
