## Data Processing 

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import opendatasets as od
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder

# Loading the data (Kaggle "weather-dataset-rattle-package")

print("Downloadind DATA..")
# The download itself is disabled; the CSV is expected to already be at `data_dir`.
#data_url = 'https://www.kaggle.com/jsphyg/weather-dataset-rattle-package'
#od.download(data_url)
data_dir = './weather-dataset-rattle-package/weatherAUS.csv'

# Rows without a RainToday / RainTomorrow value are dropped, then the label is
# encoded as 0/1 *in place* (not as an extra column) so the raw 'RainTomorrow'
# strings can never end up among the input features.
rain_df = (
    pd.read_csv(data_dir)
    .dropna(subset=['RainToday', 'RainTomorrow'])
    .assign(RainTomorrow=lambda frame: frame['RainTomorrow'].map({'No': 0, 'Yes': 1}))
)

# Splitting the data into train, validation and test sets based on the year

Year = pd.to_datetime(rain_df['Date']).dt.year
train_df, val_df, test_df = rain_df[Year < 2015], rain_df[Year == 2015], rain_df[Year > 2015]

# Splitting input and target columns.
# The columns are named explicitly instead of sliced by position, so an added
# or reordered column cannot silently turn the target into an input.
target_col = 'RainTomorrow'
input_cols = rain_df.columns.drop(['Date', target_col])
train_input, train_target = train_df[input_cols].copy(), train_df[target_col].copy()
val_input, val_target = val_df[input_cols].copy(), val_df[target_col].copy()
test_input, test_target = test_df[input_cols].copy(), test_df[target_col].copy()
split_inputs = (train_input, val_input, test_input)

# Splitting the input columns into numerical and categorical ones.
# Every numeric input is kept: the target is not part of `train_input`, so
# there is no trailing column to strip.
numeric_cols = train_input.select_dtypes(include=np.number).columns.tolist()
categorical_cols = train_input.select_dtypes('object').columns.tolist()

# NOTE(review): the imputer, scaler and encoder below are fitted on the full
# `rain_df`, i.e. including validation/test rows. Fit them on `train_input`
# instead if the held-out scores must be free of any leakage.

# Imputing (filling missing values of) the numerical columns

print("IMPUTING(Filling Missing Values with their MEAN) of numericals columns")
imputer = SimpleImputer(strategy='mean')
imputer.fit(rain_df[numeric_cols])
for split_input in split_inputs:
    split_input[numeric_cols] = imputer.transform(split_input[numeric_cols])

# Scaling the numerical columns

print("Scaling Numerical DATA Using MinMaxScaler...")
scaler = MinMaxScaler()
scaler.fit(rain_df[numeric_cols])
for split_input in split_inputs:
    split_input[numeric_cols] = scaler.transform(split_input[numeric_cols])

# Encoding the categorical columns

print("Encoding Categorical DATA....")
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`.
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    # Older scikit-learn only knows the previous name, `sparse`.
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoder.fit(rain_df[categorical_cols])
if hasattr(encoder, 'get_feature_names_out'):
    encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
else:
    # `get_feature_names` is the pre-1.0 spelling of the same method.
    encoded_cols = list(encoder.get_feature_names(categorical_cols))
for split_input in split_inputs:
    split_input[encoded_cols] = encoder.transform(split_input[categorical_cols])
print("Data is Ready to Put In Model")

# Creating the matrices used for fitting, prediction and accuracy measurement
x_train = train_input[numeric_cols + encoded_cols]
x_val = val_input[numeric_cols + encoded_cols]
x_test = test_input[numeric_cols + encoded_cols]

## Creating & Training Models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
import seaborn as sns

# Creating the logistic regression model

print("Creating LogisticRegression Model...")
LR_model = LogisticRegression(solver='liblinear', max_iter=100)
LR_model.fit(x_train, train_target)
# Plot the 20 features with the largest (most positive) coefficients.
weight_df = pd.DataFrame({'feature': numeric_cols + encoded_cols,
                          'weights': LR_model.coef_.tolist()[0]})
sns.barplot(data=weight_df.sort_values('weights', ascending=False).head(20), x='weights', y='feature')
print("Model is created")

# Creating the decision tree classifiers:
# an unconstrained baseline and a depth/leaf-limited variant.

DT_base_model = DecisionTreeClassifier(random_state=42).fit(x_train, train_target)
DT_accurate_model = DecisionTreeClassifier(max_depth=9,
                                           max_leaf_nodes=150,
                                           random_state=42).fit(x_train, train_target)

# Creating the random forest classifiers.
# These must be RandomForestClassifier instances: `n_estimators` and `n_jobs`
# are not DecisionTreeClassifier parameters and raise TypeError there.

RF_base_model = RandomForestClassifier(n_jobs=-1,
                                       random_state=42).fit(x_train, train_target)
RF_accurate_model = RandomForestClassifier(n_estimators=100,
                                           max_depth=40,
                                           max_leaf_nodes=150,
                                           max_features=7,
                                           n_jobs=-1,
                                           random_state=42).fit(x_train, train_target)

# The evaluation, single-prediction and saving cells all read the global
# `model`; select here which fitted estimator they should use.
model = RF_accurate_model


## Model Prediction & Evaluation

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix
import seaborn as sns

def predict_plot(inputs,target,name='',estimator=None):
    """Print the accuracy and plot the confusion matrix for one data split.

    Parameters
    ----------
    inputs : feature matrix handed to the classifier's ``predict``.
    target : true labels matching ``inputs``.
    name : str
        Label of the split, used in the printed message and the plot title.
    estimator : optional
        Fitted classifier to evaluate. When omitted, the notebook-level
        ``model`` is used, which keeps existing three-argument calls working.

    Returns
    -------
    None. The accuracy is printed and a heatmap is drawn on a new figure.
    """
    classifier = model if estimator is None else estimator
    predicted = classifier.predict(inputs)
    accuracy = accuracy_score(target, predicted)
    print("Accuracy of model on ",name," set=",accuracy*100,"%")
    # normalize='true' divides each row by the number of samples of that true class.
    matri = confusion_matrix(target, predicted, normalize='true')

    # A fresh figure per call so successive splits do not draw over each other.
    plt.figure()
    sns.heatmap(matri, annot=True)
    plt.xlabel('Prediction')
    plt.ylabel('Target')
    plt.title('{} Confusion Matrix'.format(name))

# Accuracy score and confusion matrix on each data set

predict_plot(x_train,train_target,'Training')
predict_plot(x_val,val_target,'Validation')
predict_plot(x_test,test_target,'TEST')

# Evaluating accuracy of two trivial baselines on the test set:
# random Yes/No guesses and always answering "No".
# The generator is seeded so the random baseline is the same on every run.
rng = np.random.default_rng(42)
random_target = rng.choice([1, 0], len(x_test))
print("Accuracy for Random_Yes&No_data",accuracy_score(test_target,random_target))
always_no_target = np.zeros(len(x_test))
print("Accuracy for Always_No_data",accuracy_score(test_target,always_no_target))
      

### Predicting a Single Input

In [None]:
def Is_Rain_Tomorrow(rain_data):
    """Predict the label for a single observation.

    Parameters
    ----------
    rain_data : dict
        One observation; it is turned into a one-row DataFrame and must
        provide every column listed in ``numeric_cols`` and
        ``categorical_cols``.

    Returns
    -------
    tuple
        ``(label, probability)`` where ``label`` is the class predicted by the
        notebook-level ``model`` and ``probability`` is the value
        ``predict_proba`` assigns to that same class.
    """
    row = pd.DataFrame([rain_data])

    # Run the row through the fitted imputer, scaler and encoder, in that order.
    row[numeric_cols] = imputer.transform(row[numeric_cols])
    row[numeric_cols] = scaler.transform(row[numeric_cols])
    row[encoded_cols] = encoder.transform(row[categorical_cols])

    label = model.predict(row[numeric_cols + encoded_cols])[0]
    class_probabilities = model.predict_proba(row[numeric_cols + encoded_cols])[0]

    # predict_proba columns follow model.classes_, so look up the label's slot.
    label_position = list(model.classes_).index(label)
    return label, class_probabilities[label_position]

# Example observation used to demonstrate a single prediction.
new_input = {'Date': '2021-06-19','Location': 'Katherine','MinTemp': 23.2,'MaxTemp': 33.2,'Rainfall': 10.2,
             'Evaporation': 4.2,'Sunshine': np.nan,'WindGustDir': 'NNW','WindGustSpeed': 52.0,'WindDir9am': 'NW',
             'WindDir3pm': 'NNE','WindSpeed9am': 13.0,'WindSpeed3pm': 20.0,'Humidity9am': 89.0,'Humidity3pm': 58.0,
             'Pressure9am': 1004.8,'Pressure3pm': 1001.5,'Cloud9am': 8.0,'Cloud3pm': 5.0,'Temp9am': 25.7,
             'Temp3pm': 33.0,'RainToday': 'Yes'}

# Run the preprocessing + inference once and reuse both results, instead of
# calling Is_Rain_Tomorrow again for the same input inside each branch.
will_rain, probability = Is_Rain_Tomorrow(new_input)
if will_rain:
    print("Possiblity of RainTomorrow is: ",probability)
else:
    print("Possibility of Not_Rain_Tomorrow is: ",probability)

### Saving Model 

In [None]:
import joblib
# Bundle the selected model with the fitted preprocessing objects and the
# column lists, so everything needed for a prediction is stored in one file.
australia_rain = dict(
    model=model,
    imputer=imputer,
    scaler=scaler,
    encoder=encoder,
    input_cols=input_cols,
    target_col=target_col,
    numeric_cols=numeric_cols,
    categorical_cols=categorical_cols,
    encoded_cols=encoded_cols,
)
joblib.dump(australia_rain, 'australia_rain.joblib')