In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import GridSearchCV
#from scipy import stats
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from xgboost import XGBClassifier
#import statsmodels.api as sm


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def dframe(df, frac=0.25, random_state=42):
    """Load a CSV file, keep a random sample of its rows and sort them by date.

    Parameters
    ----------
    df : str or path-like
        Location of the CSV file. The parameter keeps its historical name so
        existing ``dframe(df=...)`` calls continue to work, even though it is
        a path rather than a DataFrame.
    frac : float, default 0.25
        Fraction of the rows to keep.
    random_state : int or None, default 42
        Seed for the row sampling. A fixed seed makes every downstream result
        reproducible across kernel restarts; pass ``None`` to draw a different
        sample on each call.

    Returns
    -------
    pandas.DataFrame
        The sampled rows ordered by the ``'Date'`` column. The row labels of
        the CSV are preserved (the index is not reset).
    """
    sampled = pd.read_csv(df).sample(frac=frac, random_state=random_state)
    return sampled.sort_values(by='Date')
# Load the sampled, date-sorted weather observations.
df = dframe(df='../input/weather-dataset-rattle-package/weatherAUS.csv')
# Preview the first rows as the cell's rich output. display() returns None,
# so wrapping it in print() would emit a stray "None" line under the table.
df.head(5)

In [None]:
# Count missing values per column and keep the 23 columns with the most gaps
# (presumably every column of the dataset -- TODO confirm the column count).
null_counts = df.isna().sum()
miss = null_counts.sort_values(ascending=False).head(23)
# Express the same counts as a percentage of the number of rows.
miss_per = miss / len(df) * 100
pd.DataFrame({'No missing values': miss, '% of missing data': miss_per.values})

In [None]:
# Remove columns that are not used as model inputs.
# NOTE(review): Evaporation and Sunshine are presumably dropped because of
# their high share of missing values -- confirm against the summary table.
# Rebinding `df` with errors='ignore' keeps this cell idempotent: re-running
# it after the columns are already gone no longer raises a KeyError.
df = df.drop(columns=['Date', 'Location', 'Evaporation', 'Sunshine'], errors='ignore')

In [None]:
# One-hot encode the categorical columns (dropping the first level of each)
# and remove RainToday so that it is not part of the feature matrix.
# NOTE(review): get_dummies encodes a missing category as an all-zero row
# rather than NaN, so these columns are not touched by any later imputation.
categorical_cols = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainTomorrow']
X = (
    pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)
    .drop(columns=['RainToday'])
)

In [None]:
# Fill the remaining missing feature values with a model-based imputer and
# wrap the resulting array back into a DataFrame with the original labels.
# NOTE(review): the imputer is fitted on all rows before the train/test
# split, so held-out rows influence the imputed training values.
imputer = IterativeImputer(random_state=0)
imputer.fit(X)
X = pd.DataFrame(imputer.transform(X), index=X.index, columns=X.columns)

In [None]:
# Build the binary classification target from RainToday: 1 = 'Yes', 0 = 'No'.
# Rows with a missing label are dropped rather than imputed. One-hot encoding
# a missing label yields an all-zero row (no NaN), so an imputer has nothing
# to fill and those rows would silently be trained on as a hidden
# "neither Yes nor No" class.
y = df['RainToday'].dropna().eq('Yes').astype(int)

# Keep the feature matrix row-aligned with the target. Selecting by the
# target's labels is idempotent, so re-running this cell is safe.
X = X.loc[y.index]
assert X.index.equals(y.index), "features and target must cover the same rows"

In [None]:
# Hold out 20% of the rows for evaluation; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline decision tree with default hyperparameters and a fixed seed.
classifier_C = DecisionTreeClassifier(random_state=42)
classifier_C.fit(X_train, y_train)
classifier_C

In [None]:
# Inspect two fitted attributes of the baseline tree: the number of features
# considered per split and the importance assigned to each feature.
(classifier_C.max_features_, classifier_C.feature_importances_)

In [None]:
# Label each importance with its feature name and keep the largest ones
# (nlargest() without an argument keeps the top 5).
top_importances = pd.Series(
    classifier_C.feature_importances_,
    index=X_train.columns,
).nlargest()
# F_import holds the matplotlib Axes returned by the horizontal bar plot.
F_import = top_importances.plot(kind='barh')
plt.show()

In [None]:
# Report how large the unconstrained baseline tree has grown.
tree_depth = classifier_C.get_depth()
leaf_count = classifier_C.get_n_leaves()
f'Max depth:{tree_depth} Decision tree number of leaves:{leaf_count}'

In [None]:
# Capture the full hyperparameter dictionary of the baseline tree and show it.
C_params = classifier_C.get_params(deep=True)
C_params

In [None]:
# Compare the baseline tree's score on the training rows with its score on
# the held-out rows; a large gap indicates overfitting.
train_score = classifier_C.score(X_train, y_train)
test_score = classifier_C.score(X_test, y_test)
train_score, test_score

In [None]:
# Hyperparameter grid for the decision tree: 10 depths x 38 split thresholds.
classifier_grid = {
    'max_depth': range(1, 11),
    'min_samples_split': range(2, 40),
}
# Seed the estimator like the baseline tree so that the cross-validated
# scores, and therefore the selected parameters, are reproducible.
classifier1 = DecisionTreeClassifier(random_state=42)

In [None]:
# Exhaustive 10-fold cross-validated search over the tree grid, using every
# available core and printing progress.
classifier_CV = GridSearchCV(
    estimator=classifier1,
    param_grid=classifier_grid,
    cv=10,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the search on the training rows. fit() returns the search object
# itself, so both names refer to the same fitted instance.
classifier_CV.fit(X_train, y_train)
classifier_CV_model = classifier_CV


In [None]:
# Parameter combination with the best mean cross-validation score.
best_tree_params = classifier_CV_model.best_params_
best_tree_params

In [None]:
# Refit a tree with the hyperparameters selected by the grid search. Reading
# them from best_params_ (instead of hard-coding values copied from an
# earlier run) keeps this cell in sync with whatever the search actually
# found; the seed makes the refit reproducible.
classifier1 = DecisionTreeClassifier(
    random_state=42,
    **classifier_CV_model.best_params_,
).fit(X_train, y_train)
# Training score vs. held-out score of the tuned tree.
classifier1.score(X_train, y_train), classifier1.score(X_test, y_test)

In [None]:
# Predict the held-out rows with the tuned tree and measure the accuracy.
y_pred = classifier1.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

# **Random Forest**

In [None]:
# Recreate the same 80/20 split (same seed) so this section can be run
# independently of the decision-tree section.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# The target is a class label, so the baseline forest is a classifier. A
# regressor's score() is R^2, which cannot be compared with the accuracies
# reported for the decision tree and for the tuned forest.
rf_model = RandomForestClassifier(random_state=42)
rf_model.get_params()

In [None]:
# Train the baseline random forest on the training rows; fit() returns the
# estimator, which is shown as the cell output.
fitted_forest = rf_model.fit(X_train, y_train)
fitted_forest

In [None]:
# Baseline forest predictions for the training rows and the held-out rows.
y_pred_train = rf_model.predict(X_train)
y_pred_test = rf_model.predict(X_test)

In [None]:
# Baseline forest score() on the training rows vs. the held-out rows.
rf_train_score = rf_model.score(X_train, y_train)
rf_test_score = rf_model.score(X_test, y_test)
rf_train_score, rf_test_score

# **Model Tuning**

In [None]:
# Random-forest search space: 9 depths x 5 feature counts x 4 forest sizes
# (180 combinations).
prms = dict(
    max_depth=list(range(1, 10)),
    max_features=[3, 5, 7, 9, 11],
    n_estimators=[100, 200, 500, 1000],
)

In [None]:
# 5-fold cross-validated grid search over the random-forest search space,
# run in parallel on all cores.
rf_model = RandomForestClassifier(random_state=42)
rf_search = GridSearchCV(estimator=rf_model, param_grid=prms, cv=5, n_jobs=-1)
rf_cv_model = rf_search.fit(X_train, y_train)

In [None]:
# Best random-forest parameter combination found by the search.
rf_best_params = rf_cv_model.best_params_
rf_best_params

In [None]:
# Build the tuned forest from the parameters the grid search selected rather
# than from hard-coded values, so it always reflects the search that was
# actually run on the current data sample.
rf_tuned = RandomForestClassifier(random_state=42, **rf_cv_model.best_params_)
rf_tuned.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the tuned forest and measure the accuracy.
y_pred_cv = rf_tuned.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred_cv)