In [13]:
# Load libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.naive_bayes import GaussianNB
import numpy as np

In [25]:
weatherAUS_data = pd.read_csv('C:/Users/Gangesh/Desktop/weatherAUS.csv')

# Dropping all Null value records
#weatherAUS_data = weatherAUS_data.dropna()
weatherAUS_data = weatherAUS_data.dropna(subset = ['RainTomorrow'])

weatherAUS_data = weatherAUS_data[weatherAUS_data['Rainfall'] <= 3.2] 
weatherAUS_data = weatherAUS_data[weatherAUS_data['Evaporation'] <= 22] 
weatherAUS_data = weatherAUS_data[weatherAUS_data['WindSpeed9am'] <= 55] 
weatherAUS_data = weatherAUS_data[weatherAUS_data['WindSpeed3pm'] <= 57] 


# Converting string values to Integers for processing
columns_to_be_converted = ["Location", "WindGustDir", "WindDir9am", "WindDir3pm", "RainToday", "RainTomorrow"]
column_int_dicts = dict()

for i in columns_to_be_converted:
    this_column_dict = dict()
    column_values = list(weatherAUS_data[i].unique())
    for j in range(len(column_values)):
        this_column_dict[column_values[j]] = j
    column_int_dicts[i] = this_column_dict

# Converting string values to Int values
for i in columns_to_be_converted:
    weatherAUS_data[i+"_Int"] = weatherAUS_data[i].apply(lambda x: column_int_dicts[i][x])

columns_to_be_considered_in_x = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'Location_Int', 'WindGustDir_Int', 'WindDir9am_Int', 'WindDir3pm_Int', 'RainToday_Int']

# X parameters
df_x = weatherAUS_data[columns_to_be_considered_in_x]

# Y target values
df_y = weatherAUS_data[['RainTomorrow_Int']]



In [26]:
continuous_cols = list(df_x.select_dtypes(include=['float64']).columns)
df_x[continuous_cols].isna().sum()

MinTemp           54
MaxTemp           39
Rainfall           0
Evaporation        0
Sunshine        8892
WindSpeed9am       0
WindSpeed3pm       0
Humidity9am      376
Humidity3pm      960
Pressure9am       94
Pressure3pm       79
Cloud9am        7850
Cloud3pm        9806
Temp9am           61
Temp3pm          596
dtype: int64

In [27]:
# Converting data parameters and target to numpy arrays
#X = df_x.to_numpy()
#y = df_y.to_numpy().ravel()

# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size = 0.2, random_state=511)

In [28]:
for column in continuous_cols:
    X_train[column].fillna(X_train[column].median(), inplace = True)
    X_test[column].fillna(X_test[column].median(), inplace = True)
    
# Checking missing values 
X_train.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


MinTemp            0
MaxTemp            0
Rainfall           0
Evaporation        0
Sunshine           0
WindSpeed9am       0
WindSpeed3pm       0
Humidity9am        0
Humidity3pm        0
Pressure9am        0
Pressure3pm        0
Cloud9am           0
Cloud3pm           0
Temp9am            0
Temp3pm            0
Location_Int       0
WindGustDir_Int    0
WindDir9am_Int     0
WindDir3pm_Int     0
RainToday_Int      0
dtype: int64

In [23]:
 #Removing outliers in certain continous columns
def upper_outlier(df, variable, top):
    return np.where(df[variable]>top, top, df[variable])
def lower_outlier(df, variable, bot):
    return np.where(df[variable]<bot, bot, df[variable])

for X_df in [X_train, X_test]:
    X_df['Rainfall'] = upper_outlier(X_df, 'Rainfall', 3.2)
    X_df['Rainfall'] = lower_outlier(X_df, 'Rainfall', -2.4)
    
    X_df['Evaporation'] = upper_outlier(X_df, 'Evaporation', 21.8)
    X_df['Evaporation'] = lower_outlier(X_df, 'Evaporation', -11.8)

    X_df['WindSpeed9am'] = upper_outlier(X_df, 'WindSpeed9am', 55.0)
    X_df['WindSpeed9am'] = lower_outlier(X_df, 'WindSpeed9am', -29.0)

    X_df['WindSpeed3pm'] = upper_outlier(X_df, 'WindSpeed3pm', 57.0)
    X_df['WindSpeed3pm'] = lower_outlier(X_df, 'WindSpeed3pm', -20.0)

X_train.describe().apply(round)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead



Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,Location_Int,WindGustDir_Int,WindDir9am_Int,WindDir3pm_Int,RainToday_Int
count,113754.0,113754.0,113754.0,113754.0,113754.0,113754.0,113754.0,113754.0,113754.0,113754.0,113754.0,113754.0,113754.0,113754.0,113754.0,113754.0,113754.0,113754.0,113754.0,113754.0
mean,12.0,23.0,1.0,5.0,8.0,14.0,19.0,69.0,52.0,1018.0,1015.0,5.0,5.0,17.0,22.0,24.0,8.0,8.0,8.0,0.0
std,6.0,7.0,1.0,3.0,3.0,9.0,9.0,19.0,21.0,7.0,7.0,2.0,2.0,6.0,7.0,14.0,5.0,5.0,5.0,0.0
min,-8.0,-5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,982.0,977.0,0.0,0.0,-7.0,-5.0,0.0,0.0,0.0,0.0,0.0
25%,8.0,18.0,0.0,4.0,8.0,7.0,13.0,57.0,37.0,1014.0,1011.0,3.0,4.0,12.0,17.0,11.0,4.0,4.0,4.0,0.0
50%,12.0,23.0,0.0,5.0,8.0,13.0,19.0,70.0,52.0,1018.0,1015.0,5.0,5.0,17.0,21.0,23.0,8.0,8.0,8.0,0.0
75%,17.0,28.0,1.0,5.0,9.0,19.0,24.0,83.0,65.0,1022.0,1019.0,6.0,6.0,22.0,26.0,36.0,12.0,12.0,12.0,0.0
max,34.0,48.0,3.0,22.0,14.0,55.0,57.0,100.0,100.0,1041.0,1040.0,9.0,9.0,40.0,47.0,48.0,16.0,16.0,16.0,2.0


In [29]:
# Scaling using MinMax Scaler
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_train = pd.DataFrame(X_train, columns=[columns_to_be_considered_in_x])

X_test = scaler.transform(X_test)
X_test = pd.DataFrame(X_test, columns=[columns_to_be_considered_in_x])

X_train.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,Location_Int,WindGustDir_Int,WindDir9am_Int,WindDir3pm_Int,RainToday_Int
count,54815.0,54815.0,54815.0,54815.0,54815.0,54815.0,54815.0,54815.0,54815.0,54815.0,54815.0,54815.0,54815.0,54815.0,54815.0,54815.0,54815.0,54815.0,54815.0,54815.0
mean,0.496992,0.473205,0.080073,0.251021,0.56745,0.266458,0.332179,0.646053,0.474166,0.599163,0.619038,0.491654,0.453661,0.468859,0.44887,0.506464,0.489453,0.474578,0.469179,0.092183
std,0.153391,0.164066,0.196267,0.159725,0.239652,0.155336,0.14613,0.18509,0.196232,0.119152,0.110802,0.332168,0.275061,0.161823,0.159932,0.291225,0.296422,0.297125,0.287352,0.289286
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.384248,0.344262,0.0,0.127273,0.433566,0.166667,0.22807,0.54,0.33,0.519231,0.542071,0.125,0.222222,0.344828,0.325472,0.25,0.25,0.25,0.25,0.0
50%,0.48926,0.461358,0.0,0.227273,0.622378,0.240741,0.333333,0.66,0.48,0.596154,0.616505,0.5,0.444444,0.458128,0.436321,0.53125,0.5,0.4375,0.4375,0.0
75%,0.610979,0.59719,0.0,0.345455,0.741259,0.351852,0.421053,0.77,0.61,0.678322,0.694175,0.875,0.777778,0.58867,0.566038,0.75,0.75,0.6875,0.6875,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [30]:
#Create a Gaussian Classifier
model = GaussianNB()

# Train the model using the training sets
model.fit(X_train,y_train)

#Predict the response for test dataset
y_pred=model.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.800642148277875


  y = column_or_1d(y, warn=True)


In [7]:
model.predict([[22.7, 33.1, 0., 5.8, 8.7, 7., 17., 76., 65., 1017.9, 950., 15., 7., 26.9, 29., 28., 14., 4., 2., 0.]])[0]
# 0 means Prediction of rain tomorrow is No
# 1 means Prediction of rain tomorrow is Yes -> 'RainTomorrow': {'No': 0, 'Yes': 1}

1

In [31]:
# Source - https://www.datacamp.com/community/tutorials/naive-bayes-scikit-learn
print(model.score(X_train,y_train))

0.800291890905774
