In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pickle
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")


def selectkbest(indep_x,dep_y,n):
    test=SelectKBest(score_func=chi2,k=n)
    fit1=test.fit(indep_x,dep_y)
    selectk_features=fit1.transform(indep_x)
    return selectk_features

def split_scalar(indep_x,dep_y):
    x_train,x_test,y_train,y_test=train_test_split(indep_x,dep_y,test_size=0.25,random_state=0)
    sc=StandardScaler()
    x_train=sc.fit_transform(x_train)
    x_test=sc.transform(x_test)
    return x_train,x_test,y_train,y_test

def cm_prediction(classifier,x_test):
    y_pred = classifier.predict(x_test)
    # making the confusion matrix
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_test,y_pred)
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import classification_report
    #from sklearn.metrics import confusion matrix
    #cm = confusion_matrix(y_test,y_pred)
    Accuracy = accuracy_score(y_test,y_pred)
    report = classification_report(y_test,y_pred)
    return classifier,Accuracy,report,x_test,y_test,cm

def logistic(x_train,y_train,x_test):
    # Fitting K-NN to the Training set
    from sklearn.linear_model import LogisticRegression
    classifier = LogisticRegression(random_state=0)
    classifier.fit(x_train,y_train)
    classifier,Accuracy,report,x_test,y_test,cm=cm_prediction(classifier,x_test)
    return classifier,Accuracy,report,x_test,y_test,cm

def svm_linear(x_train,y_train,x_test):
    from sklearn.svm import SVC
    classifier = SVC(kernel = 'linear', random_state=0)
    classifier.fit(x_train,y_train)
    classifier,Accuracy,report,x_test,y_test,cm=cm_prediction(classifier,x_test)
    return classifier,Accuracy,report,x_test,y_test,cm

def svm_NL(x_train,y_train,x_test):
    from sklearn.svm import SVC
    classifier = SVC(kernel = 'rbf', random_state=0)
    classifier.fit(x_train,y_train)
    classifier,Accuracy,report,x_test,y_test,cm=cm_prediction(classifier,x_test)
    return classifier,Accuracy,report,x_test,y_test,cm

def Naive(x_train,y_train,x_test):
    #Fitting K-NN to the Traing set
    from sklearn.naive_bayes import GaussianNB
    classifier = GaussianNB()
    classifier.fit(x_train,y_train)
    classifier,Accuracy,report,x_test,y_test,cm=cm_prediction(classifier,x_test)
    return classifier,Accuracy,report,x_test,y_test,cm

def knn(x_train,y_train,x_test):
    #Fitting K-NN to the Traing set
    from sklearn.neighbors import KNeighborsClassifier
    classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski',p=2)
    classifier.fit(x_train,y_train)
    classifier,Accuracy,report,x_test,y_test,cm=cm_prediction(classifier,x_test)
    return classifier,Accuracy,report,x_test,y_test,cm

def Decision(x_train,y_train,x_test):
    from sklearn.tree import DecisionTreeClassifier
    classifier = DecisionTreeClassifier(criterion = 'entropy', random_state=0)
    classifier.fit(x_train,y_train)
    classifier,Accuracy,report,x_test,y_test,cm=cm_prediction(classifier,x_test)
    return classifier,Accuracy,report,x_test,y_test,cm

def random(x_train,y_train,x_test):
    from sklearn.ensemble import RandomForestClassifier
    classifier = RandomForestClassifier(n_estimators = 10, criterion= 'entropy' ,random_state=0)
    classifier.fit(x_train,y_train)
    classifier,Accuracy,report,x_test,y_test,cm=cm_prediction(classifier,x_test)
    return classifier,Accuracy,report,x_test,y_test,cm

def select_classification(acclog,accvml,accsvmnl,accknn,accnav,accdes,accrf):

    dataframe = pd.DataFrame(index=['ChiSquar'],columns=['Logistic','SVMl','SVMnl','KNN','Naive','Decision','Random'])
    for number,idex in enumerate(dataframe.index):
        dataframe['Logistic'][idex]=acclog[number]
        dataframe['SVMl'][idex]=accsvml[number]
        dataframe['SVMnl'][idex]=accsvmnl[number]
        dataframe['KNN'][idex]=accknn[number]
        dataframe['Naive'][idex]=accnav[number]
        dataframe['Decision'][idex]=accdes[number]
        dataframe['Random'][idex]=accrf[number]
    return dataframe

In [2]:
dataset1 = pd.read_csv("preprocessed_Rainfall_dataset.csv",index_col=None)
df2=dataset1
df2=pd.get_dummies(df2,dtype=int,drop_first=True)
indep_x=df2.drop("rainfall_yes",axis=1)
dep_y=df2["rainfall_yes"]

In [5]:
#df2.isnull().sum()

In [7]:
df2

Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall_yes
0,1,1025.9,19.9,18.3,16.8,13.1,72,49,9.3,80.0,26.3,1
1,2,1022.0,21.7,18.9,17.2,15.6,81,83,0.6,50.0,15.3,1
2,3,1019.7,20.3,19.3,18.0,18.4,95,91,0.0,40.0,14.2,1
3,4,1018.9,22.3,20.6,19.1,18.8,90,88,1.0,50.0,16.9,1
4,5,1015.9,21.3,20.7,20.2,19.9,95,81,0.0,40.0,13.7,1
...,...,...,...,...,...,...,...,...,...,...,...,...
361,27,1022.7,18.8,17.7,16.9,15.0,84,90,0.0,30.0,18.4,1
362,28,1026.6,18.6,17.3,16.3,12.8,75,85,1.0,20.0,25.9,1
363,29,1025.9,18.9,17.7,16.4,13.3,75,78,4.6,70.0,33.4,1
364,30,1025.3,19.2,17.3,15.2,13.3,78,86,1.2,20.0,20.9,1


In [69]:
from sklearn.feature_selection import SelectKBest, chi2

# Split into features and target
indep_x = df2.drop('rainfall_yes', axis=1)
dep_y = df2['rainfall_yes']

# Ensure all values in indep_x are non-negative (chi2 requires this)
indep_x[indep_x < 0] = 0

# Apply SelectKBest
def selectkbest(indep_x, dep_y, n):
    test = SelectKBest(score_func=chi2, k=n)
    fit1 = test.fit(indep_x, dep_y)
    select_features = fit1.transform(indep_x)
    return select_features

# Now call the function
kbest = selectkbest(indep_x, dep_y, 6)

In [71]:
kbest

array([[19.9, 13.1, 72. , 49. ,  9.3, 26.3],
       [21.7, 15.6, 81. , 83. ,  0.6, 15.3],
       [20.3, 18.4, 95. , 91. ,  0. , 14.2],
       ...,
       [18.9, 13.3, 75. , 78. ,  4.6, 33.4],
       [19.2, 13.3, 78. , 86. ,  1.2, 20.9],
       [20.5, 13. , 74. , 66. ,  5.7, 23.3]])

In [73]:
# After fitting the SelectKBest object:
test = SelectKBest(score_func=chi2, k=6)
fit1 = test.fit(indep_x, dep_y)

# Get the mask of selected features
selected_mask = fit1.get_support()

# Get column names of selected features
selected_features = indep_x.columns[selected_mask]
print(selected_features)

Index(['maxtemp', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'windspeed'], dtype='object')


In [75]:
# Initialize accuracy lists
acclog = []
accsvml = []
accsvmnl = []
accknn = []
accnav = []
accdes = []
accrf = []

In [77]:
x_train,x_test,y_train,y_test = split_scalar(kbest,dep_y)

classifier,Accuracy,report,x_test,y_test,cm=logistic(x_train,y_train,x_test)
acclog.append(Accuracy)

classifier,Accuracy,report,x_test,y_test,cm=svm_linear(x_train,y_train,x_test)
accsvml.append(Accuracy)

classifier,Accuracy,report,x_test,y_test,cm=svm_NL(x_train,y_train,x_test)
accsvmnl.append(Accuracy)

classifier,Accuracy,report,x_test,y_test,cm=knn(x_train,y_train,x_test)
accknn.append(Accuracy)

classifier,Accuracy,report,x_test,y_test,cm=Naive(x_train,y_train,x_test)
accnav.append(Accuracy)

classifier,Accuracy,report,x_test,y_test,cm=Decision(x_train,y_train,x_test)
accdes.append(Accuracy)

classifier,Accuracy,report,x_test,y_test,cm=random(x_train,y_train,x_test)
accrf.append(Accuracy)

result = select_classification(acclog,accsvml,accsvmnl,accknn,accnav,accdes,accrf)

In [79]:
result
#6

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Naive,Decision,Random
ChiSquar,0.815217,0.847826,0.804348,0.782609,0.793478,0.706522,0.880435


In [43]:
result
#5

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Naive,Decision,Random
ChiSquar,0.815217,0.847826,0.804348,0.782609,0.804348,0.695652,0.836957


In [55]:
result
#4

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Naive,Decision,Random
ChiSquar,0.836957,0.847826,0.826087,0.804348,0.782609,0.706522,0.826087


In [67]:
result
#3

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Naive,Decision,Random
ChiSquar,0.836957,0.847826,0.836957,0.847826,0.793478,0.728261,0.815217


In [21]:
import pickle

In [23]:
filename="finalaized_model_RandomForestClassifier.sav"

In [25]:
pickle.dump(classifier,open(filename,'wb'))

In [27]:
loaded_model=pickle.load(open("finalaized_model_RandomForestClassifier.sav",'rb'))

In [29]:
result = loaded_model.predict([[11.0,34.2,25.0,45,67,88]])

In [31]:
result

array([1])