In [1]:
# Import Libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Create a Spark Session
spark = SparkSession.builder.appName("LOGISTIC").getOrCreate()

In [3]:
# Read the CSV file into a Pandas DataFrame
df = pd.read_csv('weather_august.csv')
# Create a Pandas Dataframe
aug_df = spark.createDataFrame(df)
# Convert the PySpark DataFrame to a Pandas DataFrame
aug_df_pd = aug_df.toPandas()

In [4]:
# Transform 'Precipitation' column into qualitative variable
aug_df_pd['Precipitation'] = df['Precipitation'].apply(lambda x: 1 if x > 0 else 0)

# Show the Dataframe
aug_df_pd.head()

Unnamed: 0,Precipitation,Global_Radiation,Avarage_Atmospheric_Pressure,Avarage_Temperature,Avarage_Dew_Temperature,Avarage_Relative_Humidity,Avarage_Wind_Speed
0,0,0.1,929.3,12.15,12.1,100.0,2.7
1,0,5.0,929.75,12.35,12.35,100.0,2.45
2,0,135.8,930.2,12.55,12.55,100.0,2.5
3,0,488.0,930.4,13.35,13.35,100.0,2.9
4,0,1631.1,930.6,15.5,13.65,89.5,3.05


In [5]:
# Table of absolute frequencies of the 'Precipitation' variable
aug_df_pd['Precipitation'].value_counts()

0    349
1     26
Name: Precipitation, dtype: int64

In [6]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Separate features (X) and target variable (y)
X = aug_df_pd.drop('Precipitation', axis=1)
y = aug_df_pd['Precipitation']

# Encode the 'Precipitation' column to numeric values (0 for 'No', 1 for 'Yes')
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Perform k-fold cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
kfold_scores_rf = scores.mean()

In [7]:
# XGBoost
import xgboost as xgb

# Assuming your data is in a DataFrame named 'df'
# Separate features (X) and target variable (y)
X = aug_df_pd.drop('Precipitation', axis=1)
y = aug_df_pd['Precipitation']

# Encode the 'Precipitation' column to numeric values (0 for 'No', 1 for 'Yes')
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBoost classifier
model = xgb.XGBClassifier(random_state=42)

# Perform k-fold cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy_xg = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
kfold_scores_xg = scores.mean()

In [9]:
# Neural Network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Assuming your data is in a DataFrame named 'df'
# Separate features (X) and target variable (y)
X = aug_df_pd.drop('Precipitation', axis=1)
y = aug_df_pd['Precipitation']

# Encode the 'Precipitation' column to numeric values (0 for 'No', 1 for 'Yes')
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize a Sequential neural network model
model = Sequential()

# Add input layer and hidden layers
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))

# Add output layer with sigmoid activation for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model on the training data
model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)

# Make predictions on the testing data
y_pred = (model.predict(X_test) > 0.5).astype(int)

# Evaluate the model
accuracy_nn = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
# Separate features (X) and target variable (y)
X = aug_df_pd.drop('Precipitation', axis=1)
y = aug_df_pd['Precipitation']

# Encode the 'Precipitation' column to numeric values (0 for 'No', 1 for 'Yes')
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Define the neural network model as a function
def create_nn_model():
    model = Sequential()
    model.add(Dense(64, input_dim=X.shape[1], activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Initialize a StratifiedKFold object for cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize an empty list to store cross-validated accuracy scores
cv_scores = []

# Perform k-fold cross-validation
for train_index, test_index in kfold.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Create a new neural network model for each fold
    model = create_nn_model()
    
    # Train the model on the training data for this fold
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)
    
    # Make predictions on the testing data for this fold
    y_pred = (model.predict(X_test) > 0.5).astype(int)
    
    # Calculate and store the accuracy for this fold
    fold_accuracy = accuracy_score(y_test, y_pred)
    cv_scores.append(fold_accuracy)

# Calculate the mean using numpy
kfold_scores_nn = np.mean(cv_scores)



AttributeError: 'list' object has no attribute 'mean'

In [None]:
# Create a dictionary with the variables
data = {
    'Model': ['Random Forest', 'XGBoost', 'Neural Network'],
    'Accuracy': [accuracy_rf, accuracy_xg, accuracy_nn].
    'K-fold': [kfold_scores_rf, kfold_scores_xg, kfold_scores_nn]
}

# Create a DataFrame from the dictionary
evaluate_dt = pd.DataFrame(data)

evaluate_dt

In [None]:
cv_scores