In [3]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
%matplotlib inline


#Read data file into a dataframe
df = pd.read_csv("weatherHistory.csv")
df = df.dropna()

#Drop unwanted columns
df = df.drop(['Formatted Date', 'Wind Speed (km/h)', 'Wind Bearing (degrees)', 'Visibility (km)', 'Loud Cover', 'Apparent Temperature (C)', 'Daily Summary', 'Precip Type'], axis=1)
df


Unnamed: 0,Summary,Temperature (C),Humidity,Pressure (millibars)
0,Partly Cloudy,9.472222,0.89,1015.13
1,Partly Cloudy,9.355556,0.86,1015.63
2,Mostly Cloudy,9.377778,0.89,1015.94
3,Partly Cloudy,8.288889,0.83,1016.41
4,Mostly Cloudy,8.755556,0.83,1016.51
...,...,...,...,...
96448,Partly Cloudy,26.016667,0.43,1014.36
96449,Partly Cloudy,24.583333,0.48,1015.16
96450,Partly Cloudy,22.038889,0.56,1015.66
96451,Partly Cloudy,21.522222,0.60,1015.95


In [4]:
# Select the features (temperature, humidity, and pressure) and target (precip type) columns
X = df[['Temperature (C)', 'Humidity', 'Pressure (millibars)']]
y = df['Summary']

# Encode the categorical column using LabelEncoder
enc = LabelEncoder()
y_enc = enc.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.2, random_state=42)

# Create a Logistic Regression model and train it on the training data
clf = LogisticRegression(solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)

# Predict the classes of the test set using the trained model
y_pred_enc = clf.predict(X_test)

# Decode the predicted classes back to their original categories
y_pred = enc.inverse_transform(y_pred_enc)

# Evaluate the performance of the model using a confusion matrix and accuracy score
cm = confusion_matrix(y_test, y_pred_enc)
acc = accuracy_score(y_test, y_pred_enc)
print('Confusion Matrix:\n', cm)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(acc))





Confusion Matrix:
 [[   0    0    0    0    0    0    0    0    0    0    1    0    0    0
     0    2    2    4    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0    3    0    0    0
     0    0    2    1    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0    3    0    0    0
     0   46    5   41    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0    4    0    0    0
     0   69   14   21    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0    1    0    0    0
     0   36    0   40    0    0    0    0    0]
 [   0    0    0    0    0   18    0    0    0    0  170    0    0    0
     0  610  193 1170    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    6    0    2    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    7    0    0    0    0    0]
 [   0    0    0    0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
import pickle

pickle.dump(clf, open('model.pkl', 'wb'))