In [2]:
# Import libs
import pandas as pd
from sklearn.model_selection import train_test_split
from datetime import date
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import cross_val_score, cross_val_predict

In [3]:
# Read the Seattle daily weather observations into a DataFrame
csv_path = 'seattleWeather_1948-2017.csv'
df = pd.read_csv(csv_path)

In [4]:
# Print column dtypes and non-null counts, then render summary statistics
# of the numeric columns as the cell output.
df.info()
numeric_summary = df.describe()
numeric_summary

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25551 entries, 0 to 25550
Data columns (total 5 columns):
DATE    25551 non-null object
PRCP    25548 non-null float64
TMAX    25551 non-null int64
TMIN    25551 non-null int64
RAIN    25548 non-null object
dtypes: float64(1), int64(2), object(2)
memory usage: 998.2+ KB


Unnamed: 0,PRCP,TMAX,TMIN
count,25548.0,25551.0,25551.0
mean,0.106222,59.544206,44.514226
std,0.239031,12.772984,8.892836
min,0.0,4.0,0.0
25%,0.0,50.0,38.0
50%,0.0,58.0,45.0
75%,0.1,69.0,52.0
max,5.02,103.0,71.0


In [5]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN
0,1948-01-01,0.47,51,42,True
1,1948-01-02,0.59,45,36,True
2,1948-01-03,0.42,45,35,True
3,1948-01-04,0.31,45,34,True
4,1948-01-05,0.17,45,32,True


In [6]:
# Count missing values per column
df.isna().sum()

DATE    0
PRCP    3
TMAX    0
TMIN    0
RAIN    3
dtype: int64

In [7]:
# Remove rows where RAIN is null.
# .copy() makes the result an independent frame, so the column assignments
# performed on `df` afterwards do not trigger SettingWithCopyWarning by
# writing to a filtered slice of the originally loaded data.
df = df.dropna(subset=['RAIN']).copy()

# Re-check: every column should now report zero missing values
df.isnull().sum()

DATE    0
PRCP    0
TMAX    0
TMIN    0
RAIN    0
dtype: int64

In [8]:
# Derived feature: daily temperature range (TMAX minus TMIN)
df['TDELTA'] = df['TMAX'].sub(df['TMIN'])

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN,TDELTA
0,1948-01-01,0.47,51,42,True,9
1,1948-01-02,0.59,45,36,True,9
2,1948-01-03,0.42,45,35,True,10
3,1948-01-04,0.31,45,34,True,11
4,1948-01-05,0.17,45,32,True,13


In [9]:
# Derived feature: rolling 3-row precipitation total. The window covers the
# current row plus the two rows before it, so the first two rows are NaN.
df['PRCP3D'] = df['PRCP'].rolling(window=3).sum()

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN,TDELTA,PRCP3D
0,1948-01-01,0.47,51,42,True,9,
1,1948-01-02,0.59,45,36,True,9,
2,1948-01-03,0.42,45,35,True,10,1.48
3,1948-01-04,0.31,45,34,True,11,1.32
4,1948-01-05,0.17,45,32,True,13,0.9


In [8]:
# Convert DATE column from string to datetime
df['DATE'] = pd.to_datetime(df['DATE'])

# Convert RAIN column to boolean.
# A plain .astype(bool) is only correct when the object column already holds
# real bools: any non-empty string -- including 'False' -- is truthy and would
# silently become True. Normalise every value to lower-case text first so that
# both bool objects and 'True'/'False' strings are handled, then map explicitly.
rain_text = df['RAIN'].astype(str).str.strip().str.lower()
rain_flags = rain_text.map({'true': True, 'false': False})

# Anything that is neither true nor false maps to NaN; fail loudly instead of
# letting astype(bool) turn it into True.
if rain_flags.isnull().any():
    unexpected = df.loc[rain_flags.isnull(), 'RAIN'].unique()
    raise ValueError('Unexpected RAIN values: {!r}'.format(unexpected))

df['RAIN'] = rain_flags.astype(bool)

In [9]:
# Create TRAIN data set: 1950-01-01 (inclusive) up to 2010-01-01 (exclusive).
# The bounds are pd.Timestamp rather than datetime.date because ordering
# comparisons between a datetime64 column and a datetime.date object raise
# TypeError in recent pandas versions.
train_start = pd.Timestamp('1950-01-01')
train_end = pd.Timestamp('2010-01-01')
train_df = df[(df['DATE'] >= train_start) & (df['DATE'] < train_end)]
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21912 entries, 731 to 22645
Data columns (total 6 columns):
DATE      21912 non-null datetime64[ns]
PRCP      21912 non-null float64
TMAX      21912 non-null int64
TMIN      21912 non-null int64
RAIN      21912 non-null bool
TDELTA    21912 non-null int64
dtypes: bool(1), datetime64[ns](1), float64(1), int64(3)
memory usage: 1.0 MB


In [10]:
# Create VALIDATION data set: 2010-01-01 (inclusive) up to 2018-01-01 (exclusive).
# The bounds are pd.Timestamp rather than datetime.date because ordering
# comparisons between a datetime64 column and a datetime.date object raise
# TypeError in recent pandas versions.
validation_start = pd.Timestamp('2010-01-01')
validation_end = pd.Timestamp('2018-01-01')
validation_df = df[(df['DATE'] >= validation_start) & (df['DATE'] < validation_end)]
validation_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2905 entries, 22646 to 25550
Data columns (total 6 columns):
DATE      2905 non-null datetime64[ns]
PRCP      2905 non-null float64
TMAX      2905 non-null int64
TMIN      2905 non-null int64
RAIN      2905 non-null bool
TDELTA    2905 non-null int64
dtypes: bool(1), datetime64[ns](1), float64(1), int64(3)
memory usage: 139.0 KB


In [25]:
# Predictor columns shared by the training and validation frames.
# NOTE(review): RAIN looks like it is derived from same-day PRCP (PRCP > 0);
# if so, keeping PRCP as a feature leaks the target -- confirm before relying
# on the reported accuracy.
feature_cols = ['PRCP', 'TMAX', 'TMIN', 'TDELTA']

# Hold-out validation period: features and target
X_validation = validation_df[feature_cols]
y_validation = validation_df['RAIN']

# Training period: features and target
X = train_df[feature_cols]
y = train_df['RAIN']

# Random train/test partition of the training period (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=123,
)

In [12]:
# Fit a logistic regression classifier with default hyper-parameters on the
# training split; the fitted estimator is shown as the cell output.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
# Predict on the held-out test split and report the fraction classified correctly
y_pred = model.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

0.9275282949981745


In [14]:
# Confusion matrix for the test split: rows are the true labels,
# columns are the predicted labels.
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[3195,   16],
       [ 381, 1886]])

In [28]:
# 10-fold cross-validated accuracy of a fresh default LogisticRegression over
# the full training-period data (X, y); prints the per-fold scores, then their mean.
scores = cross_val_score(
    LogisticRegression(),
    X,
    y,
    scoring='accuracy',
    cv=10,
)
print(scores, scores.mean(), sep='\n')

[0.91788321 0.92928832 0.92153285 0.93567518 0.92971246 0.94294843
 0.92788681 0.92879963 0.92968037 0.9260274 ]
0.9289434654587181
