In [1]:
import pandas as pd

# Load data file
df = pd.read_csv('seattleWeather_1948-2017.csv')

In [2]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25551 entries, 0 to 25550
Data columns (total 5 columns):
DATE    25551 non-null object
PRCP    25548 non-null float64
TMAX    25551 non-null int64
TMIN    25551 non-null int64
RAIN    25548 non-null object
dtypes: float64(1), int64(2), object(2)
memory usage: 998.2+ KB


Unnamed: 0,PRCP,TMAX,TMIN
count,25548.0,25551.0,25551.0
mean,0.106222,59.544206,44.514226
std,0.239031,12.772984,8.892836
min,0.0,4.0,0.0
25%,0.0,50.0,38.0
50%,0.0,58.0,45.0
75%,0.1,69.0,52.0
max,5.02,103.0,71.0


In [3]:
df.head()

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN
0,1948-01-01,0.47,51,42,True
1,1948-01-02,0.59,45,36,True
2,1948-01-03,0.42,45,35,True
3,1948-01-04,0.31,45,34,True
4,1948-01-05,0.17,45,32,True


In [4]:
# Check for missing values
df.isnull().sum()

DATE    0
PRCP    3
TMAX    0
TMIN    0
RAIN    3
dtype: int64

In [5]:
# Remove rows where RAIN is null
df = df[df['RAIN'].notnull()]
df.isnull().sum()

DATE    0
PRCP    0
TMAX    0
TMIN    0
RAIN    0
dtype: int64

In [6]:
# Create new feature - total PRCP for the last 3 days
df['PRCP3D'] = df['PRCP'].rolling(3).sum()

In [7]:
# Create new feature - Yesterday's PRCP
df['PRCPYDAY'] = df['PRCP'].shift(1)

In [8]:
df.head()

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN,PRCP3D,PRCPYDAY
0,1948-01-01,0.47,51,42,True,,
1,1948-01-02,0.59,45,36,True,,0.47
2,1948-01-03,0.42,45,35,True,1.48,0.59
3,1948-01-04,0.31,45,34,True,1.32,0.42
4,1948-01-05,0.17,45,32,True,0.9,0.31


In [9]:
# Check for missing values
df.isnull().sum()

DATE        0
PRCP        0
TMAX        0
TMIN        0
RAIN        0
PRCP3D      2
PRCPYDAY    1
dtype: int64

In [10]:
# Remove null values for newly created columns
df = df[df['PRCPYDAY'].notnull()]
df = df[df['PRCP3D'].notnull()]
df.isnull().sum()

DATE        0
PRCP        0
TMAX        0
TMIN        0
RAIN        0
PRCP3D      0
PRCPYDAY    0
dtype: int64

In [11]:
from datetime import date

# Convert DATE column from string to datetime
df['DATE'] = pd.to_datetime(df['DATE'])

# Convert RAIN column from string to boolean
df['RAIN'] = df['RAIN'].astype(bool)

In [12]:
# Create TRAIN data set
train_df = df[(date(1950,1,1) <= df['DATE']) & (df['DATE'] < date(2010,1,1))]
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21912 entries, 731 to 22645
Data columns (total 7 columns):
DATE        21912 non-null datetime64[ns]
PRCP        21912 non-null float64
TMAX        21912 non-null int64
TMIN        21912 non-null int64
RAIN        21912 non-null bool
PRCP3D      21912 non-null float64
PRCPYDAY    21912 non-null float64
dtypes: bool(1), datetime64[ns](1), float64(3), int64(2)
memory usage: 1.2 MB


In [13]:
# Create VALIDATION data set
validation_df = df[(date(2010,1,1) <= df['DATE']) & (df['DATE'] < date(2018,1,1))] 
validation_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2905 entries, 22646 to 25550
Data columns (total 7 columns):
DATE        2905 non-null datetime64[ns]
PRCP        2905 non-null float64
TMAX        2905 non-null int64
TMIN        2905 non-null int64
RAIN        2905 non-null bool
PRCP3D      2905 non-null float64
PRCPYDAY    2905 non-null float64
dtypes: bool(1), datetime64[ns](1), float64(3), int64(2)
memory usage: 161.7 KB


In [14]:
from sklearn.model_selection import train_test_split

feature_cols = ['TMAX', 'TMIN', 'PRCP3D', 'PRCPYDAY']
# Training set
X = train_df[feature_cols]
y = train_df.RAIN

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

# Validation set
X_validation = validation_df[feature_cols]
y_validation = validation_df.RAIN

In [15]:
from sklearn.linear_model import LogisticRegressionCV

# Use Model: Logistic Regression Cross-Validation
model = LogisticRegressionCV()
model.fit(X_train, y_train)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [16]:
# Coefficients
print(list(zip(feature_cols, model.coef_[0])))

[('TMAX', -0.2081147627712993), ('TMIN', 0.2297881794011777), ('PRCP3D', 3.7498034209915487), ('PRCPYDAY', -2.7432301444749605)]


In [17]:
# Predict using Test set
from sklearn import metrics

y_pred = model.predict(X_test)
print (metrics.accuracy_score(y_test, y_pred))

0.7997444322745527


In [18]:
# Confusion matrix
metrics.confusion_matrix(y_test, y_pred)

array([[2743,  468],
       [ 629, 1638]])

In [19]:
# Predict using Validation set
y_val_pred = model.predict(X_validation)
print (metrics.accuracy_score(y_validation, y_val_pred))

0.8151462994836489


In [20]:
# Confusion matrix
metrics.confusion_matrix(y_validation, y_val_pred)

array([[1361,  244],
       [ 293, 1007]])