# Logistic Regression

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

## Import data

In [2]:
data = pd.read_csv("dataset/weatherAUS.csv")
data.head(2)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No


In [None]:
data.info()

In [None]:
data.describe()

In [None]:
data.describe(include='all')

In [None]:
data.RainTomorrow.isna().sum()

### RainTomorrow is target column

In [None]:
data.dropna(inplace=True, subset=['RainTomorrow'])

### Date column

In [3]:
data['Date'] = pd.to_datetime(data['Date'])
data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [4]:
data['Year'] = data['Date'].dt.year
data['Year'].head()

0    2008
1    2008
2    2008
3    2008
4    2008
Name: Year, dtype: int64

In [5]:
data.drop(['Date'], axis=1, inplace=True)

In [7]:
data.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year
0,Albury,13.4,22.9,0.6,,,W,44.0,W,WNW,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No,2008
1,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,WSW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,No,2008
2,Albury,12.9,25.7,0.0,,,WSW,46.0,W,WSW,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No,2008
3,Albury,9.2,28.0,0.0,,,NE,24.0,SE,E,...,16.0,1017.6,1012.8,,,18.1,26.5,No,No,2008
4,Albury,17.5,32.3,1.0,,,W,41.0,ENE,NW,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No,2008


In [13]:
num_cols = data.select_dtypes(include=np.number).columns
num_cols = list(num_cols)
# num_cols

In [16]:
cat_cols = data.select_dtypes('O').columns.to_list()
# cat_cols

In [22]:
# Handling missing values in numberical columns
data[num_cols].isna().sum()

MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustSpeed    10263
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
Year                 0
dtype: int64

In [23]:
from sklearn.impute import SimpleImputer


In [24]:
?SimpleImputer

In [25]:
imputer = SimpleImputer(strategy='mean')

In [27]:
imputer.fit(data[num_cols])

In [28]:
imputer.statistics_

array([  12.19403438,   23.22134828,    2.36091815,    5.46823152,
          7.61117752,   40.03523007,   14.04342591,   18.66265678,
         68.88083134,   51.53911588, 1017.6499398 , 1015.25588883,
          4.44746126,    4.50993008,   16.99063142,   21.68339032,
       2012.76975113])

In [29]:
imputer.feature_names_in_

array(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
       'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am',
       'Cloud3pm', 'Temp9am', 'Temp3pm', 'Year'], dtype=object)

In [32]:
# imputer.set_params()

In [36]:
data[num_cols] = imputer.fit_transform(data[num_cols])

In [37]:
data[num_cols].isna().sum()

MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustSpeed    0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
Year             0
dtype: int64

### Handling missing values in cat_columns

In [39]:
data[cat_cols].isna().sum()

Location            0
WindGustDir     10326
WindDir9am      10566
WindDir3pm       4228
RainToday        3261
RainTomorrow     3267
dtype: int64

In [42]:
imputer = SimpleImputer(strategy='most_frequent')

In [43]:
imputer.fit(data[cat_cols])

In [44]:
imputer.statistics_

array(['Canberra', 'W', 'N', 'SE', 'No', 'No'], dtype=object)

In [46]:
imputer.get_feature_names_out()

array(['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday',
       'RainTomorrow'], dtype=object)

In [48]:
data[cat_cols] = imputer.fit_transform(data[cat_cols])
data[cat_cols].isna().sum()

Location        0
WindGustDir     0
WindDir9am      0
WindDir3pm      0
RainToday       0
RainTomorrow    0
dtype: int64

In [49]:
data.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year
0,Albury,13.4,22.9,0.6,5.468232,7.611178,W,44.0,W,WNW,...,22.0,1007.7,1007.1,8.0,4.50993,16.9,21.8,No,No,2008.0
1,Albury,7.4,25.1,0.0,5.468232,7.611178,WNW,44.0,NNW,WSW,...,25.0,1010.6,1007.8,4.447461,4.50993,17.2,24.3,No,No,2008.0
2,Albury,12.9,25.7,0.0,5.468232,7.611178,WSW,46.0,W,WSW,...,30.0,1007.6,1008.7,4.447461,2.0,21.0,23.2,No,No,2008.0
3,Albury,9.2,28.0,0.0,5.468232,7.611178,NE,24.0,SE,E,...,16.0,1017.6,1012.8,4.447461,4.50993,18.1,26.5,No,No,2008.0
4,Albury,17.5,32.3,1.0,5.468232,7.611178,W,41.0,ENE,NW,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No,2008.0


## Testing and Training

In [50]:
from sklearn.model_selection import train_test_split
# from sklearn.model_selection import train_test_split

In [None]:
# # training, testing, validating
# trainging -> 60%
# testing -> 20%
# validating -> 20%

In [51]:
data.shape

(145460, 23)

In [54]:
train_val_df , test_df = train_test_split(data, test_size=.2)
train_val_df.shape, test_df.shape

((116368, 23), (29092, 23))

In [56]:
train_df, val_df = train_test_split(train_val_df, test_size=.2)
train_df.shape, val_df.shape

((93094, 23), (23274, 23))

In [57]:
# train_val_df, test_df = train_test_split(data, test_size=0.2, random_state=42)
# train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=42)

In [58]:
print('train_df.shape :', train_df.shape)
print('val_df.shape :', val_df.shape)
print('test_df.shape :', test_df.shape)

train_df.shape : (93094, 23)
val_df.shape : (23274, 23)
test_df.shape : (29092, 23)


In [61]:
# data.columns
# data[data.Year>2015]

However, while working with dates, it's often a better idea to separate the training, validation and test sets with time, so that the model is trained on data from the past and evaluated on data from the future.

In [62]:
train_df = data[data.Year < 2015]
val_df = data[data.Year == 2015]
test_df = data[data.Year > 2015]

In [63]:
print('train_df.shape :', train_df.shape)
print('val_df.shape :', val_df.shape)
print('test_df.shape :', test_df.shape)

train_df.shape : (101018, 23)
val_df.shape : (17885, 23)
test_df.shape : (26557, 23)


While not a perfect 60-20-20 split, we have ensured that the test validation and test sets both contain data for all 12 months of the year.

## Identifying Input and Target Columns

In [64]:
data.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year
0,Albury,13.4,22.9,0.6,5.468232,7.611178,W,44.0,W,WNW,...,22.0,1007.7,1007.1,8.0,4.50993,16.9,21.8,No,No,2008.0
1,Albury,7.4,25.1,0.0,5.468232,7.611178,WNW,44.0,NNW,WSW,...,25.0,1010.6,1007.8,4.447461,4.50993,17.2,24.3,No,No,2008.0
2,Albury,12.9,25.7,0.0,5.468232,7.611178,WSW,46.0,W,WSW,...,30.0,1007.6,1008.7,4.447461,2.0,21.0,23.2,No,No,2008.0
3,Albury,9.2,28.0,0.0,5.468232,7.611178,NE,24.0,SE,E,...,16.0,1017.6,1012.8,4.447461,4.50993,18.1,26.5,No,No,2008.0
4,Albury,17.5,32.3,1.0,5.468232,7.611178,W,41.0,ENE,NW,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No,2008.0


In [65]:
# train_d

Index(['Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RainTomorrow', 'Year'],
      dtype='object')

In [66]:
input_cols = list(train_df.columns.drop(['RainTomorrow']))
target_col = 'RainTomorrow'

In [67]:
input_cols

['Location',
 'MinTemp',
 'MaxTemp',
 'Rainfall',
 'Evaporation',
 'Sunshine',
 'WindGustDir',
 'WindGustSpeed',
 'WindDir9am',
 'WindDir3pm',
 'WindSpeed9am',
 'WindSpeed3pm',
 'Humidity9am',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'Cloud9am',
 'Cloud3pm',
 'Temp9am',
 'Temp3pm',
 'RainToday',
 'Year']

In [69]:
# train_df[target_col]

0         No
1         No
2         No
3         No
4         No
          ..
144548    No
144549    No
144550    No
144551    No
144552    No
Name: RainTomorrow, Length: 101018, dtype: object

In [70]:
# import copy
# copy.deepcopy()

In [71]:
train_inputs = train_df[input_cols].copy()
train_targets = train_df[target_col].copy()

In [72]:
train_inputs.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,Year
0,Albury,13.4,22.9,0.6,5.468232,7.611178,W,44.0,W,WNW,...,71.0,22.0,1007.7,1007.1,8.0,4.50993,16.9,21.8,No,2008.0
1,Albury,7.4,25.1,0.0,5.468232,7.611178,WNW,44.0,NNW,WSW,...,44.0,25.0,1010.6,1007.8,4.447461,4.50993,17.2,24.3,No,2008.0
2,Albury,12.9,25.7,0.0,5.468232,7.611178,WSW,46.0,W,WSW,...,38.0,30.0,1007.6,1008.7,4.447461,2.0,21.0,23.2,No,2008.0
3,Albury,9.2,28.0,0.0,5.468232,7.611178,NE,24.0,SE,E,...,45.0,16.0,1017.6,1012.8,4.447461,4.50993,18.1,26.5,No,2008.0
4,Albury,17.5,32.3,1.0,5.468232,7.611178,W,41.0,ENE,NW,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,2008.0


In [73]:
train_targets.head()

0    No
1    No
2    No
3    No
4    No
Name: RainTomorrow, dtype: object

In [74]:
val_inputs = val_df[input_cols].copy()
val_targets = val_df[target_col].copy()

In [75]:
test_inputs = test_df[input_cols].copy()
test_targets = test_df[target_col].copy()

In [89]:
# train_targets

0         No
1         No
2         No
3         No
4         No
          ..
144548    No
144549    No
144550    No
144551    No
144552    No
Name: RainTomorrow, Length: 101018, dtype: object

In [76]:
numeric_cols = train_inputs.select_dtypes(include=np.number).columns.to_list()
numeric_cols

['MinTemp',
 'MaxTemp',
 'Rainfall',
 'Evaporation',
 'Sunshine',
 'WindGustSpeed',
 'WindSpeed9am',
 'WindSpeed3pm',
 'Humidity9am',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'Cloud9am',
 'Cloud3pm',
 'Temp9am',
 'Temp3pm',
 'Year']

In [77]:
categorical_cols = train_inputs.select_dtypes('object').columns.tolist()
categorical_cols

['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']

In [78]:
train_inputs[numeric_cols].head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,Year
0,13.4,22.9,0.6,5.468232,7.611178,44.0,20.0,24.0,71.0,22.0,1007.7,1007.1,8.0,4.50993,16.9,21.8,2008.0
1,7.4,25.1,0.0,5.468232,7.611178,44.0,4.0,22.0,44.0,25.0,1010.6,1007.8,4.447461,4.50993,17.2,24.3,2008.0
2,12.9,25.7,0.0,5.468232,7.611178,46.0,19.0,26.0,38.0,30.0,1007.6,1008.7,4.447461,2.0,21.0,23.2,2008.0
3,9.2,28.0,0.0,5.468232,7.611178,24.0,11.0,9.0,45.0,16.0,1017.6,1012.8,4.447461,4.50993,18.1,26.5,2008.0
4,17.5,32.3,1.0,5.468232,7.611178,41.0,7.0,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,2008.0


In [79]:
train_inputs[numeric_cols].describe()


Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,Year
count,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0
mean,11.993708,22.982556,2.381165,5.353008,7.601115,40.248486,14.126682,18.776147,68.697168,51.552246,1017.533168,1015.154838,4.364737,4.454915,16.814342,21.507842,2011.440318
std,6.313387,6.975608,8.43254,3.108422,2.897064,13.201832,8.943325,8.817635,18.831528,20.584602,6.723903,6.652154,2.281705,2.125898,6.358405,6.799765,1.801368
min,-8.5,-4.1,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,980.5,979.0,0.0,0.0,-5.9,-5.1,2007.0
25%,7.5,17.9,0.0,3.8,7.5,31.0,7.0,13.0,57.0,37.0,1013.4,1011.0,3.0,3.0,12.2,16.6,2010.0
50%,11.8,22.4,0.0,5.468232,7.611178,39.0,13.0,19.0,69.0,52.0,1017.64994,1015.255889,4.447461,4.50993,16.6,21.0,2011.0
75%,16.5,27.8,1.0,5.468232,9.2,46.0,19.0,24.0,83.0,65.0,1021.7,1019.3,6.0,6.0,21.2,26.1,2013.0
max,33.9,48.1,371.0,82.4,14.3,135.0,87.0,87.0,100.0,100.0,1041.0,1039.6,9.0,9.0,40.2,46.1,2014.0


In [80]:
train_inputs[categorical_cols].head()

Unnamed: 0,Location,WindGustDir,WindDir9am,WindDir3pm,RainToday
0,Albury,W,W,WNW,No
1,Albury,WNW,NNW,WSW,No
2,Albury,WSW,W,WSW,No
3,Albury,NE,SE,E,No
4,Albury,W,ENE,NW,No


In [81]:
train_inputs[categorical_cols].nunique()

Location       49
WindGustDir    16
WindDir9am     16
WindDir3pm     16
RainToday       2
dtype: int64

In [83]:
# train_inputs.isna().sum()

## Imputing Missing Numeric Data

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
imputer = SimpleImputer(strategy='mean')

### Checking numeric values having null

In [None]:
data[numeric_cols].isna().sum()

In [None]:
train_inputs[numeric_cols].isna().sum()

In [None]:
test_inputs[numeric_cols].isna().sum()

In [None]:
val_inputs[numeric_cols].isna().sum()

In [None]:
imputer.fit(data[numeric_cols])

In [None]:
list(imputer.statistics_)

In [None]:
pd.DataFrame({"col_name":numeric_cols, "statistics_":list(imputer.statistics_)})

In [None]:
train_inputs[numeric_cols] = imputer.transform(train_inputs[numeric_cols])
train_inputs[numeric_cols].head()

In [None]:
val_inputs[numeric_cols] = imputer.transform(val_inputs[numeric_cols])
val_inputs[numeric_cols].head()

In [None]:
test_inputs[numeric_cols] = imputer.transform(test_inputs[numeric_cols])
test_inputs[numeric_cols].head()

In [None]:
train_inputs[numeric_cols].isna().sum()

## Impute Categorical Missing Data

In [None]:
imputer_cat = SimpleImputer(strategy="most_frequent")
imputer_cat.fit(data[categorical_cols])

In [None]:
list(imputer_cat.statistics_)

In [None]:
train_inputs[categorical_cols] = imputer_cat.transform(train_inputs[categorical_cols])
val_inputs[categorical_cols] = imputer_cat.transform(val_inputs[categorical_cols])
test_inputs[categorical_cols] = imputer_cat.transform(test_inputs[categorical_cols])

In [None]:
train_inputs[categorical_cols].isna().sum()

In [None]:
data[categorical_cols] = imputer_cat.transform(data[categorical_cols])
data[categorical_cols].head()

## FE: Scaling Numeric Features

In [84]:
train_inputs[numeric_cols].describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,Year
count,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0
mean,11.993708,22.982556,2.381165,5.353008,7.601115,40.248486,14.126682,18.776147,68.697168,51.552246,1017.533168,1015.154838,4.364737,4.454915,16.814342,21.507842,2011.440318
std,6.313387,6.975608,8.43254,3.108422,2.897064,13.201832,8.943325,8.817635,18.831528,20.584602,6.723903,6.652154,2.281705,2.125898,6.358405,6.799765,1.801368
min,-8.5,-4.1,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,980.5,979.0,0.0,0.0,-5.9,-5.1,2007.0
25%,7.5,17.9,0.0,3.8,7.5,31.0,7.0,13.0,57.0,37.0,1013.4,1011.0,3.0,3.0,12.2,16.6,2010.0
50%,11.8,22.4,0.0,5.468232,7.611178,39.0,13.0,19.0,69.0,52.0,1017.64994,1015.255889,4.447461,4.50993,16.6,21.0,2011.0
75%,16.5,27.8,1.0,5.468232,9.2,46.0,19.0,24.0,83.0,65.0,1021.7,1019.3,6.0,6.0,21.2,26.1,2013.0
max,33.9,48.1,371.0,82.4,14.3,135.0,87.0,87.0,100.0,100.0,1041.0,1039.6,9.0,9.0,40.2,46.1,2014.0


In [85]:
from sklearn.preprocessing import MinMaxScaler

In [86]:
?MinMaxScaler

In [87]:
scaler = MinMaxScaler()

In [91]:
scaler.fit(data[numeric_cols])
# scaler.f

In [90]:
print('Minimum:')
list(scaler.data_min_)

Minimum:


[-8.5,
 -4.8,
 0.0,
 0.0,
 0.0,
 6.0,
 0.0,
 0.0,
 0.0,
 0.0,
 980.5,
 977.1,
 0.0,
 0.0,
 -7.2,
 -5.4,
 2007.0]

In [92]:
print('Maximum:')
list(scaler.data_max_)

Maximum:


[33.9,
 48.1,
 371.0,
 145.0,
 14.5,
 135.0,
 130.0,
 87.0,
 100.0,
 100.0,
 1041.0,
 1039.6,
 9.0,
 9.0,
 40.2,
 46.7,
 2017.0]

In [94]:
pd.DataFrame({"col_name":numeric_cols, 
              "maximum":list(scaler.data_max_),
              "minimum":list(scaler.data_min_)})

Unnamed: 0,col_name,maximum,minimum
0,MinTemp,33.9,-8.5
1,MaxTemp,48.1,-4.8
2,Rainfall,371.0,0.0
3,Evaporation,145.0,0.0
4,Sunshine,14.5,0.0
5,WindGustSpeed,135.0,6.0
6,WindSpeed9am,130.0,0.0
7,WindSpeed3pm,87.0,0.0
8,Humidity9am,100.0,0.0
9,Humidity3pm,100.0,0.0


In [96]:
# train_inputs.head()

In [97]:
train_inputs[numeric_cols] = scaler.transform(train_inputs[numeric_cols])
val_inputs[numeric_cols] = scaler.transform(val_inputs[numeric_cols])
test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])

In [98]:
train_inputs[numeric_cols].head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,Year
0,0.516509,0.523629,0.001617,0.037712,0.524909,0.294574,0.153846,0.275862,0.71,0.22,0.449587,0.48,0.888889,0.501103,0.508439,0.522073,0.1
1,0.375,0.565217,0.0,0.037712,0.524909,0.294574,0.030769,0.252874,0.44,0.25,0.497521,0.4912,0.494162,0.501103,0.514768,0.570058,0.1
2,0.504717,0.57656,0.0,0.037712,0.524909,0.310078,0.146154,0.298851,0.38,0.3,0.447934,0.5056,0.494162,0.222222,0.594937,0.548944,0.1
3,0.417453,0.620038,0.0,0.037712,0.524909,0.139535,0.084615,0.103448,0.45,0.16,0.613223,0.5712,0.494162,0.501103,0.533755,0.612284,0.1
4,0.613208,0.701323,0.002695,0.037712,0.524909,0.271318,0.053846,0.229885,0.82,0.33,0.500826,0.4624,0.777778,0.888889,0.527426,0.673704,0.1


In [99]:
train_inputs[numeric_cols].describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,Year
count,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0,101018.0
mean,0.483342,0.52519,0.006418,0.036917,0.524215,0.265492,0.108667,0.215818,0.686972,0.515522,0.612118,0.608877,0.484971,0.494991,0.506632,0.516465,0.444032
std,0.148901,0.131864,0.022729,0.021437,0.199797,0.10234,0.068795,0.101352,0.188315,0.205846,0.111139,0.106434,0.253523,0.236211,0.134144,0.130514,0.180137
min,0.0,0.013233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0304,0.0,0.0,0.027426,0.005758,0.0
25%,0.377358,0.429112,0.0,0.026207,0.517241,0.193798,0.053846,0.149425,0.57,0.37,0.543802,0.5424,0.333333,0.333333,0.409283,0.422265,0.3
50%,0.478774,0.514178,0.0,0.037712,0.524909,0.255814,0.1,0.218391,0.69,0.52,0.614049,0.610494,0.494162,0.501103,0.50211,0.506718,0.4
75%,0.589623,0.616257,0.002695,0.037712,0.634483,0.310078,0.146154,0.275862,0.83,0.65,0.680992,0.6752,0.666667,0.666667,0.599156,0.604607,0.6
max,1.0,1.0,1.0,0.568276,0.986207,1.0,0.669231,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.988484,0.7


## Encoding Categorical Data

In [100]:
data[categorical_cols].nunique()

Location       49
WindGustDir    16
WindDir9am     16
WindDir3pm     16
RainToday       2
dtype: int64

In [101]:
from sklearn.preprocessing import OneHotEncoder

In [102]:
?OneHotEncoder

In [103]:
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoder

In [104]:
train_inputs[categorical_cols].isna().sum()

Location       0
WindGustDir    0
WindDir9am     0
WindDir3pm     0
RainToday      0
dtype: int64

In [105]:
encoder.fit_transform(train_inputs[categorical_cols])

array([[0., 0., 1., ..., 0., 1., 0.],
       [0., 0., 1., ..., 1., 1., 0.],
       [0., 0., 1., ..., 1., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [106]:
encoder.categories_

[array(['Adelaide', 'Albany', 'Albury', 'AliceSprings', 'BadgerysCreek',
        'Ballarat', 'Bendigo', 'Brisbane', 'Cairns', 'Canberra', 'Cobar',
        'CoffsHarbour', 'Dartmoor', 'Darwin', 'GoldCoast', 'Hobart',
        'Katherine', 'Launceston', 'Melbourne', 'MelbourneAirport',
        'Mildura', 'Moree', 'MountGambier', 'MountGinini', 'Newcastle',
        'Nhil', 'NorahHead', 'NorfolkIsland', 'Nuriootpa', 'PearceRAAF',
        'Penrith', 'Perth', 'PerthAirport', 'Portland', 'Richmond', 'Sale',
        'SalmonGums', 'Sydney', 'SydneyAirport', 'Townsville',
        'Tuggeranong', 'Uluru', 'WaggaWagga', 'Walpole', 'Watsonia',
        'Williamtown', 'Witchcliffe', 'Wollongong', 'Woomera'],
       dtype=object),
 array(['E', 'ENE', 'ESE', 'N', 'NE', 'NNE', 'NNW', 'NW', 'S', 'SE', 'SSE',
        'SSW', 'SW', 'W', 'WNW', 'WSW'], dtype=object),
 array(['E', 'ENE', 'ESE', 'N', 'NE', 'NNE', 'NNW', 'NW', 'S', 'SE', 'SSE',
        'SSW', 'SW', 'W', 'WNW', 'WSW'], dtype=object),
 array(['E', 

In [107]:
encoder.get_feature_names(categorical_cols)

array(['Location_Adelaide', 'Location_Albany', 'Location_Albury',
       'Location_AliceSprings', 'Location_BadgerysCreek',
       'Location_Ballarat', 'Location_Bendigo', 'Location_Brisbane',
       'Location_Cairns', 'Location_Canberra', 'Location_Cobar',
       'Location_CoffsHarbour', 'Location_Dartmoor', 'Location_Darwin',
       'Location_GoldCoast', 'Location_Hobart', 'Location_Katherine',
       'Location_Launceston', 'Location_Melbourne',
       'Location_MelbourneAirport', 'Location_Mildura', 'Location_Moree',
       'Location_MountGambier', 'Location_MountGinini',
       'Location_Newcastle', 'Location_Nhil', 'Location_NorahHead',
       'Location_NorfolkIsland', 'Location_Nuriootpa',
       'Location_PearceRAAF', 'Location_Penrith', 'Location_Perth',
       'Location_PerthAirport', 'Location_Portland', 'Location_Richmond',
       'Location_Sale', 'Location_SalmonGums', 'Location_Sydney',
       'Location_SydneyAirport', 'Location_Townsville',
       'Location_Tuggeranong', 

In [108]:
encoded_cols = list(encoder.get_feature_names(categorical_cols))
print(encoded_cols)

['Location_Adelaide', 'Location_Albany', 'Location_Albury', 'Location_AliceSprings', 'Location_BadgerysCreek', 'Location_Ballarat', 'Location_Bendigo', 'Location_Brisbane', 'Location_Cairns', 'Location_Canberra', 'Location_Cobar', 'Location_CoffsHarbour', 'Location_Dartmoor', 'Location_Darwin', 'Location_GoldCoast', 'Location_Hobart', 'Location_Katherine', 'Location_Launceston', 'Location_Melbourne', 'Location_MelbourneAirport', 'Location_Mildura', 'Location_Moree', 'Location_MountGambier', 'Location_MountGinini', 'Location_Newcastle', 'Location_Nhil', 'Location_NorahHead', 'Location_NorfolkIsland', 'Location_Nuriootpa', 'Location_PearceRAAF', 'Location_Penrith', 'Location_Perth', 'Location_PerthAirport', 'Location_Portland', 'Location_Richmond', 'Location_Sale', 'Location_SalmonGums', 'Location_Sydney', 'Location_SydneyAirport', 'Location_Townsville', 'Location_Tuggeranong', 'Location_Uluru', 'Location_WaggaWagga', 'Location_Walpole', 'Location_Watsonia', 'Location_Williamtown', 'Loca

In [109]:
train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
val_inputs[encoded_cols] = encoder.transform(val_inputs[categorical_cols])
test_inputs[encoded_cols] = encoder.transform(test_inputs[categorical_cols])

In [110]:
pd.set_option('display.max_columns', None)

In [111]:
test_inputs

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,Year,Location_Adelaide,Location_Albany,Location_Albury,Location_AliceSprings,Location_BadgerysCreek,Location_Ballarat,Location_Bendigo,Location_Brisbane,Location_Cairns,Location_Canberra,Location_Cobar,Location_CoffsHarbour,Location_Dartmoor,Location_Darwin,Location_GoldCoast,Location_Hobart,Location_Katherine,Location_Launceston,Location_Melbourne,Location_MelbourneAirport,Location_Mildura,Location_Moree,Location_MountGambier,Location_MountGinini,Location_Newcastle,Location_Nhil,Location_NorahHead,Location_NorfolkIsland,Location_Nuriootpa,Location_PearceRAAF,Location_Penrith,Location_Perth,Location_PerthAirport,Location_Portland,Location_Richmond,Location_Sale,Location_SalmonGums,Location_Sydney,Location_SydneyAirport,Location_Townsville,Location_Tuggeranong,Location_Uluru,Location_WaggaWagga,Location_Walpole,Location_Watsonia,Location_Williamtown,Location_Witchcliffe,Location_Wollongong,Location_Woomera,WindGustDir_E,WindGustDir_ENE,WindGustDir_ESE,WindGustDir_N,WindGustDir_NE,WindGustDir_NNE,WindGustDir_NNW,WindGustDir_NW,WindGustDir_S,WindGustDir_SE,WindGustDir_SSE,WindGustDir_SSW,WindGustDir_SW,WindGustDir_W,WindGustDir_WNW,WindGustDir_WSW,WindDir9am_E,WindDir9am_ENE,WindDir9am_ESE,WindDir9am_N,WindDir9am_NE,WindDir9am_NNE,WindDir9am_NNW,WindDir9am_NW,WindDir9am_S,WindDir9am_SE,WindDir9am_SSE,WindDir9am_SSW,WindDir9am_SW,WindDir9am_W,WindDir9am_WNW,WindDir9am_WSW,WindDir3pm_E,WindDir3pm_ENE,WindDir3pm_ESE,WindDir3pm_N,WindDir3pm_NE,WindDir3pm_NNE,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW,RainToday_No,RainToday_Yes
2498,Albury,0.681604,0.801512,0.000000,0.037712,0.524909,ENE,0.372093,N,ESE,0.000000,0.080460,0.46,0.17,0.543802,0.5136,0.777778,0.333333,0.702532,0.808061,No,0.9,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2499,Albury,0.693396,0.725898,0.001078,0.037712,0.524909,SSE,0.341085,SSE,SE,0.069231,0.195402,0.54,0.30,0.505785,0.5008,0.888889,0.888889,0.675105,0.712092,No,0.9,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2500,Albury,0.634434,0.527410,0.005930,0.037712,0.524909,ENE,0.325581,ESE,ENE,0.084615,0.448276,0.62,0.67,0.553719,0.6032,0.888889,0.888889,0.611814,0.477927,Yes,0.9,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2501,Albury,0.608491,0.538752,0.042049,0.037712,0.524909,SSE,0.255814,SE,SSE,0.069231,0.195402,0.74,0.65,0.618182,0.6304,0.888889,0.888889,0.556962,0.518234,Yes,0.9,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2502,Albury,0.566038,0.523629,0.018329,0.037712,0.524909,ENE,0.193798,SE,SSE,0.046154,0.103448,0.92,0.63,0.591736,0.5888,0.888889,0.888889,0.514768,0.529750,Yes,0.9,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,Uluru,0.266509,0.533081,0.000000,0.037712,0.524909,E,0.193798,SE,ENE,0.100000,0.126437,0.51,0.24,0.728926,0.6912,0.494162,0.501103,0.364979,0.533589,No,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
145456,Uluru,0.285377,0.568998,0.000000,0.037712,0.524909,NNW,0.124031,SE,N,0.100000,0.103448,0.56,0.21,0.710744,0.6720,0.494162,0.501103,0.381857,0.573896,No,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
145457,Uluru,0.327830,0.599244,0.000000,0.037712,0.524909,N,0.240310,SE,WNW,0.069231,0.103448,0.53,0.24,0.669421,0.6352,0.494162,0.501103,0.415612,0.604607,No,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
145458,Uluru,0.384434,0.601134,0.000000,0.037712,0.524909,SE,0.170543,SSE,N,0.100000,0.080460,0.51,0.24,0.642975,0.6304,0.333333,0.222222,0.470464,0.602687,No,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## Training

In [113]:
from sklearn.linear_model import LogisticRegression
?LogisticRegression

In [114]:
model = LogisticRegression(solver='liblinear')

In [115]:
# model.solver

'liblinear'

In [None]:
train_targets

In [116]:
model.fit(train_inputs[numeric_cols + encoded_cols], train_targets)

In [121]:
model.classes_

array(['No', 'Yes'], dtype=object)

In [122]:
model.coef_

array([[ 8.84679816e-01, -3.12025856e+00,  3.19910642e+00,
         7.24774740e-01, -1.61909186e+00,  6.48780955e+00,
        -7.38968521e-01, -1.38625379e+00,  3.06371789e-01,
         5.77461392e+00,  5.30945463e+00, -8.88748007e+00,
        -1.35459332e-01,  1.26696846e+00,  5.42233178e-01,
         2.20972806e+00, -9.03191864e-02,  5.72923876e-01,
        -1.93052983e-01,  4.27377479e-01, -1.27505975e-02,
         2.97775438e-01, -3.34615086e-01,  1.63266392e-01,
         4.13310624e-01, -1.94716195e-02,  5.14178890e-02,
         2.45336268e-01,  2.98665146e-02, -6.49029148e-02,
        -4.63815527e-01, -1.53851775e-01, -5.55546152e-01,
        -7.40224948e-01, -2.69293406e-01, -2.82983509e-01,
        -5.28825335e-01,  6.03724612e-02, -2.53719149e-03,
         7.74912142e-02, -8.75712912e-01, -1.21892895e-01,
         1.28841468e-02, -4.76076797e-01, -4.58028056e-01,
        -6.81340370e-02,  1.49699758e-01,  3.52504389e-01,
         5.78070649e-01,  4.17661484e-01, -1.62250265e-0

In [123]:
model.intercept_

array([-2.3718278])

In [129]:
model.feature_names_in_

array(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
       'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am',
       'Cloud3pm', 'Temp9am', 'Temp3pm', 'Year', 'Location_Adelaide',
       'Location_Albany', 'Location_Albury', 'Location_AliceSprings',
       'Location_BadgerysCreek', 'Location_Ballarat', 'Location_Bendigo',
       'Location_Brisbane', 'Location_Cairns', 'Location_Canberra',
       'Location_Cobar', 'Location_CoffsHarbour', 'Location_Dartmoor',
       'Location_Darwin', 'Location_GoldCoast', 'Location_Hobart',
       'Location_Katherine', 'Location_Launceston', 'Location_Melbourne',
       'Location_MelbourneAirport', 'Location_Mildura', 'Location_Moree',
       'Location_MountGambier', 'Location_MountGinini',
       'Location_Newcastle', 'Location_Nhil', 'Location_NorahHead',
       'Location_NorfolkIsland', 'Location_Nuriootpa',
       'Location_PearceRAAF', 'Location_Penrith', 

In [118]:
# train_inputs[numeric_cols+encoded_cols].shape

(101018, 116)

In [None]:
print(numeric_cols + encoded_cols)


In [120]:
# train_inputs.head()

In [134]:
pred=model.predict(test_inputs[numeric_cols+encoded_cols])

In [135]:
from sklearn.metrics import accuracy_score