# Lab 5 Exercise 1

In [50]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, r2_score

In [51]:
df = pd.read_csv('weatherAUS.csv')
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [52]:
# Checking for NaN values
df.isna().sum()

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64

In [53]:
# Dropping Column which are not useful
df.drop('Date', axis=1, inplace=True)

In [54]:
# Dropping NaN values
df.dropna(inplace=True)

In [55]:
df.shape

(56420, 22)

In [56]:
# Splitting into X and y dataset
target = df.pop('RainTomorrow')
df.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
6049,Cobar,17.9,35.2,0.0,12.0,12.3,SSW,48.0,ENE,SW,...,20.0,20.0,13.0,1006.3,1004.4,2.0,5.0,26.6,33.4,No
6050,Cobar,18.4,28.9,0.0,14.8,13.0,S,37.0,SSE,SSE,...,19.0,30.0,8.0,1012.9,1012.1,1.0,1.0,20.3,27.0,No
6052,Cobar,19.4,37.6,0.0,10.8,10.6,NNE,46.0,NNE,NNW,...,15.0,42.0,22.0,1012.3,1009.2,1.0,6.0,28.7,34.9,No
6053,Cobar,21.9,38.4,0.0,11.4,12.2,WNW,31.0,WNW,WSW,...,6.0,37.0,22.0,1012.7,1009.1,1.0,5.0,29.1,35.6,No
6054,Cobar,24.2,41.0,0.0,11.2,8.4,WNW,35.0,NW,WNW,...,13.0,19.0,15.0,1010.7,1007.4,1.0,6.0,33.6,37.6,No


In [57]:
# Mapping values of y to Numeric Values
mapping = {'No': 0, 'Yes': 1}
target = target.map(mapping)

In [58]:
df.dtypes

Location          object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed    float64
WindDir9am        object
WindDir3pm        object
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RainToday         object
dtype: object

In [59]:
# Splitting into Numeric and Categorical Columns
# for numerical_cols
numerical_cols   = [cname for cname in df.columns if df[cname].dtype in ['int64', 'float64']]\
# for categorical_cols
categorical_cols = [cname for cname in df.columns if df[cname].dtype == 'object']

In [60]:
df_num = df[numerical_cols]
df_cat = df[categorical_cols]

In [61]:
# Applying VIF on the Dataset
# Using Linear Regression and calculating the r2 scores
lr = LinearRegression()
vif = {}
for col in numerical_cols:
    all_cols = list(numerical_cols)
    all_cols.remove(col)
    X = df_num[all_cols]
    y = df_num[col]
    lr.fit(X, y)
    y_pred = lr.predict(X)
    score = r2_score(y, y_pred)
    vif[col] = 1 / (1 - score)
    
vif

{'Cloud3pm': 2.2774283640218855,
 'Cloud9am': 2.2382633768389075,
 'Evaporation': 2.224815286348291,
 'Humidity3pm': 6.803333309913033,
 'Humidity9am': 4.417547175413454,
 'MaxTemp': 46.78406795924843,
 'MinTemp': 10.770219691409906,
 'Pressure3pm': 19.86913652734502,
 'Pressure9am': 19.876241743158516,
 'Rainfall': 1.1779953082896786,
 'Sunshine': 3.3077307556062605,
 'Temp3pm': 56.08268800345098,
 'Temp9am': 24.36887019910182,
 'WindGustSpeed': 2.8788643484723133,
 'WindSpeed3pm': 2.1463942950501775,
 'WindSpeed9am': 1.8564893477202455}

In [62]:
# Remove features that have a VIF value above 5.
drop_cols = [key for key in vif.keys() if vif[key] > 5]
drop_cols

['MinTemp',
 'MaxTemp',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'Temp9am',
 'Temp3pm']

In [63]:
df_num.drop(drop_cols, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [64]:
# Correlation Matrix
df_num.corr()

Unnamed: 0,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Cloud9am,Cloud3pm
Rainfall,1.0,-0.077239,-0.246379,0.106308,0.050584,0.044112,0.263625,0.217169,0.191433
Evaporation,-0.077239,1.0,0.36925,0.209566,0.193154,0.124345,-0.554232,-0.199809,-0.202366
Sunshine,-0.246379,0.36925,1.0,-0.052422,-0.013842,0.0292,-0.500343,-0.677939,-0.702022
WindGustSpeed,0.106308,0.209566,-0.052422,1.0,0.608852,0.685236,-0.19341,0.088129,0.13159
WindSpeed9am,0.050584,0.193154,-0.013842,0.608852,1.0,0.502226,-0.236795,0.034908,0.062507
WindSpeed3pm,0.044112,0.124345,0.0292,0.685236,0.502226,1.0,-0.100626,0.068224,0.041475
Humidity9am,0.263625,-0.554232,-0.500343,-0.19341,-0.236795,-0.100626,1.0,0.438962,0.348707
Cloud9am,0.217169,-0.199809,-0.677939,0.088129,0.034908,0.068224,0.438962,1.0,0.61438
Cloud3pm,0.191433,-0.202366,-0.702022,0.13159,0.062507,0.041475,0.348707,0.61438,1.0


**Conclusion from the Above Correlation Matrix**
* Considering variables that have correlation more than 0.6
* WindGustSpeed, WindSpeed3pm
* WindGustSpeed, WindSpeed9am
* Cloud9am, Cloud3pm

In [65]:
# Removing Columns with Mulicollinearity
numerical_cols = list(numerical_cols)
numerical_cols.remove('Cloud9am')
numerical_cols.remove('Cloud3pm')
numerical_cols.remove('WindGustSpeed')
numerical_cols.remove('WindSpeed3pm')
numerical_cols.remove('WindSpeed9am')

In [66]:
# Making new Columns with Related Columns
df_num = df[numerical_cols]
df_num['Cloud9am - Cloud3pm'] = df['Cloud9am'] - df['Cloud3pm']
df_num['WindGustSpeed - WindSpeed3pm'] = df['WindGustSpeed'] - df['WindSpeed3pm']
df_num['WindGustSpeed - WindSpeed9am'] = df['WindGustSpeed'] - df['WindSpeed9am']
df_num.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,Cloud9am - Cloud3pm,WindGustSpeed - WindSpeed3pm,WindGustSpeed - WindSpeed9am
6049,17.9,35.2,0.0,12.0,12.3,20.0,13.0,1006.3,1004.4,26.6,33.4,-3.0,28.0,42.0
6050,18.4,28.9,0.0,14.8,13.0,30.0,8.0,1012.9,1012.1,20.3,27.0,0.0,18.0,18.0
6052,19.4,37.6,0.0,10.8,10.6,42.0,22.0,1012.3,1009.2,28.7,34.9,-5.0,31.0,16.0
6053,21.9,38.4,0.0,11.4,12.2,37.0,22.0,1012.7,1009.1,29.1,35.6,-4.0,25.0,25.0
6054,24.2,41.0,0.0,11.2,8.4,19.0,15.0,1010.7,1007.4,33.6,37.6,-5.0,22.0,18.0


In [67]:
# Merging both the Numerical and Categorical Columns
temp_df = pd.merge(df_num, df_cat, left_index = True, right_index = True)
temp_df

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,Cloud9am - Cloud3pm,WindGustSpeed - WindSpeed3pm,WindGustSpeed - WindSpeed9am,Location,WindGustDir,WindDir9am,WindDir3pm,RainToday
6049,17.9,35.2,0.0,12.0,12.3,20.0,13.0,1006.3,1004.4,26.6,33.4,-3.0,28.0,42.0,Cobar,SSW,ENE,SW,No
6050,18.4,28.9,0.0,14.8,13.0,30.0,8.0,1012.9,1012.1,20.3,27.0,0.0,18.0,18.0,Cobar,S,SSE,SSE,No
6052,19.4,37.6,0.0,10.8,10.6,42.0,22.0,1012.3,1009.2,28.7,34.9,-5.0,31.0,16.0,Cobar,NNE,NNE,NNW,No
6053,21.9,38.4,0.0,11.4,12.2,37.0,22.0,1012.7,1009.1,29.1,35.6,-4.0,25.0,25.0,Cobar,WNW,WNW,WSW,No
6054,24.2,41.0,0.0,11.2,8.4,19.0,15.0,1010.7,1007.4,33.6,37.6,-5.0,22.0,18.0,Cobar,WNW,NW,WNW,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142298,19.3,33.4,0.0,6.0,11.0,63.0,32.0,1013.9,1010.5,24.5,32.3,-1.0,15.0,26.0,Darwin,ENE,SE,NE,No
142299,21.2,32.6,0.0,7.6,8.6,56.0,28.0,1014.6,1011.2,24.8,32.0,7.0,26.0,24.0,Darwin,E,SE,SE,No
142300,20.7,32.8,0.0,5.6,11.0,46.0,23.0,1015.3,1011.8,24.8,32.1,0.0,22.0,16.0,Darwin,E,E,W,No
142301,19.5,31.8,0.0,6.2,10.6,62.0,58.0,1014.9,1010.7,24.8,29.2,0.0,9.0,17.0,Darwin,ESE,SE,NNW,No


In [68]:
# One Hot Encoding the Categorical Columns
df_new = pd.get_dummies(temp_df)
df_new.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,...,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW,RainToday_No,RainToday_Yes
6049,17.9,35.2,0.0,12.0,12.3,20.0,13.0,1006.3,1004.4,26.6,...,0,0,0,0,1,0,0,0,1,0
6050,18.4,28.9,0.0,14.8,13.0,30.0,8.0,1012.9,1012.1,20.3,...,0,0,1,0,0,0,0,0,1,0
6052,19.4,37.6,0.0,10.8,10.6,42.0,22.0,1012.3,1009.2,28.7,...,0,0,0,0,0,0,0,0,1,0
6053,21.9,38.4,0.0,11.4,12.2,37.0,22.0,1012.7,1009.1,29.1,...,0,0,0,0,0,0,0,1,1,0
6054,24.2,41.0,0.0,11.2,8.4,19.0,15.0,1010.7,1007.4,33.6,...,0,0,0,0,0,0,1,0,1,0


In [69]:
# Converting all Values to Same Datatype
df_new = df_new.astype('float64')
target = target.astype('float64')

In [70]:
# Splitting Dataset into Train and Test Set
X_train, X_test, y_train, y_test = train_test_split(df_new, target, test_size = 0.3, random_state = 42)

In [71]:
# Applying Logistic Regression on The Train Set
model = LogisticRegression(random_state=52)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


array([0., 0., 0., ..., 0., 0., 0.])

In [72]:
accuracy_score(y_test, y_pred)
# We get accuracy of 85.35%

0.8535980148883374