### Group 28 members:
- Jingze Tian (CCID)
- Letian Ren (CCID)
- Essam Gouda (egouda)

# Task 1: Regression
- Linear Regression for no_of_Mosquito = function(weather_features)
- Linear Regression for no_of_Mosquito_female = function(weather_features)
- Linear Regression for no_of_Mosquito_male = function(weather_features)
- Polynomial Regression for no_of_Mosquito_female = function(weather_features)
- Polynomial Regression for no_of_Mosquito_male = function(weather_features)

#### Variations for each model:
- Different cost functions were tested
- Model without normalization and standardization, model with normalization only, model with standardization only, model with both normalization and standardization.
- Feature selection
- model statistics compared at the end

## Imports

In [None]:
import pandas as pd
import numpy as np
import time


import matplotlib.pyplot as plt
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white") #white bg for sns plots
sns.set(style="whitegrid", color_codes=True)

from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

from scipy.stats import ks_2samp

import warnings; warnings.simplefilter('ignore')

## Read csv files (datasets)

In [None]:
# Load the weather and mosquito datasets from CSV (paths are relative to the working directory).
weather_data = pd.read_csv('./data/weather_data.csv')
mos_data = pd.read_csv('./data/mosquito_data_new.csv')

In [None]:
mos_data.head()

In [None]:
weather_data.head()

### Standardize mosquito trap date format

In [None]:
# Rewrite every trap timestamp as a "YYYY-MM-DD" date string.
# NOTE(review): the pattern hard-codes the literal "AM", so a "PM" timestamp would raise ValueError.
mos_data['Trap Date'] = [
    time.strftime("%Y-%m-%d", time.strptime(str(stamp), "%m/%d/%Y %H:%M:%S AM"))
    for stamp in mos_data['Trap Date']
]

mos_data.head()

### Align date ranges in both datasets
Both datasets have different date ranges

In [None]:
# Restrict the mosquito records to the period covered by the weather data.
mos_data['Trap Date'] = pd.to_datetime(mos_data['Trap Date'])

# Min/max of the weather timestamp column, which has not been parsed to datetimes yet.
# NOTE(review): these are compared against datetime values below — confirm the comparison parses as intended.
start_remove = min(weather_data['date_time_local'])
end_remove = max(weather_data['date_time_local'])
mos_data_1 = mos_data.loc[(mos_data["Trap Date"] > start_remove) & (mos_data["Trap Date"] < end_remove)]
mos_date_start = str(mos_data_1['Trap Date'].dt.date.min())
mos_date_end = str(mos_data_1['Trap Date'].dt.date.max())
mos_date_range = mos_date_start + ' to ' + mos_date_end
print("Range of dates in mosquito dataset is {}".format(mos_date_range))

# NOTE(review): both bounds are strict, so the first and last dates printed above are themselves
# excluded, and re-running this cell trims one more date from each end — confirm this is intended.
mos_range = (mos_data['Trap Date'] > mos_date_start) & (mos_data['Trap Date'] < mos_date_end)
mos_data = mos_data.loc[mos_range].sort_values(by='Trap Date')
mos_data #final mos_data

### Get total count for each day

In [None]:
# Total mosquito count per trap date.
mos_count = mos_data.groupby(["Trap Date"]).agg({"Count": "sum"})

# Row(s) holding the maximum daily count; not shown, because it is not the cell's last expression.
peak_count = mos_count['Count'].max()
mos_count[mos_count['Count'] == peak_count]

mos_count.head() ####number of mos

In [None]:
len(mos_count.index)

It can be seen that the IDd and Include columns contain many NaN values, so they will be dropped.

In [None]:
mos_data.isnull().sum()

In [None]:
# Drop the mostly-empty columns; errors='ignore' keeps the cell re-runnable after they are gone.
mos_data = mos_data.drop(columns=['IDd', 'Include'], errors='ignore')

In [None]:
mos_data.isnull().sum()

### Select date for weather features

In [None]:
# Date windows (applied inclusively) used to slice the weather data, one per year.
start_date1 = '2017-05-10'  # first day of the 7-day window ending 2017-05-16
end_date1 = '2017-09-26'
start_date2 = '2018-05-09'  # first day of the 7-day window ending 2018-05-15
end_date2 = '2018-09-18'

Drop MST rows (keep only rows whose timestamp is in MDT)
Drop columns with too many NaNs
Drop the unixtime column as it is redundant
Drop the wind_dir column, as wind_dir_10s already gives the wind angle, so it is redundant

In [None]:
############## Drop sparse/redundant columns and keep only MDT rows ##################
dropped_cols = ["visibility", "cloud_cover_4", "cloud_cover_8", "cloud_cover_10", "solar_radiation",
                "wind_gust", "windchill", "humidex", 'unixtime', 'wind_dir']
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
weather_data = weather_data.drop(columns=dropped_cols, errors='ignore')
# na=False treats a missing timestamp as "not MDT"; a NaN in the mask would make .loc raise.
weather_data = weather_data.loc[weather_data['date_time_local'].str.contains('MDT', na=False)]

weather_data.head()

### Date selection

In [None]:
# Reduce each "YYYY-MM-DD HH:MM:SS MDT" timestamp to its calendar date (datetime64 at midnight).
local_stamps = weather_data['date_time_local'].astype(str).str.replace(' MDT', '', regex=False)
weather_data = weather_data.assign(
    date_time_local=pd.to_datetime(local_stamps, format='%Y-%m-%d %H:%M:%S').dt.normalize()
)

# Keep the two date windows (bounds inclusive), each sorted by date.
weather_range1 = (weather_data['date_time_local'] >= start_date1) & (weather_data['date_time_local'] <= end_date1)
weather_data1 = weather_data.loc[weather_range1].sort_values(by='date_time_local')

weather_range2 = (weather_data['date_time_local'] >= start_date2) & (weather_data['date_time_local'] <= end_date2)
weather_data2 = weather_data.loc[weather_range2].sort_values(by='date_time_local')

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
weather_data = pd.concat([weather_data1, weather_data2])
weather_data.head()

#### Fill NaN for weather_data

In [None]:
weather_data.isnull().sum()

In [None]:
len(weather_data.index)

#### To fill the NaNs we will group the weather data by day, and there are 3 ways to do that for each column:
- By mean
- By mode
- By median

Each of these is applied per day. To choose between them, we visualize the distribution of each column and check that it stays similar after grouping, so that the grouping does not add bias to the data.

### Pressure station

In [None]:
weather_data['pressure_station'].describe()

#### mean

In [None]:
# Per-day mean of pressure_station, rounded to 2 decimals.
daily_spec = {'pressure_station': lambda readings: round(readings.mean(), 2)}
pressure_station_mean = weather_data.groupby('date_time_local').agg(daily_spec)

pressure_station_mean.describe()

In [None]:
key_to_test = 'pressure_station'

fig, ax = plt.subplots(figsize=(15, 8))
# Histogram + KDE of the ungrouped readings (teal) and of the per-day mean (orange) on one axes.
for series, colour in ((weather_data[key_to_test], 'teal'), (pressure_station_mean[key_to_test], 'orange')):
    series.hist(bins=15, density=True, stacked=True, color=colour, alpha=0.6, ax=ax)
    series.plot(kind='density', color=colour, ax=ax)

ax.legend(['Original data', 'Mean data'])
ax.set(xlabel=key_to_test)

plt.show()

In [None]:
# NaNs are dropped first: ks_2samp does not skip them, so they would distort or invalidate the test.
ks_2samp(weather_data['pressure_station'].dropna(), pressure_station_mean['pressure_station'].dropna())

#### mode

In [None]:
# Per-day mode of pressure_station: the first (most frequent) entry of value_counts().
daily_spec = {'pressure_station': lambda readings: readings.value_counts().index[0]}
pressure_station_mode = weather_data.groupby('date_time_local').agg(daily_spec)

pressure_station_mode.describe()

In [None]:
key_to_test = 'pressure_station'

fig, ax = plt.subplots(figsize=(15, 8))
# Histogram + KDE of the ungrouped readings (teal) and of the per-day mode (orange) on one axes.
for series, colour in ((weather_data[key_to_test], 'teal'), (pressure_station_mode[key_to_test], 'orange')):
    series.hist(bins=15, density=True, stacked=True, color=colour, alpha=0.6, ax=ax)
    series.plot(kind='density', color=colour, ax=ax)

ax.legend(['Original data', 'Mode data'])
ax.set(xlabel=key_to_test)

plt.show()

In [None]:
# NaNs are dropped first: ks_2samp does not skip them, so they would distort or invalidate the test.
ks_2samp(weather_data['pressure_station'].dropna(), pressure_station_mode['pressure_station'].dropna())

#### median

In [None]:
# Per-day median of pressure_station.
daily_spec = {'pressure_station': lambda readings: readings.median()}
pressure_station_median = weather_data.groupby('date_time_local').agg(daily_spec)

pressure_station_median.describe()

In [None]:
key_to_test = 'pressure_station'

fig, ax = plt.subplots(figsize=(15, 8))
# Histogram + KDE of the ungrouped readings (teal) and of the per-day median (orange) on one axes.
for series, colour in ((weather_data[key_to_test], 'teal'), (pressure_station_median[key_to_test], 'orange')):
    series.hist(bins=15, density=True, stacked=True, color=colour, alpha=0.6, ax=ax)
    series.plot(kind='density', color=colour, ax=ax)

ax.legend(['Original data', 'Median data'])
ax.set(xlabel=key_to_test)

plt.show()

In [None]:
# NaNs are dropped first: ks_2samp does not skip them, so they would distort or invalidate the test.
ks_2samp(weather_data['pressure_station'].dropna(), pressure_station_median['pressure_station'].dropna())

It can be seen that for pressure station mode has the highest p-value so it will be chosen for grouping

### Pressure sea

In [None]:
weather_data['pressure_sea'].describe()

#### mean

In [None]:
# Per-day mean of pressure_sea, rounded to 2 decimals.
daily_spec = {'pressure_sea': lambda readings: round(readings.mean(), 2)}
pressure_sea_mean = weather_data.groupby('date_time_local').agg(daily_spec)

pressure_sea_mean.describe()

In [None]:
key_to_test = 'pressure_sea'

fig, ax = plt.subplots(figsize=(15, 8))
# Histogram + KDE of the ungrouped readings (teal) and of the per-day mean (orange) on one axes.
for series, colour in ((weather_data[key_to_test], 'teal'), (pressure_sea_mean[key_to_test], 'orange')):
    series.hist(bins=15, density=True, stacked=True, color=colour, alpha=0.6, ax=ax)
    series.plot(kind='density', color=colour, ax=ax)

ax.legend(['Original data', 'Mean data'])
ax.set(xlabel=key_to_test)

plt.show()

In [None]:
# NaNs are dropped first: ks_2samp does not skip them, so they would distort or invalidate the test.
ks_2samp(weather_data['pressure_sea'].dropna(), pressure_sea_mean['pressure_sea'].dropna())

#### mode

In [None]:
# Per-day mode of pressure_sea: the first (most frequent) entry of value_counts().
daily_spec = {'pressure_sea': lambda readings: readings.value_counts().index[0]}
pressure_sea_mode = weather_data.groupby('date_time_local').agg(daily_spec)

pressure_sea_mode.describe()

In [None]:
key_to_test = 'pressure_sea'

fig, ax = plt.subplots(figsize=(15, 8))
# Histogram + KDE of the ungrouped readings (teal) and of the per-day mode (orange) on one axes.
for series, colour in ((weather_data[key_to_test], 'teal'), (pressure_sea_mode[key_to_test], 'orange')):
    series.hist(bins=15, density=True, stacked=True, color=colour, alpha=0.6, ax=ax)
    series.plot(kind='density', color=colour, ax=ax)

ax.legend(['Original data', 'Mode data'])
ax.set(xlabel=key_to_test)

plt.show()

In [None]:
# NaNs are dropped first: ks_2samp does not skip them, so they would distort or invalidate the test.
ks_2samp(weather_data['pressure_sea'].dropna(), pressure_sea_mode['pressure_sea'].dropna())

#### median

In [None]:
# Per-day median of pressure_sea.
daily_spec = {'pressure_sea': lambda readings: readings.median()}
pressure_sea_median = weather_data.groupby('date_time_local').agg(daily_spec)

pressure_sea_median.describe()

In [None]:
key_to_test = 'pressure_sea'

fig, ax = plt.subplots(figsize=(15, 8))
# Histogram + KDE of the ungrouped readings (teal) and of the per-day median (orange) on one axes.
for series, colour in ((weather_data[key_to_test], 'teal'), (pressure_sea_median[key_to_test], 'orange')):
    series.hist(bins=15, density=True, stacked=True, color=colour, alpha=0.6, ax=ax)
    series.plot(kind='density', color=colour, ax=ax)

ax.legend(['Original data', 'Median data'])
ax.set(xlabel=key_to_test)

plt.show()

In [None]:
# NaNs are dropped first: ks_2samp does not skip them, so they would distort or invalidate the test.
ks_2samp(weather_data['pressure_sea'].dropna(), pressure_sea_median['pressure_sea'].dropna())

It can be seen that for pressure sea median has the highest p-value so it will be chosen for grouping

## Wind_dir_10s
This column represents the wind direction in angles so taking a mean or median doesn't make much sense and mode is expected to be the best method.

In [None]:
weather_data['wind_dir_10s'].describe()

#### mean

In [None]:
# Per-day mean of wind_dir_10s, rounded to 1 decimal.
daily_spec = {'wind_dir_10s': lambda readings: round(readings.mean(), 1)}
wind_dir_10s_mean = weather_data.groupby('date_time_local').agg(daily_spec)

wind_dir_10s_mean.describe()

In [None]:
key_to_test = 'wind_dir_10s'

fig, ax = plt.subplots(figsize=(15, 8))
# Histogram + KDE of the ungrouped readings (teal) and of the per-day mean (orange) on one axes.
for series, colour in ((weather_data[key_to_test], 'teal'), (wind_dir_10s_mean[key_to_test], 'orange')):
    series.hist(bins=15, density=True, stacked=True, color=colour, alpha=0.6, ax=ax)
    series.plot(kind='density', color=colour, ax=ax)

ax.legend(['Original data', 'Mean data'])
ax.set(xlabel=key_to_test)

plt.show()

In [None]:
# NaNs are dropped first: ks_2samp does not skip them, so they would distort or invalidate the test.
ks_2samp(weather_data['wind_dir_10s'].dropna(), wind_dir_10s_mean['wind_dir_10s'].dropna())

#### mode

In [None]:
# Per-day mode of wind_dir_10s: the first (most frequent) entry of value_counts().
daily_spec = {'wind_dir_10s': lambda readings: readings.value_counts().index[0]}
wind_dir_10s_mode = weather_data.groupby('date_time_local').agg(daily_spec)

wind_dir_10s_mode.describe()

In [None]:
key_to_test = 'wind_dir_10s'

fig, ax = plt.subplots(figsize=(15, 8))
# Histogram + KDE of the ungrouped readings (teal) and of the per-day mode (orange) on one axes.
for series, colour in ((weather_data[key_to_test], 'teal'), (wind_dir_10s_mode[key_to_test], 'orange')):
    series.hist(bins=15, density=True, stacked=True, color=colour, alpha=0.6, ax=ax)
    series.plot(kind='density', color=colour, ax=ax)

ax.legend(['Original data', 'Mode data'])
ax.set(xlabel=key_to_test)

plt.show()

In [None]:
# NaNs are dropped first: ks_2samp does not skip them, so they would distort or invalidate the test.
ks_2samp(weather_data['wind_dir_10s'].dropna(), wind_dir_10s_mode['wind_dir_10s'].dropna())

#### median

In [None]:
# Per-day median of wind_dir_10s.
daily_spec = {'wind_dir_10s': lambda readings: readings.median()}
wind_dir_10s_median = weather_data.groupby('date_time_local').agg(daily_spec)

wind_dir_10s_median.describe()

In [None]:
key_to_test = 'wind_dir_10s'

fig, ax = plt.subplots(figsize=(15, 8))
# Histogram + KDE of the ungrouped readings (teal) and of the per-day median (orange) on one axes.
for series, colour in ((weather_data[key_to_test], 'teal'), (wind_dir_10s_median[key_to_test], 'orange')):
    series.hist(bins=15, density=True, stacked=True, color=colour, alpha=0.6, ax=ax)
    series.plot(kind='density', color=colour, ax=ax)

ax.legend(['Original data', 'Median data'])
ax.set(xlabel=key_to_test)

plt.show()

In [None]:
# NaNs are dropped first: ks_2samp does not skip them, so they would distort or invalidate the test.
ks_2samp(weather_data['wind_dir_10s'].dropna(), wind_dir_10s_median['wind_dir_10s'].dropna())

As expected mode will be the chosen value for wind_dir_10s column as it has the highest p-value, it can be seen that mean doesn't even follow the same distribution and the median barely does follow it.

## Wind_speed

In [None]:
weather_data['wind_speed'].describe()

#### mean

In [None]:
# Per-day mean of wind_speed, rounded to 1 decimal.
daily_spec = {'wind_speed': lambda readings: round(readings.mean(), 1)}
wind_speed_mean = weather_data.groupby('date_time_local').agg(daily_spec)

wind_speed_mean.describe()

In [None]:
key_to_test = 'wind_speed'

fig, ax = plt.subplots(figsize=(15, 8))
# Histogram + KDE of the ungrouped readings (teal) and of the per-day mean (orange) on one axes.
for series, colour in ((weather_data[key_to_test], 'teal'), (wind_speed_mean[key_to_test], 'orange')):
    series.hist(bins=15, density=True, stacked=True, color=colour, alpha=0.6, ax=ax)
    series.plot(kind='density', color=colour, ax=ax)

ax.legend(['Original data', 'Mean data'])
ax.set(xlabel=key_to_test)

plt.show()

In [None]:
# NaNs are dropped first: ks_2samp does not skip them, so they would distort or invalidate the test.
ks_2samp(weather_data['wind_speed'].dropna(), wind_speed_mean['wind_speed'].dropna())

#### mode

In [None]:
# Per-day mode of wind_speed: the first (most frequent) entry of value_counts().
daily_spec = {'wind_speed': lambda readings: readings.value_counts().index[0]}
wind_speed_mode = weather_data.groupby('date_time_local').agg(daily_spec)

wind_speed_mode.describe()

In [None]:
key_to_test = 'wind_speed'

fig, ax = plt.subplots(figsize=(15, 8))
# Histogram + KDE of the ungrouped readings (teal) and of the per-day mode (orange) on one axes.
for series, colour in ((weather_data[key_to_test], 'teal'), (wind_speed_mode[key_to_test], 'orange')):
    series.hist(bins=15, density=True, stacked=True, color=colour, alpha=0.6, ax=ax)
    series.plot(kind='density', color=colour, ax=ax)

ax.legend(['Original data', 'Mode data'])
ax.set(xlabel=key_to_test)

plt.show()

In [None]:
# NaNs are dropped first: ks_2samp does not skip them, so they would distort or invalidate the test.
ks_2samp(weather_data['wind_speed'].dropna(), wind_speed_mode['wind_speed'].dropna())

#### median

In [None]:
# Per-day median of wind_speed.
daily_spec = {'wind_speed': lambda readings: readings.median()}
wind_speed_median = weather_data.groupby('date_time_local').agg(daily_spec)

wind_speed_median.describe()

In [None]:
key_to_test = 'wind_speed'

fig, ax = plt.subplots(figsize=(15, 8))
# Histogram + KDE of the ungrouped readings (teal) and of the per-day median (orange) on one axes.
for series, colour in ((weather_data[key_to_test], 'teal'), (wind_speed_median[key_to_test], 'orange')):
    series.hist(bins=15, density=True, stacked=True, color=colour, alpha=0.6, ax=ax)
    series.plot(kind='density', color=colour, ax=ax)

ax.legend(['Original data', 'Median data'])
ax.set(xlabel=key_to_test)

plt.show()

In [None]:
# NaNs are dropped first: ks_2samp does not skip them, so they would distort or invalidate the test.
ks_2samp(weather_data['wind_speed'].dropna(), wind_speed_median['wind_speed'].dropna())

Mode will be used for wind_speed

## Relative_humidity

In [None]:
weather_data['relative_humidity'].describe()

#### mean

In [None]:
# Per-day mean of relative_humidity, rounded to 1 decimal.
daily_spec = {'relative_humidity': lambda readings: round(readings.mean(), 1)}
relative_humidity_mean = weather_data.groupby('date_time_local').agg(daily_spec)

relative_humidity_mean.describe()

In [None]:
key_to_test = 'relative_humidity'

fig, ax = plt.subplots(figsize=(15, 8))
# Histogram + KDE of the ungrouped readings (teal) and of the per-day mean (orange) on one axes.
for series, colour in ((weather_data[key_to_test], 'teal'), (relative_humidity_mean[key_to_test], 'orange')):
    series.hist(bins=15, density=True, stacked=True, color=colour, alpha=0.6, ax=ax)
    series.plot(kind='density', color=colour, ax=ax)

ax.legend(['Original data', 'Mean data'])
ax.set(xlabel=key_to_test)

plt.show()

In [None]:
# NaNs are dropped first: ks_2samp does not skip them, so they would distort or invalidate the test.
ks_2samp(weather_data['relative_humidity'].dropna(), relative_humidity_mean['relative_humidity'].dropna())

#### mode

In [None]:
# Per-day mode of relative_humidity: the first (most frequent) entry of value_counts().
daily_spec = {'relative_humidity': lambda readings: readings.value_counts().index[0]}
relative_humidity_mode = weather_data.groupby('date_time_local').agg(daily_spec)

relative_humidity_mode.describe()

In [None]:
key_to_test = 'relative_humidity'

fig, ax = plt.subplots(figsize=(15, 8))
# Histogram + KDE of the ungrouped readings (teal) and of the per-day mode (orange) on one axes.
for series, colour in ((weather_data[key_to_test], 'teal'), (relative_humidity_mode[key_to_test], 'orange')):
    series.hist(bins=15, density=True, stacked=True, color=colour, alpha=0.6, ax=ax)
    series.plot(kind='density', color=colour, ax=ax)

ax.legend(['Original data', 'Mode data'])
ax.set(xlabel=key_to_test)

plt.show()

In [None]:
# NaNs are dropped first: ks_2samp does not skip them, so they would distort or invalidate the test.
ks_2samp(weather_data['relative_humidity'].dropna(), relative_humidity_mode['relative_humidity'].dropna())

#### median

In [None]:
# Per-day median of relative_humidity.
daily_spec = {'relative_humidity': lambda readings: readings.median()}
relative_humidity_median = weather_data.groupby('date_time_local').agg(daily_spec)

relative_humidity_median.describe()

In [None]:
key_to_test = 'relative_humidity'

fig, ax = plt.subplots(figsize=(15, 8))
# Histogram + KDE of the ungrouped readings (teal) and of the per-day median (orange) on one axes.
for series, colour in ((weather_data[key_to_test], 'teal'), (relative_humidity_median[key_to_test], 'orange')):
    series.hist(bins=15, density=True, stacked=True, color=colour, alpha=0.6, ax=ax)
    series.plot(kind='density', color=colour, ax=ax)

ax.legend(['Original data', 'Median data'])
ax.set(xlabel=key_to_test)

plt.show()

In [None]:
# NaNs are dropped first: ks_2samp does not skip them, so they would distort or invalidate the test.
ks_2samp(weather_data['relative_humidity'].dropna(), relative_humidity_median['relative_humidity'].dropna())

Grouping relative_humidity would introduce bias into the dataset, so this column will be ignored.

## Dew_point

In [None]:
weather_data['dew_point'].describe()

#### mean

In [None]:
# Per-day mean of dew_point, rounded to 1 decimal.
daily_spec = {'dew_point': lambda readings: round(readings.mean(), 1)}
dew_point_mean = weather_data.groupby('date_time_local').agg(daily_spec)

dew_point_mean.describe()

In [None]:
key_to_test = 'dew_point'

fig, ax = plt.subplots(figsize=(15, 8))
# Histogram + KDE of the ungrouped readings (teal) and of the per-day mean (orange) on one axes.
for series, colour in ((weather_data[key_to_test], 'teal'), (dew_point_mean[key_to_test], 'orange')):
    series.hist(bins=15, density=True, stacked=True, color=colour, alpha=0.6, ax=ax)
    series.plot(kind='density', color=colour, ax=ax)

ax.legend(['Original data', 'Mean data'])
ax.set(xlabel=key_to_test)

plt.show()

In [None]:
# NaNs are dropped first: ks_2samp does not skip them, so they would distort or invalidate the test.
ks_2samp(weather_data['dew_point'].dropna(), dew_point_mean['dew_point'].dropna())

#### mode

In [None]:
# Per-day mode of dew_point: the first (most frequent) entry of value_counts().
daily_spec = {'dew_point': lambda readings: readings.value_counts().index[0]}
dew_point_mode = weather_data.groupby('date_time_local').agg(daily_spec)

dew_point_mode.describe()

In [None]:
key_to_test = 'dew_point'

fig, ax = plt.subplots(figsize=(15, 8))
# Histogram + KDE of the ungrouped readings (teal) and of the per-day mode (orange) on one axes.
for series, colour in ((weather_data[key_to_test], 'teal'), (dew_point_mode[key_to_test], 'orange')):
    series.hist(bins=15, density=True, stacked=True, color=colour, alpha=0.6, ax=ax)
    series.plot(kind='density', color=colour, ax=ax)

ax.legend(['Original data', 'Mode data'])
ax.set(xlabel=key_to_test)

plt.show()

In [None]:
# NaNs are dropped first: ks_2samp does not skip them, so they would distort or invalidate the test.
ks_2samp(weather_data['dew_point'].dropna(), dew_point_mode['dew_point'].dropna())

#### median

In [None]:
# Per-day median of dew_point.
daily_spec = {'dew_point': lambda readings: readings.median()}
dew_point_median = weather_data.groupby('date_time_local').agg(daily_spec)

dew_point_median.describe()

In [None]:
key_to_test = 'dew_point'

fig, ax = plt.subplots(figsize=(15, 8))
# Histogram + KDE of the ungrouped readings (teal) and of the per-day median (orange) on one axes.
for series, colour in ((weather_data[key_to_test], 'teal'), (dew_point_median[key_to_test], 'orange')):
    series.hist(bins=15, density=True, stacked=True, color=colour, alpha=0.6, ax=ax)
    series.plot(kind='density', color=colour, ax=ax)

ax.legend(['Original data', 'Median data'])
ax.set(xlabel=key_to_test)

plt.show()

In [None]:
# NaNs are dropped first: ks_2samp does not skip them, so they would distort or invalidate the test.
ks_2samp(weather_data['dew_point'].dropna(), dew_point_median['dew_point'].dropna())

Mode will be used for dew_point

## Temperature

In [None]:
weather_data['temperature'].describe()

#### mean

In [None]:
# Per-day mean of temperature, rounded to 1 decimal.
daily_spec = {'temperature': lambda readings: round(readings.mean(), 1)}
temperature_mean = weather_data.groupby('date_time_local').agg(daily_spec)

temperature_mean.describe()

In [None]:
key_to_test = 'temperature'

fig, ax = plt.subplots(figsize=(15, 8))
# Histogram + KDE of the ungrouped readings (teal) and of the per-day mean (orange) on one axes.
for series, colour in ((weather_data[key_to_test], 'teal'), (temperature_mean[key_to_test], 'orange')):
    series.hist(bins=15, density=True, stacked=True, color=colour, alpha=0.6, ax=ax)
    series.plot(kind='density', color=colour, ax=ax)

ax.legend(['Original data', 'Mean data'])
ax.set(xlabel=key_to_test)

plt.show()

In [None]:
# NaNs are dropped first: ks_2samp does not skip them, so they would distort or invalidate the test.
ks_2samp(weather_data['temperature'].dropna(), temperature_mean['temperature'].dropna())

#### mode

In [None]:
# Per-day mode of temperature: the first (most frequent) entry of value_counts().
daily_spec = {'temperature': lambda readings: readings.value_counts().index[0]}
temperature_mode = weather_data.groupby('date_time_local').agg(daily_spec)

temperature_mode.describe()

In [None]:
key_to_test = 'temperature'

fig, ax = plt.subplots(figsize=(15, 8))
# Histogram + KDE of the ungrouped readings (teal) and of the per-day mode (orange) on one axes.
for series, colour in ((weather_data[key_to_test], 'teal'), (temperature_mode[key_to_test], 'orange')):
    series.hist(bins=15, density=True, stacked=True, color=colour, alpha=0.6, ax=ax)
    series.plot(kind='density', color=colour, ax=ax)

ax.legend(['Original data', 'Mode data'])
ax.set(xlabel=key_to_test)

plt.show()

In [None]:
# NaNs are dropped first: ks_2samp does not skip them, so they would distort or invalidate the test.
ks_2samp(weather_data['temperature'].dropna(), temperature_mode['temperature'].dropna())

#### median

In [None]:
# Per-day median of temperature.
daily_spec = {'temperature': lambda readings: readings.median()}
temperature_median = weather_data.groupby('date_time_local').agg(daily_spec)

temperature_median.describe()

In [None]:
key_to_test = 'temperature'

fig, ax = plt.subplots(figsize=(15, 8))
# Histogram + KDE of the ungrouped readings (teal) and of the per-day median (orange) on one axes.
for series, colour in ((weather_data[key_to_test], 'teal'), (temperature_median[key_to_test], 'orange')):
    series.hist(bins=15, density=True, stacked=True, color=colour, alpha=0.6, ax=ax)
    series.plot(kind='density', color=colour, ax=ax)

ax.legend(['Original data', 'Median data'])
ax.set(xlabel=key_to_test)

plt.show()

In [None]:
# NaNs are dropped first: ks_2samp does not skip them, so they would distort or invalidate the test.
ks_2samp(weather_data['temperature'].dropna(), temperature_median['temperature'].dropna())

Mode will be used for temperature

## Health_index

In [None]:
weather_data['health_index'].describe()

#### mean

In [None]:
# Per-day mean of health_index, rounded to 1 decimal.
daily_spec = {'health_index': lambda readings: round(readings.mean(), 1)}
health_index_mean = weather_data.groupby('date_time_local').agg(daily_spec)

health_index_mean.describe()

In [None]:
key_to_test = 'health_index'

fig, ax = plt.subplots(figsize=(15, 8))
# Histogram + KDE of the ungrouped readings (teal) and of the per-day mean (orange) on one axes.
for series, colour in ((weather_data[key_to_test], 'teal'), (health_index_mean[key_to_test], 'orange')):
    series.hist(bins=15, density=True, stacked=True, color=colour, alpha=0.6, ax=ax)
    series.plot(kind='density', color=colour, ax=ax)

ax.legend(['Original data', 'Mean data'])
ax.set(xlabel=key_to_test)

plt.show()

In [None]:
# NaNs are dropped first: ks_2samp does not skip them, so they would distort or invalidate the test.
ks_2samp(weather_data['health_index'].dropna(), health_index_mean['health_index'].dropna())

#### mode

In [None]:
# Per-day mode of health_index: the first (most frequent) entry of value_counts().
daily_spec = {'health_index': lambda readings: readings.value_counts().index[0]}
health_index_mode = weather_data.groupby('date_time_local').agg(daily_spec)

health_index_mode.describe()

In [None]:
key_to_test = 'health_index'

fig, ax = plt.subplots(figsize=(15, 8))
# Histogram + KDE of the ungrouped readings (teal) and of the per-day mode (orange) on one axes.
for series, colour in ((weather_data[key_to_test], 'teal'), (health_index_mode[key_to_test], 'orange')):
    series.hist(bins=15, density=True, stacked=True, color=colour, alpha=0.6, ax=ax)
    series.plot(kind='density', color=colour, ax=ax)

ax.legend(['Original data', 'Mode data'])
ax.set(xlabel=key_to_test)

plt.show()

In [None]:
# NaNs are dropped first: ks_2samp does not skip them, so they would distort or invalidate the test.
ks_2samp(weather_data['health_index'].dropna(), health_index_mode['health_index'].dropna())

#### median

In [None]:
# Per-day median of health_index.
daily_spec = {'health_index': lambda readings: readings.median()}
health_index_median = weather_data.groupby('date_time_local').agg(daily_spec)

health_index_median.describe()

In [None]:
key_to_test = 'health_index'

fig, ax = plt.subplots(figsize=(15, 8))
# Histogram + KDE of the ungrouped readings (teal) and of the per-day median (orange) on one axes.
for series, colour in ((weather_data[key_to_test], 'teal'), (health_index_median[key_to_test], 'orange')):
    series.hist(bins=15, density=True, stacked=True, color=colour, alpha=0.6, ax=ax)
    series.plot(kind='density', color=colour, ax=ax)

ax.legend(['Original data', 'Median data'])
ax.set(xlabel=key_to_test)

plt.show()

In [None]:
# NaNs are dropped first: ks_2samp does not skip them, so they would distort or invalidate the test.
ks_2samp(weather_data['health_index'].dropna(), health_index_median['health_index'].dropna())

Mean will be used for health_index

## Grouping weather data by date
- Pressure station by mode
- Pressure sea by median
- wind_dir_10s by mode
- wind_speed by mode
- relative_humidity ignored
- dew_point by mode
- temperature by mode
- health_index by mean

In [None]:
def _day_mode(values):
    """Return the most frequent value of the group (first entry of value_counts)."""
    return values.value_counts().index[0]


# One row per date; each column uses the aggregation chosen in the sections above.
weather_data_grouped = (
    weather_data
    .groupby('date_time_local', as_index=False)
    .agg({
        'pressure_station': _day_mode,
        'pressure_sea': lambda values: values.median(),
        'wind_dir_10s': _day_mode,
        'wind_speed': _day_mode,
        'dew_point': _day_mode,
        'temperature': _day_mode,
        'health_index': lambda values: round(values.mean(), 1),
    })
)

print(len(weather_data_grouped.index))

weather_data_grouped.head()

In [None]:
weather_data_grouped.isnull().sum()

#### Further group weather data by week

In [None]:
# Order the daily rows chronologically, then preview the first seven of them.
weather_data_grouped = weather_data_grouped.sort_values('date_time_local')
weather_data_grouped.head(7)

In [None]:
def _week_mode(values):
    """Return the most frequent value within one 7-row block (first entry of value_counts)."""
    return values.value_counts().index[0]


# Bin rows into consecutive blocks of 7 by position rather than by index label, so the
# binning stays correct even if the index labels are not 0..n-1 in row order.
# NOTE(review): this assumes exactly one row per calendar day with no gaps, and the cell
# overwrites its own input, so running it twice re-bins the weekly rows — confirm / guard.
week_id = np.arange(len(weather_data_grouped)) // 7
weather_data_grouped = weather_data_grouped.groupby(week_id).agg({
    'date_time_local': 'last',  # each block is labelled with its final date
    'pressure_station': _week_mode,
    'pressure_sea': lambda x: x.median(),
    'wind_dir_10s': _week_mode,
    'wind_speed': _week_mode,
    'dew_point': _week_mode,
    'temperature': _week_mode,
    'health_index': lambda x: round(x.mean(), 1)
})

print(len(weather_data_grouped.index))

## Grouping mosquito data


There are 3 candidate columns to group mosquito data by:
- Trap_date: date of trapping mosquitos
- Genus: When biologists talk about a genus, they mean one or more species of animals or plants that are closely related to each other. Low-level taxonomic ranking for biological classification.
- Specific Epithet: lowest taxonomic rank and having common characteristics and (usually) capable of mating with one another.

Add Gender for part B

In [None]:
mos_data.head()

In [None]:
def _top_value(values):
    """Return the most frequent entry of the group (first entry of value_counts)."""
    return values.value_counts().index[0]


# One row per trap date: total count plus the most frequent Genus and Gender.
mos_data_grouped = (
    mos_data
    .groupby(['Trap Date'], as_index=False)
    .agg({'Count': 'sum', 'Genus': _top_value, 'Gender': _top_value})
)

print(len(mos_data_grouped.index))

mos_data_grouped.head()

In [None]:
# Join (pandas' default inner join): keep only trap dates that equal a date in the grouped weather table.
merged_data = mos_data_grouped.merge(
    weather_data_grouped,
    left_on='Trap Date',
    right_on='date_time_local',
)

print(len(merged_data.index))

# After the join this column duplicates 'Trap Date'.
merged_data = merged_data.drop(columns=['date_time_local'])

merged_data.head()

In [None]:
merged_data.isnull().sum()

## Add features
- isWarm: 1 if temperature is 20 or above (>= 20), else 0.
- add genusCat for Genus
- add genderCat female = 1, male = 0 (currently disabled: the corresponding cell is commented out)
- add delta_pressure = pressure_station - pressure_sea

In [None]:
# Binary flag: 1 where temperature is at least 20, otherwise 0.
warm_mask = merged_data['temperature'] >= 20
merged_data['isWarm'] = np.where(warm_mask, 1, 0)

In [None]:
# Convert Genus to a categorical column and expose its integer codes as genusCat.
genus_categorical = merged_data['Genus'].astype('category')
merged_data['Genus'] = genus_categorical
merged_data['genusCat'] = genus_categorical.cat.codes

In [None]:
# merged_data['genderCat'] = np.where(merged_data['Gender'] == "Female", 1, 0)

In [None]:
# Station pressure minus sea-level pressure, computed column-wise instead of with a row-by-row apply.
merged_data["delta_pressure"] = merged_data['pressure_station'] - merged_data['pressure_sea']

In [None]:
merged_data.head()

# Part A: Linear Regression for no_of_Mosquito = function(weather_features)

In [None]:
# Distribution of the target: the summed mosquito count per trap date.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — consider histplot/displot.
sns.distplot(merged_data['Count'])

In [None]:
merged_data['Count'].describe()

In [None]:
#merged_data = merged_data[merged_data['Count'] < merged_data['Count'].describe()['75%']]  

In [None]:
fig, ax = plt.subplots(figsize=(12,12))
# Correlate numeric columns only: merged_data also holds non-numeric columns (e.g. the
# categorical Genus), and pandas >= 2.0 raises on those instead of silently skipping them.
numeric_cols = merged_data.select_dtypes(include='number')
sns.heatmap(numeric_cols.corr(), annot=True, ax=ax)

In [None]:
# Predictor columns and regression target (total mosquito count).
x_cols = ['wind_dir_10s', 'wind_speed', 'genusCat', 'delta_pressure']
X = merged_data.loc[:, x_cols]
y = merged_data.loc[:, 'Count']

In [None]:
from sklearn.preprocessing import StandardScaler

# Only these columns are rescaled; genusCat keeps its category codes.
x_cols = ['wind_dir_10s', 'wind_speed', 'delta_pressure']


def _min_max(column):
    """Rescale one column linearly so its minimum maps to 0 and its maximum to 1."""
    low, high = column.min(), column.max()
    return (column - low) / (high - low)


# NOTE(review): every scaler below is fitted on all rows of X, i.e. before any train/test split.
normalized_part = X[x_cols].apply(_min_max)

X_stand = X.copy()
X_stand[x_cols] = StandardScaler().fit_transform(X[x_cols])

X_norm = X.copy()
X_norm[x_cols] = normalized_part

# Min-max scaling followed by standardization.
X_both = X.copy()
X_both[x_cols] = StandardScaler().fit_transform(normalized_part[x_cols])

In [None]:
from sklearn.model_selection import train_test_split

test_size = 0.1
# The same test fraction and seed are used for every feature variant.
split_options = {'test_size': test_size, 'random_state': 42}

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train_stand, X_test_stand, y_train_stand, y_test_stand = train_test_split(X_stand, y, **split_options)
X_train_norm, X_test_norm, y_train_norm, y_test_norm = train_test_split(X_norm, y, **split_options)
X_train_both, X_test_both, y_train_both, y_test_both = train_test_split(X_both, y, **split_options)


In [None]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score


def cross_val(model):
    """Return the mean 10-fold cross-validation score of `model` on the notebook-level X and y."""
    pred = cross_val_score(model, X, y, cv=10)
    return pred.mean()


def evaluate(true, predicted):
    """Return (MAE, MSE, RMSE, R^2) for `predicted` against `true`."""
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)  # reuse the MSE instead of computing it a second time
    r2_square = metrics.r2_score(true, predicted)
    return mae, mse, rmse, r2_square


def print_evaluate(true, predicted):
    """Print the four metrics computed by evaluate(), one per line."""
    mae, mse, rmse, r2_square = evaluate(true, predicted)
    print('MAE:', mae)
    print('MSE:', mse)
    print('RMSE:', rmse)
    print('R2 Square', r2_square)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit one ordinary-least-squares model per preprocessing variant
# (`fit` returns the estimator itself, so it can be chained).
lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg_norm = LinearRegression().fit(X_train_norm, y_train_norm)
lin_reg_stand = LinearRegression().fit(X_train_stand, y_train_stand)
lin_reg_both = LinearRegression().fit(X_train_both, y_train_both)

# Keep the last fitted estimator as the cell's displayed value.
lin_reg_both

In [None]:
print(lin_reg.intercept_)
coeff_df = pd.DataFrame(lin_reg.coef_, X.columns, columns=['Coefficient'])
coeff_df

In [None]:
print(lin_reg_norm.intercept_)
coeff_df = pd.DataFrame(lin_reg_norm.coef_, X.columns, columns=['Coefficient'])
coeff_df

In [None]:
print(lin_reg_stand.intercept_)
coeff_df = pd.DataFrame(lin_reg_stand.coef_, X.columns, columns=['Coefficient'])
coeff_df

In [None]:
print(lin_reg_both.intercept_)
coeff_df = pd.DataFrame(lin_reg_both.coef_, X.columns, columns=['Coefficient'])
coeff_df

In [None]:
# Held-out predictions from each of the four fitted models.
pred = lin_reg.predict(X_test)
pred_norm = lin_reg_norm.predict(X_test_norm)
pred_stand = lin_reg_stand.predict(X_test_stand)
pred_both = lin_reg_both.predict(X_test_both)


# Actual vs. predicted counts. Each series now plots its own model's
# predictions; previously all four series re-plotted the unscaled `pred`,
# so the scaled models were never actually shown.
plt.figure(figsize=(15, 8))
ax = plt.gca()

ax.scatter(y_test, pred, c='#2ca02c', marker='x', label='Original data')
ax.scatter(y_test_norm, pred_norm, c='r', marker='o', label='Normalized data')
ax.scatter(y_test_stand, pred_stand, c='b', marker='+', label='Standardized data')
ax.scatter(y_test_both, pred_both, c='y', marker='s', label='Both data')

ax.set_xlabel('Actual count')
ax.set_ylabel('Predicted count')
plt.legend(loc='best');

plt.show()

In [None]:
# Distribution of held-out residuals (actual - predicted) for the unscaled model.
# NOTE(review): seaborn has deprecated `distplot` in favour of `histplot`/`displot` - confirm the installed version.
sns.distplot((y_test - pred), bins=50);

In [None]:
# Unscaled model: error metrics on the held-out rows, then on the training rows.
test_pred = lin_reg.predict(X_test)
train_pred = lin_reg.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

In [None]:
# Min-max normalized model: same test/train report.
test_pred_norm = lin_reg_norm.predict(X_test_norm)
train_pred_norm = lin_reg_norm.predict(X_train_norm)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test_norm, test_pred_norm)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train_norm, train_pred_norm)

In [None]:
# Standardized model: same test/train report.
test_pred_stand = lin_reg_stand.predict(X_test_stand)
train_pred_stand = lin_reg_stand.predict(X_train_stand)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test_stand, test_pred_stand)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train_stand, train_pred_stand)

In [None]:
# Normalized-then-standardized model: same test/train report.
test_pred_both = lin_reg_both.predict(X_test_both)
train_pred_both = lin_reg_both.predict(X_train_both)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test_both, test_pred_both)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train_both, train_pred_both)

In [None]:
# One row per preprocessing variant: held-out metrics plus the mean 10-fold
# cross-validation score computed on that variant's own feature matrix.
# Previously every row cross-validated the same unscaled X, so the
# "Cross Validation" column could not differ between variants.
result_rows = [
    ("Linear Regression", X, y_test, test_pred),
    ("Linear Regression w/ norm", X_norm, y_test_norm, test_pred_norm),
    ("Linear Regression w/ stand", X_stand, y_test_stand, test_pred_stand),
    ("Linear Regression w/ both", X_both, y_test_both, test_pred_both),
]
results_df = pd.DataFrame(
    data=[
        [label, *evaluate(truth, guess),
         cross_val_score(LinearRegression(), feats, y, cv=10).mean()]
        for label, feats, truth, guess in result_rows
    ],
    columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', "Cross Validation"],
)
results_df

# Part B: count depending on Gender

In [None]:
def _most_common(values):
    """Return the most frequent entry of `values` (first index of `value_counts`)."""
    return values.value_counts().index[0]


# Collapse the trap records to one row per (trap date, gender): counts are
# summed and the most frequent genus in the group is kept.
mos_data_grouped_B = mos_data.groupby(['Trap Date', 'Gender'], as_index=False).agg({
    'Count': 'sum',
    'Genus': _most_common,
})

print(mos_data_grouped_B.shape[0])

mos_data_grouped_B.head()

In [None]:
# Split the per-date aggregates by gender.
mos_data_grouped_B_females = mos_data_grouped_B.loc[mos_data_grouped_B["Gender"].eq("Female")]
mos_data_grouped_B_males = mos_data_grouped_B.loc[mos_data_grouped_B["Gender"].eq("Male")]

In [None]:
# Row counts of the female and male subsets.
print(mos_data_grouped_B_females.shape[0], mos_data_grouped_B_males.shape[0])

## Linear Regression

### Females

In [None]:
# Join the female counts with the weather aggregates on the date
# (pd.merge defaults to an inner join); the right-hand key duplicates
# 'Trap Date', so it is dropped as redundant.
merged_data_females = pd.merge(
    left=mos_data_grouped_B_females,
    right=weather_data_grouped,
    left_on='Trap Date',
    right_on='date_time_local',
).drop(columns=['date_time_local'])

print(len(merged_data_females.index))

merged_data_females.head()

In [None]:
# Distribution of the female count target.
# NOTE(review): seaborn has deprecated `distplot` in favour of `histplot`/`displot` - confirm the installed version.
sns.distplot(merged_data_females['Count'])

In [None]:
merged_data_females['isWarm'] = np.where(merged_data_females['temperature'] >= 20, 1, 0)

In [None]:
merged_data_females['Genus'] = merged_data_females['Genus'].astype('category')
merged_data_females['genusCat'] = merged_data_females['Genus'].cat.codes

In [None]:
merged_data_females["delta_pressure"] = merged_data_females.apply(lambda x: x['pressure_station'] - x['pressure_sea'], axis=1)

In [None]:
# Correlation heatmap of the numeric columns only. The frame also holds
# date, string and categorical columns (Trap Date, Gender, Genus); recent
# pandas versions no longer drop those silently in `corr()`, so they are
# excluded explicitly to keep the cell working across pandas versions.
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(
    merged_data_females.select_dtypes(include=['number', 'bool']).corr(),
    annot=True,
    ax=ax,
)

In [None]:
# Predictor columns and target for the female-count linear regression.
x_cols = ['pressure_station', 'wind_speed', 'dew_point', 'temperature', 'genusCat', 'delta_pressure']
X = merged_data_females.loc[:, x_cols]
y = merged_data_females.loc[:, 'Count']

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to rescale; the integer-coded `genusCat` is left as-is.
x_cols = ['pressure_station', 'wind_speed', 'dew_point', 'temperature', 'delta_pressure']

# NOTE(review): the scalers see every row here, before the train/test split
# performed in the next cell, so test-set statistics leak into the scaling.
to_scale = X[x_cols]
col_min, col_max = to_scale.min(), to_scale.max()
min_max = (to_scale - col_min) / (col_max - col_min)  # each column mapped to [0, 1]

# Three copies of X: standardized, min-max normalized, and normalized-then-standardized.
X_stand, X_norm, X_both = X.copy(), X.copy(), X.copy()
X_stand[x_cols] = StandardScaler().fit_transform(to_scale)
X_norm[x_cols] = min_max
X_both[x_cols] = StandardScaler().fit_transform(min_max)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows. The same seed is used for every variant, so all
# four splits are made with identical settings against the same target `y`.
test_size = 0.1
split_kwargs = dict(test_size=test_size, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)
X_train_stand, X_test_stand, y_train_stand, y_test_stand = train_test_split(X_stand, y, **split_kwargs)
X_train_norm, X_test_norm, y_train_norm, y_test_norm = train_test_split(X_norm, y, **split_kwargs)
X_train_both, X_test_both, y_train_both, y_test_both = train_test_split(X_both, y, **split_kwargs)


In [None]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score

def cross_val(model, features=None, target=None, cv=10):
    """Return the mean k-fold cross-validation score of `model`.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator passed to `cross_val_score`.
    features, target : array-like, optional
        Data to cross-validate on. When omitted, the notebook-level `X` and
        `y` that are current at call time are used (the original behaviour),
        so existing `cross_val(model)` calls keep working.
    cv : int, default 10
        Number of folds.

    Returns
    -------
    float
        Mean of the per-fold scores (the estimator's default scorer; R^2 for
        scikit-learn regressors).
    """
    # Fall back to the globals only when the caller did not supply data, so a
    # scaled feature matrix (X_norm, X_stand, ...) can be scored explicitly.
    features = X if features is None else features
    target = y if target is None else target
    fold_scores = cross_val_score(model, features, target, cv=cv)
    return fold_scores.mean()

def print_evaluate(true, predicted):
    """Print MAE, MSE, RMSE and R^2 of `predicted` against `true`, one per line."""
    mse = metrics.mean_squared_error(true, predicted)
    report = (
        ('MAE:', metrics.mean_absolute_error(true, predicted)),
        ('MSE:', mse),
        ('RMSE:', np.sqrt(mse)),  # RMSE is the square root of the MSE above
        ('R2 Square', metrics.r2_score(true, predicted)),
    )
    for label, value in report:
        print(label, value)
    
def evaluate(true, predicted):
    """Return `(mae, mse, rmse, r2)` for `predicted` against `true`."""
    mse = metrics.mean_squared_error(true, predicted)
    return (
        metrics.mean_absolute_error(true, predicted),
        mse,
        np.sqrt(mse),
        metrics.r2_score(true, predicted),
    )

In [None]:
from sklearn.linear_model import LinearRegression

# Fit one ordinary-least-squares model per preprocessing variant
# (`fit` returns the estimator itself, so it can be chained).
lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg_norm = LinearRegression().fit(X_train_norm, y_train_norm)
lin_reg_stand = LinearRegression().fit(X_train_stand, y_train_stand)
lin_reg_both = LinearRegression().fit(X_train_both, y_train_both)

# Keep the last fitted estimator as the cell's displayed value.
lin_reg_both

In [None]:
print(lin_reg.intercept_)
coeff_df = pd.DataFrame(lin_reg.coef_, X.columns, columns=['Coefficient'])
coeff_df

In [None]:
print(lin_reg_norm.intercept_)
coeff_df = pd.DataFrame(lin_reg_norm.coef_, X.columns, columns=['Coefficient'])
coeff_df

In [None]:
print(lin_reg_stand.intercept_)
coeff_df = pd.DataFrame(lin_reg_stand.coef_, X.columns, columns=['Coefficient'])
coeff_df

In [None]:
print(lin_reg_both.intercept_)
coeff_df = pd.DataFrame(lin_reg_both.coef_, X.columns, columns=['Coefficient'])
coeff_df

In [None]:
# Held-out predictions from each of the four fitted models.
pred = lin_reg.predict(X_test)
pred_norm = lin_reg_norm.predict(X_test_norm)
pred_stand = lin_reg_stand.predict(X_test_stand)
pred_both = lin_reg_both.predict(X_test_both)


# Actual vs. predicted counts. Each series now plots its own model's
# predictions; previously all four series re-plotted the unscaled `pred`,
# so the scaled models were never actually shown.
plt.figure(figsize=(15, 8))
ax = plt.gca()

ax.scatter(y_test, pred, c='#2ca02c', marker='x', label='Original data')
ax.scatter(y_test_norm, pred_norm, c='r', marker='o', label='Normalized data')
ax.scatter(y_test_stand, pred_stand, c='b', marker='+', label='Standardized data')
ax.scatter(y_test_both, pred_both, c='y', marker='s', label='Both data')

ax.set_xlabel('Actual count')
ax.set_ylabel('Predicted count')
plt.legend(loc='best');

plt.show()

In [None]:
# Distribution of held-out residuals (actual - predicted) for the unscaled model.
# NOTE(review): seaborn has deprecated `distplot` in favour of `histplot`/`displot` - confirm the installed version.
sns.distplot((y_test - pred), bins=50);

In [None]:
# Unscaled model: error metrics on the held-out rows, then on the training rows.
test_pred = lin_reg.predict(X_test)
train_pred = lin_reg.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

In [None]:
# Min-max normalized model: same test/train report.
test_pred_norm = lin_reg_norm.predict(X_test_norm)
train_pred_norm = lin_reg_norm.predict(X_train_norm)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test_norm, test_pred_norm)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train_norm, train_pred_norm)

In [None]:
# Standardized model: same test/train report.
test_pred_stand = lin_reg_stand.predict(X_test_stand)
train_pred_stand = lin_reg_stand.predict(X_train_stand)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test_stand, test_pred_stand)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train_stand, train_pred_stand)

In [None]:
# Normalized-then-standardized model: same test/train report.
test_pred_both = lin_reg_both.predict(X_test_both)
train_pred_both = lin_reg_both.predict(X_train_both)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test_both, test_pred_both)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train_both, train_pred_both)

In [None]:
# One row per preprocessing variant: held-out metrics plus the mean 10-fold
# cross-validation score computed on that variant's own feature matrix.
# Previously every row cross-validated the same unscaled X, so the
# "Cross Validation" column could not differ between variants.
result_rows = [
    ("Linear Regression", X, y_test, test_pred),
    ("Linear Regression w/ norm", X_norm, y_test_norm, test_pred_norm),
    ("Linear Regression w/ stand", X_stand, y_test_stand, test_pred_stand),
    ("Linear Regression w/ both", X_both, y_test_both, test_pred_both),
]
results_df = pd.DataFrame(
    data=[
        [label, *evaluate(truth, guess),
         cross_val_score(LinearRegression(), feats, y, cv=10).mean()]
        for label, feats, truth, guess in result_rows
    ],
    columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', "Cross Validation"],
)
results_df

## Polynomial Regression

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectKBest

### Males

In [None]:
# Join the male counts with the weather aggregates on the date
# (pd.merge defaults to an inner join); the right-hand key duplicates
# 'Trap Date', so it is dropped as redundant.
merged_data_males = pd.merge(
    left=mos_data_grouped_B_males,
    right=weather_data_grouped,
    left_on='Trap Date',
    right_on='date_time_local',
).drop(columns=['date_time_local'])

print(len(merged_data_males.index))

merged_data_males.head()

In [None]:
# Distribution of the male count target.
# NOTE(review): seaborn has deprecated `distplot` in favour of `histplot`/`displot` - confirm the installed version.
sns.distplot(merged_data_males['Count'])

In [None]:
merged_data_males['isWarm'] = np.where(merged_data_males['temperature'] >= 20, 1, 0)

In [None]:
merged_data_males['Genus'] = merged_data_males['Genus'].astype('category')
merged_data_males['genusCat'] = merged_data_males['Genus'].cat.codes

In [None]:
merged_data_males["delta_pressure"] = merged_data_males.apply(lambda x: x['pressure_station'] - x['pressure_sea'], axis=1)

In [None]:
# Correlation heatmap of the numeric columns only. The frame also holds
# date, string and categorical columns (Trap Date, Gender, Genus); recent
# pandas versions no longer drop those silently in `corr()`, so they are
# excluded explicitly to keep the cell working across pandas versions.
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(
    merged_data_males.select_dtypes(include=['number', 'bool']).corr(),
    annot=True,
    ax=ax,
)

In [None]:
# Predictor columns and target for the male-count polynomial regression.
x_cols = ['wind_speed', 'temperature', 'health_index']
X = merged_data_males.loc[:, x_cols]
y = merged_data_males.loc[:, 'Count']

In [None]:
def compute_mse(X_train, Y_train, X_test, Y_test, n_degree=11):
    """Fit polynomial regressions of increasing degree and record their MSE.

    For every degree in ``1 .. n_degree - 1`` a pipeline of
    `PolynomialFeatures` followed by `LinearRegression` is fitted on the
    training data and scored on both the training and the test data.

    Returns
    -------
    (list, list)
        Training MSEs and test MSEs; position ``i`` corresponds to degree ``i + 1``.
    """
    train_mse, test_mse = [], []
    for degree in range(1, n_degree):
        model = Pipeline([
            ("polynomial_features", PolynomialFeatures(degree=degree)),
            ("linear_regression", LinearRegression()),
        ])
        model.fit(X_train, Y_train)

        train_mse.append(mean_squared_error(Y_train, model.predict(X_train)))
        test_mse.append(mean_squared_error(Y_test, model.predict(X_test)))
    return train_mse, test_mse

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to rescale (here: every predictor in X).
x_cols = ['wind_speed', 'temperature', 'health_index']

# NOTE(review): the scalers see every row here, before the train/test split
# performed in the next cell, so test-set statistics leak into the scaling.
to_scale = X[x_cols]
col_min, col_max = to_scale.min(), to_scale.max()
min_max = (to_scale - col_min) / (col_max - col_min)  # each column mapped to [0, 1]

# Three copies of X: standardized, min-max normalized, and normalized-then-standardized.
X_stand, X_norm, X_both = X.copy(), X.copy(), X.copy()
X_stand[x_cols] = StandardScaler().fit_transform(to_scale)
X_norm[x_cols] = min_max
X_both[x_cols] = StandardScaler().fit_transform(min_max)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows. The same seed is used for every variant, so all
# four splits are made with identical settings against the same target `y`.
test_size = 0.1
split_kwargs = dict(test_size=test_size, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)
X_train_stand, X_test_stand, y_train_stand, y_test_stand = train_test_split(X_stand, y, **split_kwargs)
X_train_norm, X_test_norm, y_train_norm, y_test_norm = train_test_split(X_norm, y, **split_kwargs)
X_train_both, X_test_both, y_train_both, y_test_both = train_test_split(X_both, y, **split_kwargs)

In [None]:
# Train/test MSE curves over the polynomial degrees (1-10 with the default
# n_degree=11), one pair of curves per preprocessing variant.
train_mse, test_mse = compute_mse(X_train, y_train, X_test, y_test)
train_mse_stand, test_mse_stand = compute_mse(X_train_stand, y_train_stand, X_test_stand, y_test_stand)
train_mse_norm, test_mse_norm = compute_mse(X_train_norm, y_train_norm, X_test_norm, y_test_norm)
train_mse_both, test_mse_both = compute_mse(X_train_both, y_train_both, X_test_both, y_test_both)

In [None]:
plt.plot(range(1,len(train_mse)+1),train_mse, label='train')
plt.plot(range(1,len(test_mse)+1),test_mse, label='test')
plt.title('MSE')
plt.legend()

In [None]:
plt.plot(range(1,len(train_mse_stand)+1),train_mse_stand, label='train')
plt.plot(range(1,len(test_mse_stand)+1),test_mse_stand, label='test')
plt.title('MSE')
plt.legend()

In [None]:
plt.plot(range(1,len(train_mse_norm)+1),train_mse_norm, label='train')
plt.plot(range(1,len(test_mse_norm)+1),test_mse_norm, label='test')
plt.title('MSE')
plt.legend()

In [None]:
plt.plot(range(1,len(train_mse_both)+1),train_mse_both, label='train')
plt.plot(range(1,len(test_mse_both)+1),test_mse_both, label='test')
plt.title('MSE')
plt.legend()

In [None]:
print("Best degree none: ",np.argmin(train_mse)+1)
print("Best degree standarization: ",np.argmin(train_mse_stand)+1)
print("Best degree normalization: ",np.argmin(train_mse_norm)+1)
print("Best degree both: ",np.argmin(train_mse_both)+1)

In [None]:
poly_features = PolynomialFeatures(degree=np.argmin(train_mse)+1, include_bias=False)
X_poly = poly_features.fit_transform(X)

poly_features_stand = PolynomialFeatures(degree=np.argmin(train_mse_stand )+1, include_bias=False)
X_poly_stand = poly_features_stand.fit_transform(X_stand)

poly_features_norm = PolynomialFeatures(degree=np.argmin(train_mse_norm)+1, include_bias=False)
X_poly_norm = poly_features_norm.fit_transform(X_norm)

poly_features_both = PolynomialFeatures(degree=np.argmin(train_mse_both)+1, include_bias=False)
X_poly_both = poly_features_both.fit_transform(X_both)

In [None]:
# Keep the (up to) five polynomial terms with the highest univariate F-score
# against the target. `k` is passed by keyword (current scikit-learn rejects
# it positionally) and capped at the number of available terms, because
# SelectKBest raises when k exceeds the feature count (e.g. a degree-1
# expansion of three predictors yields only three terms).
X_best = SelectKBest(f_regression, k=min(5, X_poly.shape[1])).fit_transform(X_poly, y)
X_best_stand = SelectKBest(f_regression, k=min(5, X_poly_stand.shape[1])).fit_transform(X_poly_stand, y)
X_best_norm = SelectKBest(f_regression, k=min(5, X_poly_norm.shape[1])).fit_transform(X_poly_norm, y)
X_best_both = SelectKBest(f_regression, k=min(5, X_poly_both.shape[1])).fit_transform(X_poly_both, y)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit one least-squares model per variant on the selected polynomial terms,
# using all rows (no hold-out).
lin_reg = LinearRegression().fit(X_best, y)
lin_reg_norm = LinearRegression().fit(X_best_norm, y)
lin_reg_stand = LinearRegression().fit(X_best_stand, y)
lin_reg_both = LinearRegression().fit(X_best_both, y)

# Keep the last fitted estimator as the cell's displayed value.
lin_reg_both

In [None]:
# End-to-end pipeline (polynomial expansion -> univariate selection -> OLS)
# fitted on the normalized training split and scored on both splits.
# The degree is taken from the normalized variant's held-out MSE curve; the
# original used argmin(train_mse), i.e. the *unscaled training* curve, which
# neither matches the data fitted here nor guards against over-fitting.
degree_norm = int(np.argmin(test_mse_norm)) + 1
poly_features = PolynomialFeatures(degree=degree_norm, include_bias=False)
# SelectKBest raises when k exceeds the number of candidate terms, so cap k at
# what the expansion actually produces; k is keyword-only in current scikit-learn.
n_terms = poly_features.fit(X_train_norm).n_output_features_
selectbest = SelectKBest(f_regression, k=min(5, n_terms))
polpipeline = Pipeline([("polynomial_features", poly_features),
                        ("SelectKBest", selectbest),
                        ("linear_regression", LinearRegression())])
polpipeline.fit(X_train_norm, y_train_norm)
pred_pip_train = polpipeline.predict(X_train_norm)
pred_pip_test = polpipeline.predict(X_test_norm)
print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test_norm, pred_pip_test)
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train_norm, pred_pip_train)

In [None]:
# In-sample predictions of the four polynomial models (each was fitted on all rows).
pred = lin_reg.predict(X_best)
pred_norm = lin_reg_norm.predict(X_best_norm)
pred_stand = lin_reg_stand.predict(X_best_stand)
pred_both = lin_reg_both.predict(X_best_both)


# Actual vs. predicted counts. Each series now plots its own model's
# predictions; previously all four series re-plotted the unscaled `pred`.
plt.figure(figsize=(15, 8))
ax = plt.gca()

ax.scatter(y, pred, c='#2ca02c', marker='x', label='Original data')
ax.scatter(y, pred_norm, c='r', marker='o', label='Normalized data')
ax.scatter(y, pred_stand, c='b', marker='+', label='Standardized data')
ax.scatter(y, pred_both, c='y', marker='s', label='Both data')

ax.set_xlabel('Actual count')
ax.set_ylabel('Predicted count')
plt.legend(loc='best');

plt.show()

In [None]:
# Distribution of in-sample residuals (actual - predicted) for the unscaled polynomial model.
# NOTE(review): seaborn has deprecated `distplot` in favour of `histplot`/`displot` - confirm the installed version.
sns.distplot((y - pred), bins=50);

In [None]:
# In-sample metrics for the unscaled polynomial model (it was fitted on all of y).
print('Train set evaluation:\n_____________________________________')
print_evaluate(y, pred)

In [None]:
# In-sample metrics for the min-max normalized polynomial model.
print('Train set evaluation:\n_____________________________________')
print_evaluate(y, pred_norm)

In [None]:
# In-sample metrics for the standardized polynomial model.
print('Train set evaluation:\n_____________________________________')
print_evaluate(y, pred_stand)

In [None]:
# In-sample metrics for the normalized-then-standardized polynomial model.
print('Train set evaluation:\n_____________________________________')
print_evaluate(y, pred_both)

In [None]:
# Summary of the four polynomial models. Every row now scores that model's
# own in-sample predictions against `y`; rows 2-4 previously reused
# `test_pred_*` left over from the female linear-regression section, paired
# with the male test targets. The cross-validation column is computed on each
# variant's selected polynomial terms instead of the raw `X`, and the last
# label no longer says "Linear Regression".
# NOTE(review): the terms in X_best* were selected using all of y, so these
# cross-validation scores are somewhat optimistic.
result_rows = [
    ("Polynomial Regression", X_best, pred),
    ("Polynomial Regression w/ norm", X_best_norm, pred_norm),
    ("Polynomial Regression w/ stand", X_best_stand, pred_stand),
    ("Polynomial Regression w/ both", X_best_both, pred_both),
]
results_df = pd.DataFrame(
    data=[
        [label, *evaluate(y, guess),
         cross_val_score(LinearRegression(), feats, y, cv=10).mean()]
        for label, feats, guess in result_rows
    ],
    columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', "Cross Validation"],
)
results_df