# Execute the code below

In [3]:
import pandas as pd
import numpy as np
link_main = "https://raw.githubusercontent.com/murpi/wilddata/master/quests/weather_main_2018.csv"
link_opinion = "https://raw.githubusercontent.com/murpi/wilddata/master/quests/weather_opinion_2018.csv"
df_main = pd.read_csv(link_main)
df_opinion = pd.read_csv(link_opinion)
print(df_main.head())
print(df_opinion.head())

         DATE  MAX_TEMPERATURE_C  ...  DEWPOINT_MAX_C  WINDTEMP_MAX_C
0  2018-01-01                 12  ...               8               7
1  2018-01-02                 13  ...              12               6
2  2018-01-03                 15  ...              13               7
3  2018-01-04                 14  ...              12              10
4  2018-01-05                 12  ...              10               7

[5 rows x 15 columns]
         date  WEATHER_CODE_EVENING  TOTAL_SNOW_MM  UV_INDEX  SUNHOUR OPINION
0  2018-01-01                   113              0         3      5.1     bad
1  2018-03-12                   119              0         2      8.8     bad
2  2018-03-09                   116              0         3     10.2     bad
3  2018-10-07                   122              0         1      5.6     bad
4  2018-06-18                   119              0         1     12.9     bad


# Classification challenge

Your goal are :
- to merge both 2018 DataFrames
- to train-test split the new 2018 DataFrame
- to train 3 differents Machine Learning algorithms (KNN, logistic regression and decision tree) with "opinion" as target
- to try different parameters
- to find the best accuracy score (on the test set of course)
- to fill the missing values in the "opinion" columns whith your best model
- to explain what is the "rules" used by your model to predict the opinion.

You can help yourself with charts if you want.

## 0. Import modules

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns

  import pandas.util.testing as tm


## 1. Merging 2018 DF ✔

In [5]:
print(df_main.columns)
print(df_opinion.columns)

Index(['DATE', 'MAX_TEMPERATURE_C', 'MIN_TEMPERATURE_C', 'WINDSPEED_MAX_KMH',
       'TEMPERATURE_MORNING_C', 'TEMPERATURE_NOON_C', 'TEMPERATURE_EVENING_C',
       'PRECIP_TOTAL_DAY_MM', 'HUMIDITY_MAX_PERCENT', 'VISIBILITY_AVG_KM',
       'PRESSURE_MAX_MB', 'CLOUDCOVER_AVG_PERCENT', 'HEATINDEX_MAX_C',
       'DEWPOINT_MAX_C', 'WINDTEMP_MAX_C'],
      dtype='object')
Index(['date', 'WEATHER_CODE_EVENING', 'TOTAL_SNOW_MM', 'UV_INDEX', 'SUNHOUR',
       'OPINION'],
      dtype='object')


In [6]:
df_weather = df_main.merge(df_opinion, how = 'left', left_on = 'DATE', right_on = 'date').drop('date', axis = 1)
df_weather.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 365 entries, 0 to 364
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   DATE                    365 non-null    object 
 1   MAX_TEMPERATURE_C       365 non-null    int64  
 2   MIN_TEMPERATURE_C       365 non-null    int64  
 3   WINDSPEED_MAX_KMH       365 non-null    int64  
 4   TEMPERATURE_MORNING_C   365 non-null    int64  
 5   TEMPERATURE_NOON_C      365 non-null    int64  
 6   TEMPERATURE_EVENING_C   365 non-null    int64  
 7   PRECIP_TOTAL_DAY_MM     365 non-null    float64
 8   HUMIDITY_MAX_PERCENT    365 non-null    int64  
 9   VISIBILITY_AVG_KM       365 non-null    float64
 10  PRESSURE_MAX_MB         365 non-null    int64  
 11  CLOUDCOVER_AVG_PERCENT  365 non-null    float64
 12  HEATINDEX_MAX_C         365 non-null    int64  
 13  DEWPOINT_MAX_C          365 non-null    int64  
 14  WINDTEMP_MAX_C          365 non-null    in

In [7]:
df_weather.describe(include = 'all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
DATE,365,365.0,2018-03-19,1.0,,,,,,,
MAX_TEMPERATURE_C,365,,,,16.8795,6.77396,1.0,12.0,16.0,23.0,33.0
MIN_TEMPERATURE_C,365,,,,9.31507,5.33925,-6.0,6.0,9.0,14.0,21.0
WINDSPEED_MAX_KMH,365,,,,19.2055,8.68205,4.0,13.0,18.0,24.0,61.0
TEMPERATURE_MORNING_C,365,,,,10.263,5.60532,-6.0,6.0,10.0,15.0,22.0
TEMPERATURE_NOON_C,365,,,,16.3452,6.74954,0.0,11.0,16.0,22.0,32.0
TEMPERATURE_EVENING_C,365,,,,13.863,6.8659,-3.0,9.0,13.0,20.0,30.0
PRECIP_TOTAL_DAY_MM,365,,,,1.05781,2.1401,0.0,0.0,0.1,1.1,15.3
HUMIDITY_MAX_PERCENT,365,,,,78.1233,11.9958,32.0,70.0,79.0,88.0,98.0
VISIBILITY_AVG_KM,365,,,,9.08733,1.23683,2.75,8.375,9.75,10.0,10.0


## 2. Train-test split 2018 DF ✔


### Split train test sets and prediction set


Train test set

In [8]:
df_clean = df_weather.dropna()
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 341 entries, 0 to 364
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   DATE                    341 non-null    object 
 1   MAX_TEMPERATURE_C       341 non-null    int64  
 2   MIN_TEMPERATURE_C       341 non-null    int64  
 3   WINDSPEED_MAX_KMH       341 non-null    int64  
 4   TEMPERATURE_MORNING_C   341 non-null    int64  
 5   TEMPERATURE_NOON_C      341 non-null    int64  
 6   TEMPERATURE_EVENING_C   341 non-null    int64  
 7   PRECIP_TOTAL_DAY_MM     341 non-null    float64
 8   HUMIDITY_MAX_PERCENT    341 non-null    int64  
 9   VISIBILITY_AVG_KM       341 non-null    float64
 10  PRESSURE_MAX_MB         341 non-null    int64  
 11  CLOUDCOVER_AVG_PERCENT  341 non-null    float64
 12  HEATINDEX_MAX_C         341 non-null    int64  
 13  DEWPOINT_MAX_C          341 non-null    int64  
 14  WINDTEMP_MAX_C          341 non-null    in

Prediction set

In [9]:
df_nan = df_weather[df_weather.isna().any(axis=1)]
df_nan.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24 entries, 18 to 349
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   DATE                    24 non-null     object 
 1   MAX_TEMPERATURE_C       24 non-null     int64  
 2   MIN_TEMPERATURE_C       24 non-null     int64  
 3   WINDSPEED_MAX_KMH       24 non-null     int64  
 4   TEMPERATURE_MORNING_C   24 non-null     int64  
 5   TEMPERATURE_NOON_C      24 non-null     int64  
 6   TEMPERATURE_EVENING_C   24 non-null     int64  
 7   PRECIP_TOTAL_DAY_MM     24 non-null     float64
 8   HUMIDITY_MAX_PERCENT    24 non-null     int64  
 9   VISIBILITY_AVG_KM       24 non-null     float64
 10  PRESSURE_MAX_MB         24 non-null     int64  
 11  CLOUDCOVER_AVG_PERCENT  24 non-null     float64
 12  HEATINDEX_MAX_C         24 non-null     int64  
 13  DEWPOINT_MAX_C          24 non-null     int64  
 14  WINDTEMP_MAX_C          24 non-null     in

### Recode OPINION labels

In [10]:
df_clean['OPINION'] = df_clean['OPINION'].apply(lambda x: str(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [11]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

df_clean['le_OPINION'] = le.fit_transform(df_clean['OPINION'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


## 3. Train 3 machine learning model


In [0]:
col_list = ['MAX_TEMPERATURE_C', 'MIN_TEMPERATURE_C', 'WINDSPEED_MAX_KMH',
       'TEMPERATURE_MORNING_C', 'TEMPERATURE_NOON_C', 'TEMPERATURE_EVENING_C',
       'PRECIP_TOTAL_DAY_MM', 'HUMIDITY_MAX_PERCENT', 'VISIBILITY_AVG_KM',
       'PRESSURE_MAX_MB', 'CLOUDCOVER_AVG_PERCENT', 'HEATINDEX_MAX_C',
       'DEWPOINT_MAX_C', 'WINDTEMP_MAX_C']

### a. kNN Classification

In [0]:
def best_kNNcls_training(data, y_column, X_columns):
  ''' Give the best model for kNN classification''' 
  from sklearn.model_selection import train_test_split
  from sklearn.neighbors import KNeighborsClassifier
  dico = {}

  # First step: loop to sort variables from best to worst explanation 
  for col in X_columns: 
    for k in range(2, 21):
      X = data[[col]]
      y = data[y_column]
      X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33)
      kNNclsModel = KNeighborsClassifier(n_neighbors = k).fit(X_train, y_train)
      score_train = kNNclsModel.score(X_train, y_train)
      dico[col] = score_train

  a = sorted(dico.items(), key=lambda t: t[1], reverse = True)

  var_list = []
  for i in range(len(a)):
    var_list.append(a[i][0]) 

  # Second step: loop to find the best model
  X_cols = []
  score_max_train = 0
  for var in var_list: 
    for k in range(2, 21):
      X_cols.append(var) 
      X = data[X_cols]
      y = data[y_column]
      X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8)
      kNNclsModel = KNeighborsClassifier(n_neighbors = k).fit(X_train, y_train)
      score_train = kNNclsModel.score(X_train, y_train)
      score_test = kNNclsModel.score(X_test, y_test)

      if round(abs(score_train), 2) > round(abs(score_max_train), 2):
        best_variables = X_cols
        score_max_train = score_train
        score_max_test = score_test
        neighbors = k

  print('\n kNN Classification model with the best variables :\n - variables : {} \n - n_neighbors : {} \n - R2 train = {} \n - R2 test = {}'.format(best_variables, neighbors, score_max_train, score_max_test))

  return kNNclsModel, best_variables, neighbors

In [37]:
kNNcls, columns, k = best_kNNcls_training(data = df_clean, y_column = 'le_OPINION', X_columns = col_list)


 LR model with the 2 best variables :
 - variables : ['HEATINDEX_MAX_C', 'HEATINDEX_MAX_C', 'HEATINDEX_MAX_C', 'HEATINDEX_MAX_C', 'HEATINDEX_MAX_C', 'HEATINDEX_MAX_C', 'HEATINDEX_MAX_C', 'HEATINDEX_MAX_C', 'HEATINDEX_MAX_C', 'HEATINDEX_MAX_C', 'HEATINDEX_MAX_C', 'HEATINDEX_MAX_C', 'HEATINDEX_MAX_C', 'HEATINDEX_MAX_C', 'HEATINDEX_MAX_C', 'HEATINDEX_MAX_C', 'HEATINDEX_MAX_C', 'HEATINDEX_MAX_C', 'HEATINDEX_MAX_C', 'MAX_TEMPERATURE_C', 'MAX_TEMPERATURE_C', 'MAX_TEMPERATURE_C', 'MAX_TEMPERATURE_C', 'MAX_TEMPERATURE_C', 'MAX_TEMPERATURE_C', 'MAX_TEMPERATURE_C', 'MAX_TEMPERATURE_C', 'MAX_TEMPERATURE_C', 'MAX_TEMPERATURE_C', 'MAX_TEMPERATURE_C', 'MAX_TEMPERATURE_C', 'MAX_TEMPERATURE_C', 'MAX_TEMPERATURE_C', 'MAX_TEMPERATURE_C', 'MAX_TEMPERATURE_C', 'MAX_TEMPERATURE_C', 'MAX_TEMPERATURE_C', 'MAX_TEMPERATURE_C', 'TEMPERATURE_NOON_C', 'TEMPERATURE_NOON_C', 'TEMPERATURE_NOON_C', 'TEMPERATURE_NOON_C', 'TEMPERATURE_NOON_C', 'TEMPERATURE_NOON_C', 'TEMPERATURE_NOON_C', 'TEMPERATURE_NOON_C', 'TEMPERAT

### b. kNN regression

In [0]:
def Best_kNNreg_training(data, y_column, X_columns):
  from sklearn.model_selection import train_test_split
  from sklearn.neighbors import KNeighborsRegressor

  score_max = 0
  for col in X_columns:
    # split train and test sets
    y = data[y_column]
    X = data[col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size = 0.8)
    for k in range(2, 101):
      # Initialisation and training
      kNNRegModel = KNeighborsRegressor(n_neighbors = k).fit(X_train, y_train)
      score = kNNRegModel.score(X_train, y_train)
      if abs(score) > abs(score_max):
        score_max = score
        n_neighbors = k
        variable = i 
  print('Best kNNRegr model with :\n - variable : {} \n - n_neighbors = {} \n - R2 = {} \n'.format(variable, n_neighbors, score_max))

In [32]:
Best_kNNreg_training(data = df_clean, y_column = 'le_OPINION', X_columns = col_list)

ValueError: ignored

### c. Decision Tree

## 4. Try parameters

## 5. Accuracy scores

## 6. Filling missing opinion's values with the best model

## 7. Explaination of the "rules" used by the model to predict the opinion