In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from Preparation import prepare_to_file, prepare_and_return
from Scoring import score, initialize_result_file
from Scenarios import drop_columns,remove_missing,fill_missing_mode,fill_missing_max,fill_missing_min,fill_missing_mean,fill_missing_regression,fill_missing_zero,standardize,normalize,remove_outliers_lof,encode_categorical

### Import zestawu danych

In [20]:
aus_weather_1 = pd.read_csv('../Datasets/Prepared/aus_weather_1.csv')
aus_weather_2 = pd.read_csv('../Datasets/Prepared/aus_weather_2.csv')
aus_weather_3 = pd.read_csv('../Datasets/Prepared/aus_weather_3.csv')

In [21]:
aus_weather_1.columns

Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RainTomorrow'],
      dtype='object')

In [22]:
categorical = ['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']
numeric = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustSpeed','WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm']
to_be_encoded = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']

In [23]:
def no_preprocessing(df,num):
    df_1 = df.copy()
    df_1 = remove_missing(df_1)
    y = df_1['RainTomorrow']
    df_1= drop_columns(df_1,categorical)
    df_1 = df_1.apply(pd.to_numeric)
    X = df_1
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("Aus_weather",num,"Brak","Brak przygotowania",X_train,y_train,X_test,y_test)

In [24]:
def fill_mean(df, num):
    df_2 = df.copy()
    df_2 = fill_missing_mean(df_2,numeric)
    y = df_2['RainTomorrow']
    df_2 = drop_columns(df_2,categorical)
    df_2 = df_2.apply(pd.to_numeric)
    X = df_2
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("Aus_weather",num,"Wypełnienie_brakujących","Wypełnienie średnią",X_train,y_train,X_test,y_test)

In [25]:
def fill_min(df,num):
    df_3 = df.copy()
    df_3 = fill_missing_min(df_3,numeric)
    y = df_3['RainTomorrow']
    df_3 = df_3.drop(categorical,axis=1)
    df_3 = df_3.apply(pd.to_numeric)
    X = df_3
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("Aus_weather",num,"Wypełnienie_brakujących","Wypełnienie minimum",X_train,y_train,X_test,y_test)

In [26]:
def fill_max(df,num):
    df_4 = df.copy()
    df_4 = fill_missing_max(df_4,numeric)
    y = df_4['RainTomorrow']
    df_4 = df_4.drop(categorical,axis=1)
    df_4 = df_4.apply(pd.to_numeric)
    X = df_4
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("Aus_weather",num,"Wypełnienie_brakujących","Wypełnienie maksimum",X_train,y_train,X_test,y_test)

In [27]:
def fill_regression(df,num):
    df_new = df.copy()
    df_new = fill_missing_regression(df_new, numeric)
    y = df_new['RainTomorrow']
    df_new = df_new.drop(categorical,axis=1)
    df_new = df_new.apply(pd.to_numeric)
    X = df_new
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("Aus_weather",num,"Wypełnienie_brakujących","Wypełnienie regresją",X_train,y_train,X_test,y_test)

In [28]:
def standardize_scenario(df,num):
    df_5 = df.copy()
    df_5 = fill_missing_mean(df_5,numeric)
    df_5 = standardize(df_5,numeric)
    y = df_5['RainTomorrow']
    df_5 = df_5.drop(categorical,axis=1)
    df_5 = df_5.apply(pd.to_numeric)
    X = df_5
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("Aus_weather",num,"Standaryzacja","Standaryzacja",X_train,y_train,X_test,y_test)

In [29]:
def normalize_scenario(df,num):
    df_6 = df.copy()
    df_6 = fill_missing_mean(df_6,numeric)
    df_6 = normalize(df_6,numeric)
    y = df_6['RainTomorrow']
    df_6 = df_6.drop(categorical,axis=1)
    df_6 = df_6.apply(pd.to_numeric)
    X = df_6
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("Aus_weather",num,"Standaryzacja","Skalowanie do (0-1)",X_train,y_train,X_test,y_test)

In [30]:
def normalize_and_remove_outliers(df,num):
    df_7 = df.copy()
    df_7 = fill_missing_mean(df_7,numeric)
    df_7 = normalize(df_7,numeric)
    df_7 = remove_outliers_lof(df_7,numeric)
    y = df_7['RainTomorrow']
    df_7 = df_7.drop(categorical,axis=1)
    df_7 = df_7.apply(pd.to_numeric)
    X = df_7
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("Aus_weather",num,"Standaryzacja","Skalowanie (0-1) + usuw. odstających",X_train,y_train,X_test,y_test)

In [31]:
def encode_categorical_scenario(df,num):
    df_8 = df.copy()
    df_8 = remove_missing(df_8)
    df_8 = encode_categorical(df_8,to_be_encoded)
    y = df_8['RainTomorrow']
    df_8= drop_columns(df_8,['Date'])
    df_8 = df_8.apply(pd.to_numeric)
    X = df_8
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("Aus_weather",num,"Kodowanie","Kodowanie wartości kategorycznych",X_train,y_train,X_test,y_test)

In [32]:
def encode_categorical_and_fill_missing(df,num):
    df_9 = df.copy()
    df_9 = fill_missing_mean(df_9,numeric)
    df_9 = encode_categorical(df_9,to_be_encoded)
    y = df_9['RainTomorrow']
    df_9= drop_columns(df_9,['Date'])
    df_9 = df_9.apply(pd.to_numeric)
    X = df_9
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("Aus_weather",num,"Kodowanie","Kod. war. kategorycznych + wyp. brak. średnią",X_train,y_train,X_test,y_test)

In [33]:
def custom_scenario(df,num):
    df_10 = df.copy()
    df_10 = fill_missing_mean(df_10,numeric)
    df_10 = remove_outliers_lof(df_10,numeric)
    df_10 = normalize(df_10,numeric)
    # Extract day, month and year from date
    df_10['Year'] = pd.DatetimeIndex(df_10['Date']).year
    df_10['Month'] = pd.DatetimeIndex(df_10['Date']).month
    df_10['Day'] = pd.DatetimeIndex(df_10['Date']).day
    df_10= drop_columns(df_10,['Date'])
    to_be_encoded = [
        'Location',
        'WindGustDir',
        'WindDir9am',
        'WindDir3pm',
        'RainToday',
        'Day',
        'Month',
        'Year']
    df_10 = encode_categorical(df_10,to_be_encoded)
    y = df_10['RainTomorrow']
    df_10 = df_10.apply(pd.to_numeric)
    X = df_10
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("Aus_weather",num,"Custom","Custom preprocessing",X_train,y_train,X_test,y_test)

In [34]:
num = 1
for df_i in [aus_weather_1,aus_weather_2,aus_weather_3]:
    no_preprocessing(df_i,num)
    fill_mean(df_i, num)
    fill_min(df_i,num)
    fill_max(df_i,num)
    fill_regression(df_i,num)
    num = num + 1

Scenario: Brak przygotowania
Xgboost: 0.8554054054054054
Random Forest Classifier: 0.8493243243243244
KNeighbors Classifier: 0.8364864864864865
Scenario: Wypełnienie średnią
Xgboost: 0.8297872340425532
Random Forest Classifier: 0.8206686930091185
KNeighbors Classifier: 0.8206686930091185
Scenario: Wypełnienie minimum
Xgboost: 0.8346504559270517
Random Forest Classifier: 0.819452887537994
KNeighbors Classifier: 0.8158054711246201
Scenario: Wypełnienie maksimum
Xgboost: 0.8340425531914893
Random Forest Classifier: 0.8206686930091185
KNeighbors Classifier: 0.8133738601823708


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

Scenario: Wypełnienie regresją
Xgboost: 0.8303951367781155
Random Forest Classifier: 0.8206686930091185
KNeighbors Classifier: 0.8224924012158055
Scenario: Brak przygotowania
Xgboost: 0.8513513513513513
Random Forest Classifier: 0.8358108108108108
KNeighbors Classifier: 0.8472972972972973
Scenario: Wypełnienie średnią
Xgboost: 0.827355623100304
Random Forest Classifier: 0.819452887537994
KNeighbors Classifier: 0.8170212765957446
Scenario: Wypełnienie minimum
Xgboost: 0.8370820668693009
Random Forest Classifier: 0.8212765957446808
KNeighbors Classifier: 0.8224924012158055
Scenario: Wypełnienie maksimum
Xgboost: 0.8340425531914893
Random Forest Classifier: 0.8188449848024316
KNeighbors Classifier: 0.8170212765957446


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

Scenario: Wypełnienie regresją
Xgboost: 0.8395136778115502
Random Forest Classifier: 0.8218844984802431
KNeighbors Classifier: 0.8212765957446808
Scenario: Brak przygotowania
Xgboost: 0.8425675675675676
Random Forest Classifier: 0.8304054054054054
KNeighbors Classifier: 0.8297297297297297
Scenario: Wypełnienie średnią
Xgboost: 0.8340425531914893
Random Forest Classifier: 0.8206686930091185
KNeighbors Classifier: 0.8224924012158055
Scenario: Wypełnienie minimum
Xgboost: 0.841337386018237
Random Forest Classifier: 0.8212765957446808
KNeighbors Classifier: 0.8182370820668693
Scenario: Wypełnienie maksimum
Xgboost: 0.8291793313069908
Random Forest Classifier: 0.8212765957446808
KNeighbors Classifier: 0.8206686930091185


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

Scenario: Wypełnienie regresją
Xgboost: 0.8346504559270517
Random Forest Classifier: 0.8206686930091185
KNeighbors Classifier: 0.8231003039513678


In [35]:
num = 1
for df_i in [aus_weather_1,aus_weather_2,aus_weather_3]:
    standardize_scenario(df_i,num)
    normalize_scenario(df_i,num)
    normalize_and_remove_outliers(df_i,num)
    num = num + 1

Scenario: Standaryzacja
Xgboost: 0.8297872340425532
Random Forest Classifier: 0.8206686930091185
KNeighbors Classifier: 0.8249240121580547
Scenario: Skalowanie do (0-1)
Xgboost: 0.8297872340425532
Random Forest Classifier: 0.8206686930091185
KNeighbors Classifier: 0.8127659574468085
Scenario: Skalowanie do (0-1) z usuwaniem odstających
Xgboost: 0.838909541511772
Random Forest Classifier: 0.8252788104089219
KNeighbors Classifier: 0.8258983890954151
Scenario: Standaryzacja
Xgboost: 0.827355623100304
Random Forest Classifier: 0.819452887537994
KNeighbors Classifier: 0.8212765957446808
Scenario: Skalowanie do (0-1)
Xgboost: 0.827355623100304
Random Forest Classifier: 0.819452887537994
KNeighbors Classifier: 0.8127659574468085
Scenario: Skalowanie do (0-1) z usuwaniem odstających
Xgboost: 0.8378712871287128
Random Forest Classifier: 0.8298267326732673
KNeighbors Classifier: 0.8261138613861386
Scenario: Standaryzacja
Xgboost: 0.8340425531914893
Random Forest Classifier: 0.8206686930091185
KN

In [36]:
num = 1
for df_i in [aus_weather_1,aus_weather_2,aus_weather_3]:
    encode_categorical_scenario(df_i,num)
    encode_categorical_and_fill_missing(df_i,num)
    custom_scenario(df_i,num)
    num = num + 1

Scenario: Kodowanie wartości kategorycznych
Xgboost: 1.0
Random Forest Classifier: 0.9608108108108108
KNeighbors Classifier: 0.8472972972972973
Scenario: Kodowanie wartości kategorycznych + wypełnienie średnią
Xgboost: 1.0
Random Forest Classifier: 0.9440729483282675
KNeighbors Classifier: 0.825531914893617
Scenario: Custom preprocessing
Xgboost: 1.0
Random Forest Classifier: 0.9845869297163995
KNeighbors Classifier: 0.7885326757090012
Scenario: Kodowanie wartości kategorycznych
Xgboost: 1.0
Random Forest Classifier: 0.9554054054054054
KNeighbors Classifier: 0.8466216216216216
Scenario: Kodowanie wartości kategorycznych + wypełnienie średnią
Xgboost: 1.0
Random Forest Classifier: 0.9416413373860182
KNeighbors Classifier: 0.8267477203647416
Scenario: Custom preprocessing
Xgboost: 1.0
Random Forest Classifier: 0.9839704069050554
KNeighbors Classifier: 0.7996300863131935
Scenario: Kodowanie wartości kategorycznych
Xgboost: 1.0
Random Forest Classifier: 0.9385135135135135
KNeighbors Classi