In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from Preparation import prepare_to_file, prepare_and_return
from Scoring import score, initialize_result_file
from Scenarios import drop_columns,remove_missing,fill_missing_mode,fill_missing_max,fill_missing_min,fill_missing_mean,fill_missing_regression,fill_missing_zero,standardize,normalize,remove_outliers_lof,encode_categorical

### Import zestawu danych

In [5]:
aus_weather_1 = pd.read_csv('../Datasets/Prepared/aus_weather_1.csv')
aus_weather_2 = pd.read_csv('../Datasets/Prepared/aus_weather_2.csv')
aus_weather_3 = pd.read_csv('../Datasets/Prepared/aus_weather_3.csv')

In [6]:
aus_weather_1.columns

Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RainTomorrow'],
      dtype='object')

In [7]:
categorical = ['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']
numeric = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustSpeed','WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm']
to_be_encoded = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']

In [8]:
def no_preprocessing(df,num):
    df_1 = df.copy()
    df_1 = remove_missing(df_1)
    y = df_1['RainTomorrow']
    df_1= drop_columns(df_1,categorical)
    df_1 = df_1.apply(pd.to_numeric)
    X = df_1
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("Aus_weather",num,"No Preprocessing",X_train,y_train,X_test,y_test)

In [9]:
def fill_mean(df, num):
    df_2 = df.copy()
    df_2 = fill_missing_mean(df_2,numeric)
    y = df_2['RainTomorrow']
    df_2 = drop_columns(df_2,categorical)
    df_2 = df_2.apply(pd.to_numeric)
    X = df_2
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("Aus_weather",num,"Fill Missing with mean",X_train,y_train,X_test,y_test)

In [10]:
def fill_min(df,num):
    df_3 = df.copy()
    df_3 = fill_missing_min(df_3,numeric)
    y = df_3['RainTomorrow']
    df_3 = df_3.drop(categorical,axis=1)
    df_3 = df_3.apply(pd.to_numeric)
    X = df_3
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("Aus_weather",num,"Fill_missing_with_min",X_train,y_train,X_test,y_test)

In [11]:
def fill_max(df,num):
    df_4 = df.copy()
    df_4 = fill_missing_max(df_4,numeric)
    y = df_4['RainTomorrow']
    df_4 = df_4.drop(categorical,axis=1)
    df_4 = df_4.apply(pd.to_numeric)
    X = df_4
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("Aus_weather",num,"Fill_missing_with_max",X_train,y_train,X_test,y_test)

In [12]:
def fill_regression(df,num):
    df_new = df.copy()
    df_new = fill_missing_regression(df_new, numeric)
    y = df_new['RainTomorrow']
    df_new = df_new.drop(categorical,axis=1)
    df_new = df_new.apply(pd.to_numeric)
    X = df_new
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("Aus_weather",num,"Regression",X_train,y_train,X_test,y_test)

In [13]:
def standardize_scenario(df,num):
    df_5 = df.copy()
    df_5 = fill_missing_mean(df_5,numeric)
    df_5 = standardize(df_5,numeric)
    y = df_5['RainTomorrow']
    df_5 = df_5.drop(categorical,axis=1)
    df_5 = df_5.apply(pd.to_numeric)
    X = df_5
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("Aus_weather",num,"Standardize",X_train,y_train,X_test,y_test)

In [14]:
def normalize_scenario(df,num):
    df_6 = df.copy()
    df_6 = fill_missing_mean(df_6,numeric)
    df_6 = normalize(df_6,numeric)
    y = df_6['RainTomorrow']
    df_6 = df_6.drop(categorical,axis=1)
    df_6 = df_6.apply(pd.to_numeric)
    X = df_6
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("Aus_weather",num,"Normalizacja",X_train,y_train,X_test,y_test)

In [15]:
def normalize_and_remove_outliers(df,num):
    df_7 = df.copy()
    df_7 = fill_missing_mean(df_7,numeric)
    df_7 = normalize(df_7,numeric)
    df_7 = remove_outliers_lof(df_7,numeric)
    y = df_7['RainTomorrow']
    df_7 = df_7.drop(categorical,axis=1)
    df_7 = df_7.apply(pd.to_numeric)
    X = df_7
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("Aus_weather",num,"Normalization_with_LOF",X_train,y_train,X_test,y_test)

In [16]:
def encode_categorical_scenario(df,num):
    df_8 = df.copy()
    df_8 = remove_missing(df_8)
    df_8 = encode_categorical(df_8,to_be_encoded)
    y = df_8['RainTomorrow']
    df_8= drop_columns(df_8,['Date'])
    df_8 = df_8.apply(pd.to_numeric)
    X = df_8
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("Aus_weather",num,"Label_Encoding",X_train,y_train,X_test,y_test)

In [17]:
def encode_categorical_and_fill_missing(df,num):
    df_9 = df.copy()
    df_9 = fill_missing_mean(df_9,numeric)
    df_9 = encode_categorical(df_9,to_be_encoded)
    y = df_9['RainTomorrow']
    df_9= drop_columns(df_9,['Date'])
    df_9 = df_9.apply(pd.to_numeric)
    X = df_9
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("Aus_weather",num,"Label_encoding_+_fill_missing_mean",X_train,y_train,X_test,y_test)

In [23]:
def custom_scenario(df,num):
    df_10 = df.copy()
    df_10 = fill_missing_mean(df_10,numeric)
    df_10 = remove_outliers_lof(df_10,numeric)
    df_10 = normalize(df_10,numeric)
    # Extract day, month and year from date
    df_10['Year'] = pd.DatetimeIndex(df_10['Date']).year
    df_10['Month'] = pd.DatetimeIndex(df_10['Date']).month
    df_10['Day'] = pd.DatetimeIndex(df_10['Date']).day
    df_10= drop_columns(df_10,['Date'])
    to_be_encoded = [
        'Location',
        'WindGustDir',
        'WindDir9am',
        'WindDir3pm',
        'RainToday',
        'Day',
        'Month',
        'Year']
    df_10 = encode_categorical(df_10,to_be_encoded)
    y = df_10['RainTomorrow']
    df_10 = df_10.apply(pd.to_numeric)
    X = df_10
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("Aus_weather",num,"Custom_preprocessing",X_train,y_train,X_test,y_test)

In [19]:
num = 1
for df_i in [aus_weather_1,aus_weather_2,aus_weather_3]:
    no_preprocessing(df_i,num)
    fill_mean(df_i, num)
    fill_min(df_i,num)
    fill_max(df_i,num)
    fill_regression(df_i,num)
    num = num + 1

Scenario: No Preprocessing
Xgboost: 0.8385135135135136
Random Forest Classifier: 0.831081081081081
KNeighbors Classifier: 0.8405405405405405
Scenario: Fill Missing with mean
Xgboost: 0.8316109422492401
Random Forest Classifier: 0.8206686930091185
KNeighbors Classifier: 0.8212765957446808
Scenario: Fill_missing_with_min
Xgboost: 0.833434650455927
Random Forest Classifier: 0.8206686930091185
KNeighbors Classifier: 0.8267477203647416
Scenario: Fill_missing_with_max
Xgboost: 0.8382978723404255
Random Forest Classifier: 0.8206686930091185
KNeighbors Classifier: 0.8218844984802431


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

Scenario: Regression
Xgboost: 0.8370820668693009
Random Forest Classifier: 0.8206686930091185
KNeighbors Classifier: 0.8200607902735563
Scenario: No Preprocessing
Xgboost: 0.8398648648648649
Random Forest Classifier: 0.8216216216216217
KNeighbors Classifier: 0.8243243243243243
Scenario: Fill Missing with mean
Xgboost: 0.8328267477203647
Random Forest Classifier: 0.8206686930091185
KNeighbors Classifier: 0.8200607902735563
Scenario: Fill_missing_with_min
Xgboost: 0.835258358662614
Random Forest Classifier: 0.8200607902735563
KNeighbors Classifier: 0.817629179331307
Scenario: Fill_missing_with_max
Xgboost: 0.827355623100304
Random Forest Classifier: 0.8182370820668693
KNeighbors Classifier: 0.8200607902735563


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

Scenario: Regression
Xgboost: 0.8285714285714286
Random Forest Classifier: 0.8200607902735563
KNeighbors Classifier: 0.8206686930091185
Scenario: No Preprocessing
Xgboost: 0.8337837837837838
Random Forest Classifier: 0.8148648648648649
KNeighbors Classifier: 0.8222972972972973
Scenario: Fill Missing with mean
Xgboost: 0.8322188449848025
Random Forest Classifier: 0.8200607902735563
KNeighbors Classifier: 0.8237082066869301
Scenario: Fill_missing_with_min
Xgboost: 0.8297872340425532
Random Forest Classifier: 0.819452887537994
KNeighbors Classifier: 0.8170212765957446
Scenario: Fill_missing_with_max
Xgboost: 0.8322188449848025
Random Forest Classifier: 0.819452887537994
KNeighbors Classifier: 0.8182370820668693


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

Scenario: Regression
Xgboost: 0.8310030395136778
Random Forest Classifier: 0.8218844984802431
KNeighbors Classifier: 0.819452887537994


In [20]:
num = 1
for df_i in [aus_weather_1,aus_weather_2,aus_weather_3]:
    standardize_scenario(df_i,num)
    normalize_scenario(df_i,num)
    normalize_and_remove_outliers(df_i,num)
    num = num + 1

Scenario: Standardize
Xgboost: 0.8316109422492401
Random Forest Classifier: 0.8206686930091185
KNeighbors Classifier: 0.8237082066869301
Scenario: Normalizacja
Xgboost: 0.8316109422492401
Random Forest Classifier: 0.8206686930091185
KNeighbors Classifier: 0.8127659574468085
Scenario: Normalization_with_LOF
Xgboost: 0.8432465923172243
Random Forest Classifier: 0.8296158612143743
KNeighbors Classifier: 0.8234200743494424
Scenario: Standardize
Xgboost: 0.8328267477203647
Random Forest Classifier: 0.8206686930091185
KNeighbors Classifier: 0.819452887537994
Scenario: Normalizacja
Xgboost: 0.8328267477203647
Random Forest Classifier: 0.8206686930091185
KNeighbors Classifier: 0.8097264437689969
Scenario: Normalization_with_LOF
Xgboost: 0.855638166047088
Random Forest Classifier: 0.8283767038413878
KNeighbors Classifier: 0.8258983890954151
Scenario: Standardize
Xgboost: 0.8322188449848025
Random Forest Classifier: 0.8200607902735563
KNeighbors Classifier: 0.8267477203647416
Scenario: Normaliza

In [24]:
num = 1
for df_i in [aus_weather_1,aus_weather_2,aus_weather_3]:
    encode_categorical_scenario(df_i,num)
    encode_categorical_and_fill_missing(df_i,num)
    custom_scenario(df_i,num)
    num = num + 1

Scenario: Label_Encoding
Xgboost: 1.0
Random Forest Classifier: 0.9466216216216217
KNeighbors Classifier: 0.8405405405405405
Scenario: Label_encoding_+_fill_missing_mean
Xgboost: 1.0
Random Forest Classifier: 0.9440729483282675
KNeighbors Classifier: 0.8206686930091185
Scenario: Custom_preprocessing
Xgboost: 1.0
Random Forest Classifier: 0.9814814814814815
KNeighbors Classifier: 0.7913580246913581
Scenario: Label_Encoding
Xgboost: 1.0
Random Forest Classifier: 0.9425675675675675
KNeighbors Classifier: 0.8243243243243243
Scenario: Label_encoding_+_fill_missing_mean
Xgboost: 1.0
Random Forest Classifier: 0.9428571428571428
KNeighbors Classifier: 0.8267477203647416
Scenario: Custom_preprocessing
Xgboost: 1.0
Random Forest Classifier: 0.9783950617283951
KNeighbors Classifier: 0.8055555555555556
Scenario: Label_Encoding
Xgboost: 1.0
Random Forest Classifier: 0.9358108108108109
KNeighbors Classifier: 0.831081081081081
Scenario: Label_encoding_+_fill_missing_mean
Xgboost: 1.0
Random Forest Cl