In [None]:
!pip3 install lightgbm

In [85]:
import os
import time

import lightgbm as lightgbm
import pandas as pd
import numpy as np
import xgboost as xgboost
from sklearn import model_selection
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, VotingClassifier, AdaBoostClassifier, \
    BaggingClassifier, ExtraTreesClassifier
from sklearn.feature_selection import RFECV
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Lasso, ElasticNet, LogisticRegression, SGDClassifier, LogisticRegressionCV
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from boruta import BorutaPy
from sklearn.linear_model import LogisticRegression
import pickle
from boostaroota import BoostARoota
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier

DATA_PATH = '.'
# INPUT_DATA_TRAIN = os.path.join(DATA_PATH, 'input.csv')
# OUTPUT_DATA_TRAIN = os.path.join(DATA_PATH, 'output.csv')

input_df = pickle.load(open( "./full_train.pkl", "rb" ))

train_df = pickle.load(open( "./full_train_10C.pkl", "rb" ))
test_df = pickle.load(open( "./full_test_10C.pkl", "rb" ))



dict_names = {
    'B00300S' : 'Air-temperature',
    'B00305A' :	'Soil-temperature',
    'B00202A' : 'Wind-direction',
    'B00702A' : 'Avg-wind-speed-past-10-mts',
    'B00703A' : 'Max-speed',
    'B00608S' : 'Precipitation-sum-past-10-mts',
    'B00604S' : 'Precipitation-sum-past-24-hrs',
    'B00606S' : 'Precipitation-sum-past-1-hrs',
    'B00802A' : 'Relative-humidity',
    'B00714A' : 'Max-wind-past-10-mts',
    'B00910A' : 'Water-in-snow'
}

def fix_column_names(df, d):
    rename_d = {}
    for column in df.columns:
        column_list = column.split("_")
        station_name = column_list[0]
        coded_name = column_list[1]
        suffix = coded_name
        if coded_name in d:
            suffix = d[coded_name]
        rename_d[station_name+'_'+coded_name] = station_name+"_"+suffix
    df.rename(columns=rename_d,inplace=True)
    return df

def timing(f):
    def wrap(*args):
        time1 = time.time()
        ret = f(*args)
        time2 = time.time()
        print('{:s} function took {:.3f} ms'.format(f.__name__, (time2 - time1) * 1000.0))
        return ret

    return wrap

def prepare_data(data):
    # data = data.drop(columns=['data_'])
    y=None
    if 'is_rain' in data:
        y = data['is_rain']
        X = data.drop(columns=['is_rain'])
    else:
        X = data
    print(X.isnull().values.any())
    X.fillna(method='ffill', inplace=True)
    X.fillna(method='bfill', inplace=True)
    print(X.isnull().values.any())
    X = fix_column_names(X, dict_names)
    return X, y

def prepare_undersample_train_test(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=False)

    y_with_one_indices = y_train.index[y_train == 1].tolist()
    y_with_zero_indices = y_train.index[y_train == 0].tolist()

    chosen_zeros = np.random.choice(y_with_zero_indices, len(y_with_one_indices), replace=False).tolist()

    idx = y_with_one_indices + chosen_zeros
    idx.sort()

    X_train = X_train.loc[idx]
    y_train = y_train.loc[idx]
    
    X_train.fillna(method='ffill', inplace=True)
    X_train.fillna(method='bfill', inplace=True)

    print('rf')
    X_train_selected, selected_features = rf(X_train, y_train)
    # X_train_selected, selected_features = rfe(X_train, y_train)
    X_test_selected = X_test[selected_features]

    x_standard_scaler = StandardScaler()
    X_train_selected_transformed = x_standard_scaler.fit_transform(X_train_selected)
    X_test_selected_transformed = x_standard_scaler.transform(X_test_selected)

    y_label_encoder = LabelEncoder()
    y_train_encoded = y_label_encoder.fit_transform(y_train)
    y_test_encoded = y_label_encoder.transform(y_test)

    return X_train_selected_transformed, X_test_selected_transformed, y_train_encoded, y_test_encoded

def prepare_oversample_train_test(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=False)

    y_with_one_indices = y_train.index[y_train == 1].tolist()
    y_with_zero_indices = y_train.index[y_train == 0].tolist()

    chosen_ones = np.random.choice(y_with_one_indices, len(y_with_zero_indices), replace=True).tolist()

    idx = y_with_zero_indices + chosen_ones
    idx.sort()

    X_train = X_train.loc[idx]
    y_train = y_train.loc[idx]

    print('rf')
    X_train_selected, selected_features = rf(X_train, y_train)
    # X_train_selected, selected_features = rfe(X_train, y_train)
    X_test_selected = X_test[selected_features]

    x_standard_scaler = StandardScaler()
    X_train_selected_transformed = x_standard_scaler.fit_transform(X_train_selected)
    X_test_selected_transformed = x_standard_scaler.transform(X_test_selected)

    y_label_encoder = LabelEncoder()
    y_train_encoded = y_label_encoder.fit_transform(y_train)
    y_test_encoded = y_label_encoder.transform(y_test)

    return X_train_selected_transformed, X_test_selected_transformed, y_train_encoded, y_test_encoded


# def divide_train_test(X,y):
def boostaroota(df_x, df_y):
    br = BoostARoota(metric='f1')
    br.fit(df_x, df_y)
    df_x_selected = br.fit_transform(df_x, df_y)
    return df_x_selected

@timing
def boruta(df_x, df_y):
    columns = list(df_x.columns)
    df_x = df_x.values
    rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
    feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)
    df_x_selected = feat_selector.fit_transform(df_x, df_y)
    selected_cols = []
    for (f,i) in zip(columns, feat_selector.support_):
        if i == True:
            selected_cols.append(f)
    return df_x_selected, selected_cols#, list(df_x_selected.columns)

@timing
def rfe(df_x, df_y):
    columns = list(df_x.columns)
    rfc = RandomForestClassifier(random_state=101)
    rfecv = RFECV(estimator=rfc, step=3,
                     cv=StratifiedKFold(2), scoring='f1_weighted')
    # rfecv = RFECV(estimator=rfc, step=1, cv=StratifiedKFold(2), scoring="f1_weighted")
    df_x_selected = rfecv.fit_transform(df_x, df_y)
    print(df_x_selected.shape)
    selected_cols = []
    for (f, i) in zip(columns, rfecv.support_):
        if i == True:
            selected_cols.append(f)
    return df_x_selected, selected_cols #, list(df_x_selected.columns)

@timing
def rf(df_x, df_y):
    rf = RandomForestClassifier(n_jobs=-1, random_state=1, n_estimators=20)
    rf.fit(df_x, df_y)
    names = df_x.columns
    feat_imp = rf.feature_importances_
    print ("Features sorted by their score:")
    print(sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), names),
           reverse=True))
    print(feat_imp)
    feats_selected = df_x.columns[feat_imp > 0.01]
    # print(df_x_selected.shape)
    return df_x[feats_selected], feats_selected #, list(df_x_selected.columns)

# class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
#     def __init__(self, models):
#         self.models_ = models
#
#     def fit(self, X, y):
#         for model in self.models_:
#             model.fit(X, y)
#
#     def predict(self, X):
#         predictions = np.column_stack([
#             model.predict(X) for model in self.models_
#         ])
#         return np.mean(predictions, axis=1)

class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        # self.models_ = models
        self.voting_classifier = VotingClassifier(estimators=[models], voting='hard')

    def fit(self, X, y):
        self.voting_classifier.fit(X, y)

    def predict(self, X):
        return self.voting_classifier.predict(X)

    # > eclf3 = VotingClassifier(estimators=[
    #     ...('lr', clf1), ('rf', clf2), ('gnb', clf3)],
    #     ...
    # voting = 'soft', weights = [2, 1, 1],
    #                            ...
    # flatten_transform = True)
    # >> > eclf3 = eclf3.fit(X, y)
    # >> > print(eclf3.predict(X))

def return_models():
        models = []

        ### random forest model
#         models.append( ('rf', RandomForestClassifier(max_depth=5, random_state=42, n_estimators=5,
#                                          verbose=0) ) ) # Train the model on training data
        models.append( ('knn', KNeighborsClassifier(3)) )

#         models.append(('svc',SVC(kernel="linear", C=0.025)))
#         models.append(('dt', DecisionTreeClassifier(max_depth=5)))
#         models.append(('ada', AdaBoostClassifier()))
#         models.append(('gaussian', GaussianNB()))
#         # models.append(('nusvc', NuSVC()))
#         models.append(('linear_svc', LinearSVC(max_iter=10000)))
#         models.append(('sgd_classifier', SGDClassifier()))
#         models.append(('logisit_reg',LogisticRegressionCV(max_iter=10000)))
#         models.append(('bagging_classifier', BaggingClassifier()))
#         models.append(('extra_trees', ExtraTreesClassifier()))

        return models


#####################

# X = pd.read_csv('./examples/test_X', index_col=0)
# y = pd.read_csv('./examples/test_y', header=None, index_col=0)
# y = y.ravel()

# X, y = pickle.load(open( "./train_2C_simulation_x.pkl", "rb" )),pickle.load(open( "./train_y.pkl", "rb" )) # pickle.load(open( "./train_2C_simulation_x.pkl", "rb" ))
# X = np.nan_to_num(X)
# X_train, X_test, y_train, y_test = prepare_undersample_train_test(X, y)

# x_standard_scaler = StandardScaler()
# X_train_selected_transformed = x_standard_scaler.fit_transform(X_train)
# X_test_selected_transformed = x_standard_scaler.transform(X_test)

# y_label_encoder = LabelEncoder()
# y_train_encoded = y_label_encoder.fit_transform(y_train)
# # y_test_encoded = y_label_encoder.transform(y_test)

# # X_train, X_test, y_train, y_test = prepare_train_test(X,y)
# models = return_models()
# voting_classifier = VotingClassifier(estimators=models, voting='hard')
# voting_classifier.fit(X_train, y_train)
# cv = model_selection.cross_val_score(voting_classifier, X_train, y_train,cv=10, scoring='f1')
# print(cv)
# print(np.mean(cv))
# y_pred = voting_classifier.predict(X_test)
# print('f1 score {}'.format(f1_score(y_test, y_pred, zero_division=1)))

In [86]:
X, y = pickle.load(open( "./train_2C_simulation_x.pkl", "rb" )),pickle.load(open( "./train_y.pkl", "rb" )) # pickle.load(open( "./train_2C_simulation_x.pkl", "rb" ))


In [88]:
X_train, X_test, y_train, y_test = prepare_undersample_train_test(X, y)

x_standard_scaler = StandardScaler()
X_train_selected_transformed = x_standard_scaler.fit_transform(X_train)
X_test_selected_transformed = x_standard_scaler.transform(X_test)

y_label_encoder = LabelEncoder()
y_train_encoded = y_label_encoder.fit_transform(y_train)
# y_test_encoded = y_label_encoder.transform(y_test)

# X_train, X_test, y_train, y_test = prepare_train_test(X,y)
models = return_models()
voting_classifier = VotingClassifier(estimators=models, voting='hard')
voting_classifier.fit(X_train, y_train)
cv = model_selection.cross_val_score(voting_classifier, X_train, y_train,cv=10, scoring='f1')
print(cv)
print(np.mean(cv))
y_pred = voting_classifier.predict(X_test)
print('f1 score {}'.format(f1_score(y_test, y_pred, zero_division=1)))

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  if sys.path[0] == '':


True
rf
Features sorted by their score:
[(0.0766, '00_012_39'), (0.0437, '00_000_37'), (0.0409, '00_018_12'), (0.0393, '00_012_9'), (0.0387, '00_024_34'), (0.0383, '00_024_4'), (0.038, '00_000_39'), (0.0373, '00_030_9'), (0.0368, '00_030_16'), (0.0368, '00_006_34'), (0.0362, '00_015_35'), (0.036, '00_021_39'), (0.0359, '00_027_4'), (0.0357, '00_030_17'), (0.0356, '00_030_13'), (0.0348, '00_015_40'), (0.0343, '00_006_44'), (0.0299, '00_021_34'), (0.0219, '00_027_45'), (0.0171, '00_015_32'), (0.0161, '00_003_38'), (0.0119, '353200272_Avg-wind-speed-past-10-mts'), (0.0101, '00_024_16'), (0.0101, '00_015_11'), (0.0094, '00_009_10'), (0.009, '00_003_7'), (0.0088, '00_030_37'), (0.0084, '00_012_30'), (0.0082, '353200272_Air-temperature'), (0.0082, '00_012_20'), (0.0077, '253200080_Precipitation-sum-past-24-hrs'), (0.0075, '00_018_44'), (0.0074, '00_030_20'), (0.0072, '00_009_28'), (0.007, '00_030_21'), (0.0064, '253200080_Precipitation-sum-past-1-hrs'), (0.006, '00_003_47'), (0.0057, '00_003

In [80]:
y_train.shape

(100,)