In [1]:
# To suppress warnings
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)
#!pip install msgpack-python
import numpy as np
import pandas as pd
#!pip install pickle  --upgrade pip
import pickle
%matplotlib inline

# For loading .arff files
from scipy.io import arff

# To perform mean imputation
from sklearn.preprocessing import Imputer

# Formatted counter of class labels
from collections import Counter
# Ordered Dictionary
from collections import OrderedDict
#To perform kFold Cross Validation
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

# Library imbalanced-learn to deal with the data imbalance. To use SMOTE oversampling
!pip install imblearn
from imblearn.over_sampling import SMOTE 

# Importing classification models
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedBaggingClassifier

import random

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve



In [2]:
# Read the first-year ARFF file; loadarff hands back the records and their metadata.
df1, meta1 = arff.loadarff('1year.arff');
# Wrap the loaded records in a DataFrame, which is what the rest of the notebook works on.
df1 = pd.DataFrame(df1);
def set_new_headers(dataframes):
    """Rename the columns of ``dataframes`` in place.

    Every column except the last is named ``X1``, ``X2``, ... in positional
    order; the last column is named ``Y``. Nothing is returned.
    """
    n_features = len(dataframes.columns) - 1
    feature_names = ['X%d' % position for position in range(1, n_features + 1)]
    dataframes.columns = feature_names + ['Y']
# Rename df1's columns in place to X1..Xn plus a trailing Y label column.
set_new_headers(df1)  

In [3]:
# Preview the first rows to confirm the renamed columns.
df1.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X56,X57,X58,X59,X60,X61,X62,X63,X64,Y
0,0.20055,0.37951,0.39641,2.0472,32.351,0.38825,0.24976,1.3305,1.1389,0.50494,...,0.12196,0.39718,0.87804,0.001924,8.416,5.1372,82.658,4.4158,7.4277,b'0'
1,0.20912,0.49988,0.47225,1.9447,14.786,0.0,0.25834,0.99601,1.6996,0.49788,...,0.1213,0.42002,0.853,0.0,4.1486,3.2732,107.35,3.4,60.987,b'0'
2,0.24866,0.69592,0.26713,1.5548,-1.1523,0.0,0.30906,0.43695,1.309,0.30408,...,0.24114,0.81774,0.76599,0.69484,4.9909,3.951,134.27,2.7185,5.2078,b'0'
3,0.081483,0.30734,0.45879,2.4928,51.952,0.14988,0.092704,1.8661,1.0571,0.57353,...,0.054015,0.14207,0.94598,0.0,4.5746,3.6147,86.435,4.2228,5.5497,b'0'
4,0.18732,0.61323,0.2296,1.4063,-7.3128,0.18732,0.18732,0.6307,1.1559,0.38677,...,0.13485,0.48431,0.86515,0.12444,6.3985,4.3158,127.21,2.8692,7.898,b'0'


In [4]:
# Convert the dtypes of all the columns (other than the class label column) to float.
def convert_columns_type_float(dfs):
    """Cast every feature column of ``dfs`` to float, in place.

    All columns except the last one (the class label) are converted; the
    label column is left untouched. Nothing is returned.

    The column range is derived from the frame itself. The earlier version
    walked the fixed positions 1..63, which skipped the first feature column
    and raised IndexError on frames with fewer than 64 columns.
    """
    for colname in dfs.columns[:-1]:
        # Item access rather than getattr, so a column whose name collides
        # with a DataFrame attribute is still looked up as a column.
        dfs[colname] = dfs[colname].astype(float)

# Apply the float conversion to df1; the function modifies the frame it is given.
convert_columns_type_float(df1)


# The class label column is loaded with an object dtype.
# Cast it to int so the labels can be used as numeric classes.
def convert_class_label_type_int(dfs):
    """Cast the ``Y`` column of ``dfs`` to int, in place. Nothing is returned."""
    dfs['Y'] = dfs.Y.astype(int)
        
# Apply the int conversion to df1's Y column; the function modifies the frame it is given.
convert_class_label_type_int(df1)

In [5]:
# Build a cleaned frame that keeps only the rows without any missing value
def drop_nan_rows(dataframes, verbose=False):
    """Return ``dataframes`` without the rows that hold a missing value.

    A row is removed as soon as any of its columns is missing. When
    ``verbose`` is true, the row count before and after the drop and the
    number of removed rows are printed on one line.
    """
    clean_dataframes = dataframes.dropna(how='any', axis='index')
    if not verbose:
        return clean_dataframes
    original_count = len(dataframes)
    cleaned_count = len(clean_dataframes)
    print('Original Length=', original_count, '\tCleaned Length=', cleaned_count, '\tMissing Data=', original_count - cleaned_count)
    return clean_dataframes

# Quick check of how many rows of df1 contain at least one missing value.
# verbose=True prints the row counts before and after dropping those rows.
nan_dropped_dataframes = drop_nan_rows(df1, verbose=True)

Original Length= 7027 	Cleaned Length= 3194 	Missing Data= 3833


In [6]:
def perform_mean_imputation(dfs):
    """Return a copy of ``dfs`` with missing values replaced by column means.

    Each missing entry is filled with the mean of the non-missing values of
    its own column. The result has float columns, the same column names as
    ``dfs`` and a fresh 0..n-1 row index; ``dfs`` itself is not modified.

    The imputation is done with pandas, so this function does not depend on
    ``sklearn.preprocessing.Imputer``, which recent scikit-learn releases no
    longer provide.
    """
    # Work on a float copy so integer columns can hold the (float) mean.
    numeric = dfs.astype(float)
    # Per-column means over the non-missing values only.
    column_means = numeric.mean(axis=0, skipna=True)
    mean_imputed_dfs = numeric.fillna(column_means).reset_index(drop=True)
    mean_imputed_dfs.columns = dfs.columns
    return mean_imputed_dfs

# Build the mean-imputed version of df1 that the later cells work on.
mean_imputed_dataframes = perform_mean_imputation(df1)

In [7]:
def check_data_imbalance(dfs):
    """Print the class distribution of ``dfs`` and the share of label 1.

    Prints, in order: a dataset header, the number of rows per value of the
    ``Y`` column, the percentage of rows whose ``Y`` equals 1, and a
    separator line of 64 dashes. Nothing is returned.

    An empty frame reports 0.0% instead of raising ZeroDivisionError.
    """
    print('Dataset: '+str(1)+'styear')
    print(dfs.groupby('Y').size())
    labels = dfs['Y']
    total = len(labels)
    # Count on the Series directly instead of materialising it as a list twice.
    minority_count = int((labels == 1).sum())
    # Guard the division so a frame without rows does not crash the report.
    minority_percent = (minority_count / total)*100 if total else 0.0
    print('Minority (label 1) percentage: '+  str(minority_percent) + '%')
    print('-'*64)
        
# Report the class balance of df1.
check_data_imbalance(df1)

Dataset: 1styear
Y
0    6756
1     271
dtype: int64
Minority (label 1) percentage: 3.856553294435748%
----------------------------------------------------------------


In [8]:
# Split a dataframe into its feature columns and its label column
def split_dataframes_features_labels(dfs):
    """Separate ``dfs`` into features and labels by column position.

    The last column is taken as the label and every column before it as a
    feature, matching the layout ``set_new_headers`` produces (X1..Xn, Y).
    For a 65-column frame this is the same split as the former hard-coded
    ``0:64`` / ``64`` positions, but narrower frames no longer raise
    IndexError.

    Returns a ``(feature_dfs, label_dfs)`` pair: a DataFrame with all but the
    last column, and the last column as a Series.
    """
    feature_dfs = dfs.iloc[:, :-1]
    label_dfs = dfs.iloc[:, -1]
    return feature_dfs, label_dfs

In [9]:
# Separate the mean-imputed frame into features and labels.
feature,label = split_dataframes_features_labels(mean_imputed_dataframes)
# Seed the oversampler so the resampled data is the same on every run,
# in line with the seeded train/test split below.
m1 = SMOTE(random_state = 0)
# imbalanced-learn renamed `fit_sample` to `fit_resample`; use the current
# name when it exists and fall back to the old one on older releases.
if hasattr(m1, 'fit_resample'):
    x,y = m1.fit_resample(feature,label)
else:
    x,y = m1.fit_sample(feature,label)
# Put the resampled features and labels back into one frame with X1..X64, Y headers.
x = pd.DataFrame(x)
y = pd.DataFrame(y)
data = pd.concat([x,y],axis = 1)
set_new_headers(data)
data_feature = data.iloc[:,0:64]
# Sliced with 64: (not 64), so the labels stay a one-column DataFrame.
data_label = data.iloc[:,64:]
# NOTE(review): the oversampling happens before this split, so resampled rows
# end up in both the train and the test part. Presumably the split should come
# first and only the training part be oversampled - confirm before relying on
# any score computed from data_feature_test.
data_feature_train,data_feature_test,data_label_train,data_label_test = train_test_split(data_feature,data_label,test_size = .3,random_state = 0)

In [10]:
# Balanced Bagging Classifier: 10 estimators, each a random forest using the
# entropy criterion, with bootstrap sampling enabled.
# random_state is fixed so that repeated runs build the same ensemble.
bb_classifier = BalancedBaggingClassifier(base_estimator = RandomForestClassifier(criterion='entropy'), n_estimators = 10, bootstrap = True, random_state = 0)


In [20]:
# Fit the ensemble on the complete resampled data set.
# NOTE(review): the train/test split created earlier is not used here; the
# model is fit on data_feature/data_label and then predicts those same rows.
# The result of fit() is bound to `y`, replacing the resampled label frame
# that was stored under that name before.
y = bb_classifier.fit(data_feature,data_label)
predicted_data = y.predict(data_feature)
predicted_data = pd.DataFrame(predicted_data)
# Write the predictions to a comma-separated file in the working directory.
predicted_data.to_csv('predicted_data.csv',sep = ',')

In [21]:
# Persist the fitted classifier to disk.
filename = 'finalized_model.sav'
# A context manager closes the file as soon as the dump finishes, so the data
# is fully written before any later cell reads it back.
with open(filename, 'wb') as model_file:
    pickle.dump(bb_classifier, model_file)

In [23]:
# Reload the pickled classifier; the context manager closes the file handle.
# Unpickling can execute arbitrary code, so only load files written by this
# notebook or another trusted source.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# NOTE(review): this scores the model on data_feature/data_label, the same
# rows it was fit on, so the value is a training score rather than an
# estimate of performance on unseen data.
result = loaded_model.score(data_feature, data_label)
print(result)

0.9998519834221433
