In [1]:
import math
import pandas as pd
import numpy as np
from sklearn import *
import matplotlib.pyplot as plt
import xgboost as xgb
from pandas.plotting import scatter_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import PowerTransformer, PolynomialFeatures
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# **Data inspection and preprocessing**

In [2]:
# Load the pickled dataset bundle and pull out its three splits.
# Caution: unpickling can execute arbitrary code, so only load a file you trust.
data = pd.read_pickle('./ass3.pickle')
train, dev, test = (data[split_key] for split_key in ('train', 'dev', 'test'))

In [3]:
# Print a quick overview of the training frame and the size of every split.
n_features = len(train.columns) - 1  # column count minus one (the 'target' column)
target_values = train['target'].unique()
distinct_counts = [(col, len(train[col].unique())) for col in train]

print(f"number of features: {n_features}")
print(f"types of labels: {target_values}")
print(f"number of different values: {distinct_counts}")
for split_name, split_frame in (("train", train), ("dev", dev), ("test", test)):
    print(f"number of rows in {split_name}: {len(split_frame)}")


number of features: 8
types of labels: [1.695 2.796 1.322 ... 3.508 4.321 2.355]
number of different values: [('f0', 8588), ('f1', 53), ('f2', 11722), ('f3', 9386), ('f4', 3305), ('f5', 11485), ('f6', 799), ('f7', 783), ('target', 3391)]
number of rows in train: 12384
number of rows in dev: 4128
number of rows in test: 4128


In [4]:
# Count the missing entries in each column of the training frame
train.isna().sum()

f0        174
f1        140
f2        158
f3        156
f4        169
f5        142
f6        151
f7        148
target      0
dtype: int64

In [5]:
# Show per-column summary statistics for the training data
# (count, mean, std, min, quartiles, max)
train.describe()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,target
count,12210.0,12244.0,12226.0,12228.0,12215.0,12242.0,12233.0,12236.0,12384.0
mean,3.872771,28.630595,5.420978,1.096626,1426.830618,3.144714,35.626833,-119.56104,2.066362
std,1.919183,12.566127,2.382548,0.471398,1103.528284,13.440452,2.133539,1.996646,1.147908
min,0.4999,1.0,0.846154,0.5,3.0,0.692308,32.55,-124.35,0.14999
25%,2.5556,18.0,4.430232,1.006386,786.0,2.428571,33.94,-121.79,1.198
50%,3.5341,29.0,5.218429,1.049202,1170.0,2.816384,34.25,-118.49,1.798
75%,4.745975,37.0,6.043349,1.099202,1739.0,3.276456,37.71,-118.02,2.646
max,15.0001,52.0,132.533333,34.066667,28566.0,1243.333333,41.95,-114.55,5.00001


In [6]:
# Describe the training features separately for each distinct value of 'target'.
# NOTE(review): 'target' looks continuous (3391 distinct values across 12384 rows
# according to the summary printed earlier), so most groups hold a single row and
# report NaN for std. Binning the target first (e.g. with pd.qcut) would probably
# give a more informative table — confirm the intent.
train.groupby('target').describe()

Unnamed: 0_level_0,f0,f0,f0,f0,f0,f0,f0,f0,f1,f1,...,f6,f6,f7,f7,f7,f7,f7,f7,f7,f7
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0.14999,2.0,1.098350,0.795283,0.5360,0.817175,1.09835,1.379525,1.6607,3.0,34.666667,...,40.010,40.31,3.0,-121.256667,2.949446,-123.17,-122.955,-122.740,-120.30,-117.86
0.17500,1.0,2.366700,,2.3667,2.366700,2.36670,2.366700,2.3667,1.0,39.000000,...,34.150,34.15,1.0,-118.330000,,-118.33,-118.330,-118.330,-118.33,-118.33
0.22500,1.0,1.091800,,1.0918,1.091800,1.09180,1.091800,1.0918,1.0,52.000000,...,32.710,32.71,1.0,-117.160000,,-117.16,-117.160,-117.160,-117.16,-117.16
0.25000,1.0,0.857100,,0.8571,0.857100,0.85710,0.857100,0.8571,1.0,21.000000,...,32.790,32.79,1.0,-114.650000,,-114.65,-114.650,-114.650,-114.65,-114.65
0.27500,1.0,1.265600,,1.2656,1.265600,1.26560,1.265600,1.2656,1.0,17.000000,...,33.920,33.92,1.0,-114.670000,,-114.67,-114.670,-114.670,-114.67,-114.67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4.98800,1.0,8.248000,,8.2480,8.248000,8.24800,8.248000,8.2480,1.0,29.000000,...,37.330,37.33,1.0,-122.060000,,-122.06,-122.060,-122.060,-122.06,-122.06
4.99000,1.0,8.148900,,8.1489,8.148900,8.14890,8.148900,8.1489,1.0,18.000000,...,37.890,37.89,1.0,-122.180000,,-122.18,-122.180,-122.180,-122.18,-122.18
4.99100,1.0,6.786100,,6.7861,6.786100,6.78610,6.786100,6.7861,1.0,28.000000,...,33.550,33.55,1.0,-117.770000,,-117.77,-117.770,-117.770,-117.77,-117.77
5.00000,17.0,3.452000,1.224370,1.2656,2.610300,3.37150,4.141700,6.0199,18.0,38.333333,...,37.785,37.80,17.0,-120.200000,2.171532,-122.49,-122.440,-119.690,-118.31,-117.38
