-
Notifications
You must be signed in to change notification settings - Fork 0
/
utilities.py
96 lines (72 loc) · 2.78 KB
/
utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import csv
from timeit import default_timer as timer
import numpy as np
from sklearn import preprocessing
def readCSV(fileDataset, delimiter, nameColSalary):
    """Read a CSV dataset and split the salary column off from the other columns.

    The first row of the file is treated as the header; the position of
    ``nameColSalary`` in it selects the salary column.

    Args:
        fileDataset: path of the CSV file to read.
        delimiter: field delimiter passed to ``csv.reader``.
        nameColSalary: header name of the salary column.

    Returns:
        A tuple ``(x, y)``: ``x`` is a list of rows (lists of strings) with
        the salary cell removed, ``y`` is the list of salaries as floats,
        in the same order as ``x``.

    Raises:
        ValueError: if ``nameColSalary`` is not present in the header row.
        StopIteration: if the file has no header row at all.

    Rows are skipped, not reported, when they are too short to contain the
    salary column (blank lines included) or when the salary cell cannot be
    parsed as a float. The elapsed time is printed to stdout.
    """
    start = timer()
    x = []
    y = []
    # newline="" lets the csv module do its own line-ending handling, which
    # is what the csv documentation requires for files passed to csv.reader.
    with open(fileDataset, newline="") as csvfile:
        reader = csv.reader(csvfile, delimiter=delimiter)
        xLabels = next(reader)
        salaryIndex = xLabels.index(nameColSalary)
        for row in reader:
            # csv.reader yields [] for blank lines; such rows (and any other
            # row that stops before the salary column) have no salary to read.
            if len(row) <= salaryIndex:
                continue
            try:
                salary = float(row[salaryIndex])
            except ValueError:
                # Non-numeric salary: the row is deliberately dropped.
                continue
            del row[salaryIndex]
            x.append(row)
            y.append(salary)
    end = timer()
    print("csv reading time: ", (end - start))
    return x, y
def filterColumns(x, nonAvailableValue, acceptedNA):
    """Remove the columns of the dataset that have too many missing values.

    Each column of ``x`` is passed to ``containsTooMuchNA`` together with
    ``nonAvailableValue`` and ``acceptedNA``; the column is kept only when
    that call returns False.

    Args:
        x: the dataset as a sequence of rows (converted with NumPy).
        nonAvailableValue: the cell value that marks a missing entry.
        acceptedNA: threshold forwarded to ``containsTooMuchNA``.

    Returns:
        A NumPy array holding the surviving columns in their original order,
        one row per input row. When ``x`` is two-dimensional and no column
        survives, the result has shape ``(rows, 0)``.

    The elapsed time is printed to stdout.
    """
    start = timer()
    columns = np.transpose(x)
    kept = [
        col
        for col in columns
        if not containsTooMuchNA(col, nonAvailableValue, acceptedNA)
    ]
    if kept or columns.ndim != 2:
        xReturn = np.transpose(kept)
    else:
        # Transposing an empty list would collapse to a 1-D array of shape
        # (0,) and forget how many rows the dataset has; build the empty
        # result explicitly so the row dimension is preserved.
        xReturn = np.empty((columns.shape[1], 0), dtype=columns.dtype)
    end = timer()
    print("filtering columns time: ", (end - start))
    return xReturn
def containsTooMuchNA(column, nonAvailableValue, acceptedNA):
    """Tell whether a column holds too many missing values.

    Args:
        column: sized iterable of cells.
        nonAvailableValue: the cell value that marks a missing entry
            (compared with ``==``).
        acceptedNA: maximum accepted fraction of missing cells.

    Returns:
        True when the fraction of cells equal to ``nonAvailableValue`` is
        strictly greater than ``acceptedNA``, otherwise False. An empty
        column has no missing cells and yields False.
    """
    total = len(column)
    if total == 0:
        # Avoid dividing by zero: nothing is missing in an empty column.
        return False
    numNA = sum(1 for cell in column if cell == nonAvailableValue)
    return (numNA / total) > acceptedNA
def matrixFromStringsToNumbers(x):
    """Transform a dataset of string cells into a dataset of numeric labels.

    One ``preprocessing.LabelEncoder`` is fitted per column, then every
    column is replaced by the encoder's output for it.

    Args:
        x: the dataset as a sequence of rows; the column count is taken
            from the first row.

    Returns:
        The transformed dataset as a list of rows, each row being a NumPy
        array. An empty dataset (or one whose rows have no columns) yields
        an empty list.

    The time spent in each of the two phases is printed to stdout.
    """
    # --- label encoders creation ---
    start = timer()
    # An empty dataset has no first row to measure; treat it as zero columns.
    numCols = len(x[0]) if len(x) > 0 else 0
    les = []  # Label EncoderS, one per dataset column
    for c in range(numCols):
        # le turns the strings of the c-th column into numbers, and vice versa
        le = preprocessing.LabelEncoder()
        le.fit([row[c] for row in x])
        les.append(le)
    end = timer()
    print("LabelEncoders creation time: ", (end - start))

    # --- effective x transformation ---
    start = timer()
    # Each column of x is transformed and stored in xTransp as a row ...
    xTransp = [les[c].transform([row[c] for row in x]) for c in range(numCols)]
    # ... so transposing xTransp gives back the row-major, numbered dataset.
    x = list(np.array(xTransp).transpose())
    end = timer()
    print("x transformation time: ", (end - start))
    return x