/
ipython_dataprep_input.py
136 lines (103 loc) · 4.42 KB
/
ipython_dataprep_input.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# Customarily, we import and start H2O as follows:
import h2o
h2o.init() # Will set up H2O cluster using all available cores
# NOTE(review): this second call runs immediately after the first one. It
# passes an explicit ip/port (the address looks like a placeholder) and is
# presumably shown as an alternative form of init, not as something to run
# back-to-back with the call above — confirm before executing this file as-is.
h2o.init(ip="123.45.67.89", port=54321)
# To create an H2OFrame object from a python tuple:
# zip(*...) transposes the three sequences into (1, 'a', 0.1)-style tuples.
# The result is wrapped in list() because zip returns a lazy iterator on
# Python 3, while H2OFrame expects a concrete container (list/tuple/dict).
# On Python 2, where zip already returns a list, list() is a harmless copy.
df = h2o.H2OFrame(list(zip(*((1, 2, 3),
                             ('a', 'b', 'c'),
                             (0.1, 0.2, 0.3)))))
df
# To create an H2OFrame object from a python list (same transposition and
# list() materialisation as in the tuple example):
df = h2o.H2OFrame(list(zip(*[[1, 2, 3],
                             ['a', 'b', 'c'],
                             [0.1, 0.2, 0.3]])))
df
# To create an H2OFrame object from a python dict (or collections.OrderedDict):
# each key ('A', 'B', 'C') maps to a list of three values.
df = h2o.H2OFrame({'A': [1, 2, 3],
                   'B': ['a', 'b', 'c'],
                   'C': [0.1, 0.2, 0.3]})
df
# To create an H2OFrame object from a dict with specified column types:
# column_types is a list of four type names supplied alongside the four
# keys 'A'..'D'.
# NOTE(review): presumably the list is matched to columns positionally —
# confirm how that interacts with the ordering of a plain dict.
df2 = h2o.H2OFrame.from_python({'A': [1, 2, 3],
'B': ['a', 'a', 'b'],
'C': ['hello', 'all', 'world'],
'D': ['12MAR2015:11:00:00', '13MAR2015:12:00:00', '14MAR2015:13:00:00']},
column_types=['numeric', 'enum', 'string', 'time'])
df2
# Echo the frame's `types` attribute to check the requested column types.
df2.types
import numpy as np
# Build a frame from a 100x4 nested list of standard-normal samples, with the
# column names 'A', 'B', 'C', 'D'. Note that this rebinds `df`.
df = h2o.H2OFrame.from_python(np.random.randn(100,4).tolist(), column_names=list('ABCD'))
# Viewing data: head/tail (tail is asked for 5 rows), the column names, and
# a describe() summary.
df.head()
df.tail(5)
df.columns
df.describe()
# Selection examples, in order: a single column name, a single integer index,
# a list of column names, an integer slice, and a row slice combined with
# `:` in the second position.
df['A']
df[1]
df[['B','C']]
df[0:2]
df[2:7, :]
# Selection on df2 using the comparison df2["B"] == "a" as the first index
# and `:` as the second.
df2[ df2["B"] == "a", :]
# Missing data: a five-row frame whose values include None, '' and the
# string 'NA' in various columns, with explicit column types.
df3 = h2o.H2OFrame.from_python({'A': [1, 2, 3,None,''],
'B': ['a', 'a', 'b', 'NA', 'NA'],
'C': ['hello', 'all', 'world', None, None],
'D': ['12MAR2015:11:00:00',None,'13MAR2015:12:00:00',None,'14MAR2015:13:00:00']},
column_types=['numeric', 'enum', 'string', 'time'])
df3
# isna() on column "A"; its result is then used as a row selector below.
df3["A"].isna()
# Assign 5 to column "A" for the rows selected by the isna() result.
df3[ df3["A"].isna(), "A"] = 5
df3
# df4 is built from the same literal data as df3 above; df3 itself was
# modified by the assignment, so a fresh frame is used for the mean examples.
df4 = h2o.H2OFrame.from_python({'A': [1, 2, 3,None,''],
'B': ['a', 'a', 'b', 'NA', 'NA'],
'C': ['hello', 'all', 'world', None, None],
'D': ['12MAR2015:11:00:00',None,'13MAR2015:12:00:00',None,'14MAR2015:13:00:00']},
column_types=['numeric', 'enum', 'string', 'time'])
# mean over the whole frame with na_rm=True, then over column "A" without
# and with na_rm=True.
df4.mean(na_rm=True)
df4["A"].mean() # check if this behaviour or the one above is a bug
df4["A"].mean(na_rm=True)
# Applying functions: a 100x4 frame of standard-normal samples, columns A-D.
df5 = h2o.H2OFrame.from_python(np.random.randn(100,4).tolist(), column_names=list('ABCD'))
# apply() with its default axis and a lambda that calls mean(na_rm=True) ...
df5.apply(lambda x: x.mean(na_rm=True))
# ... and with axis=1, where the lambda sums its argument (named `row`).
df5.apply(lambda row: sum(row), axis=1)
# Histogram example: a frame built from 100 random integers drawn from 0..6.
df6 = h2o.H2OFrame(np.random.randint(0, 7, size=100).tolist())
# NOTE(review): plot=False presumably returns the histogram data instead of
# drawing it — confirm against the H2OFrame.hist documentation.
df6.hist(plot=False)
# String operations on a frame built from a flat list of six words.
df7 = h2o.H2OFrame.from_python(['Hello', 'World', 'Welcome', 'To', 'H2O', 'World'])
df7
# countmatches with the pattern 'l'.
df7.countmatches('l')
# sub with pattern 'l' and replacement 'x'; the result is not assigned here.
df7.sub('l','x') #TODO in place, not quite right it seems
# strsplit with the regular-expression pattern '(l)+'.
df7.strsplit('(l)+')
# Combining frames: two independent 100x4 frames of standard-normal samples
# with identical column names A-D.
df8 = h2o.H2OFrame.from_python(np.random.randn(100,4).tolist(), column_names=list('ABCD'))
df9 = h2o.H2OFrame.from_python(np.random.randn(100,4).tolist(), column_names=list('ABCD'))
# rbind and cbind of df9 onto df8; neither result is assigned.
df8.rbind(df9)
df8.cbind(df9)
# Merge example: df10 has a string column 'A' and an integer column 'n'
# holding 0..5; df11 has a single column 'n' of 100 random integers from 0..9.
# 'n' is the only column name the two frames have in common.
df10 = h2o.H2OFrame.from_python( { 'A': ['Hello', 'World', 'Welcome', 'To', 'H2O', 'World'],
'n': [0,1,2,3,4,5]} )
df11 = h2o.H2OFrame.from_python([[x] for x in np.random.randint(0, 10, size=100).tolist()], column_names=['n'])
df11.merge(df10) #TODO appears to be broken
# in grouping section of doc now
# df12: eight rows with two string columns ('A', 'B') and two columns of
# standard-normal samples ('C', 'D'). It is reused by the categorical
# examples further down, so it must stay defined.
df12 = h2o.H2OFrame({'A' : ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
df12
# Group by 'A' and sum; `.frame` is read from the group-by result.
df12.group_by('A').sum().frame
# Group by both 'A' and 'B', keep the summed frame as df13 ...
df13 = df12.group_by(['A','B']).sum().frame
df13
# ... and merge it back onto df12.
df12.merge(df13)
# Time columns: a single column 'D' of three date-time strings, requested
# with column type 'time'.
df14 = h2o.H2OFrame.from_python({'D': ['18OCT2015:11:00:00','19OCT2015:12:00:00','20OCT2015:13:00:00']},
column_types=['time'])
df14.types
# Call day() and dayOfWeek() on the time column.
df14['D'].day()
df14['D'].dayOfWeek()
#Categorical section
# These examples reuse df12 from the grouping examples earlier in this file.
df12.types
df12.anyfactor()
df12["A"].levels()
# interaction() over columns 'A' and 'B' with pairwise=False, max_factors=3,
# min_occurrence=1; the result is not assigned.
df12.interaction(['A','B'], pairwise=False, max_factors=3, min_occurrence=1)
# interaction() with 'B' listed twice and max_factors=2; the result is kept
# as bb_df.
bb_df = df12.interaction(['B','B'], pairwise=False, max_factors=2, min_occurrence=1)
bb_df
# Column-bind the interaction result onto df12.
df15 = df12.cbind(bb_df)
df15
#### Saving and loading files section
# Both calls use the same placeholder path, and the second rebinds `df`.
# See the H2O documentation for how upload_file and import_file differ.
df = h2o.upload_file("/pathToFile/fileName")
df = h2o.import_file("/pathToFile/fileName")