-
Notifications
You must be signed in to change notification settings - Fork 0
/
AutomateHive.py
106 lines (84 loc) · 3.53 KB
/
AutomateHive.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import os
import numpy as np
import pandas as pd
import sys
#
# davej23 03/03/21
#
#
# Outputs how to use program
#
def usage():
print('Usage (e.g. python AutomateHive.py --input text.csv --dbname firstdb --tablename hellotable)\n', '--input -> Flag to input csv to new table\n', '--dbname -> Name of database table goes in\n', '--tablename -> Specify name of table\n')
#
# Generates column names for table
#
def colNameExtractor(file):
file = pd.read_csv(file) # read csv with headers, remove index column
column_names = file.columns # extract column names
column_types = [type(file[n].values[1]) for n in column_names] # find data type for first entry in each column
for i in range(len(column_types)): # for each column type, change to string, int or double where required
if column_types[i] == str: # if string and element doesnt have date, set as str
if '2011-05-01' not in file[column_names[i]].values[1]:
column_types[i] = 'string'
else:
column_types[i] = 'timestamp'
elif column_types[i] in [int, np.int, np.int32, np.int64]:
column_types[i] = 'int'
elif column_types[i] in [float, np.float, np.float32, np.float64]:
column_types[i] = 'double'
return column_names, column_types
#
# Generates command for hive
#
def generator(datafile, table_name, db_name):
colnames, coltypes = colNameExtractor(datafile) # find column names and data types
create_table = 'CREATE TABLE IF NOT EXISTS {} '.format(table_name) # beginning string for creating hive table
features = [] # hold entries for table
for i in range(len(colnames)): # append to features the column names and types with commas and brackets when needed
if i == 0:
features.append('({} {}, '.format(colnames[i], coltypes[i]))
if 0 < i < len(colnames)-1:
features.append('{} {}, '.format(colnames[i], coltypes[i]))
elif i == len(colnames)-1:
features.append('{} {}) '.format(colnames[i], coltypes[i]))
table_style = ''.join(features) # join to a string (e.g. (id INT, age INT, gender String))
other_params = "ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY {} TBLPROPERTIES('skip.header.line.count'='1');".format(repr('\n')) # rest of command string
final_command = create_table + table_style + other_params # compose command
return final_command
#
# Function initial call, checks sys args
#
if len(sys.argv) == 1 or len(sys.argv) > 7 or len(sys.argv) == 6: # if not correct number of arguments, print usage
usage()
else: # if correct format, set appropriate names
if sys.argv[1] == '--input' and sys.argv[3] == '--dbname' and sys.argv[5] == '--tablename':
table_name = sys.argv[6]
db_name = sys.argv[4]
datafile = sys.argv[2]
else: # if anything incorrect, print usage
usage()
#
# Start composing commands as strings
#
hive_command = 'hive -e'
enter_database = 'USE {}'.format(db_name)
#
# Debugging -- if needed
#
#generator([['index','filename','age'],[1,2.2,3]], 'hello', 'hello')
#colNameExtractor([['index','filename','age'],[1,2.2,3]])
#a = [['index','filename','age'],[1,2.2,3]]
#
# Create final Hive command
#
final_command = hive_command + ' ' + '"{}; {} '.format(enter_database,generator(datafile, table_name, db_name)) + "LOAD DATA LOCAL INPATH '{}' INTO TABLE {};".format(datafile,table_name) + '"'
#
# Print final statement if needed
#
# print(final_command)
#
# Execute command in shell
#
os.system(final_command)
#print(final_command)