# Setup

In [4]:
# # Init pyspark
# from pyspark import SparkContext
# sc = SparkContext.getOrCreate()
# # Init sparksql -- Only used to format the output nicely!
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import IntegerType
import json
import sys



# First dataset to load: a CSV file with a header row, path relative to the
# notebook's working directory.
inputFile = 'data/test_format1.csv'

# Build ONE configuration object and set every option on it.
# The previous version called `SparkConf().set(...)` on a fresh, discarded
# instance, so the timeParserPolicy setting never reached the context below.
conf = (
    SparkConf()
    .setAppName("SparkSQLAirTransit")
    .set("spark.sql.legacy.timeParserPolicy", "LEGACY")
)

# getOrCreate() hands back an already-running SparkContext when one exists;
# in that case `conf` is not applied, so restart the kernel after changing it.
sc = SparkContext.getOrCreate(conf=conf)
hiveCtx = HiveContext(sc)
# NOTE(review): the message says "tweets" but the file is a user/merchant CSV;
# the text is left unchanged so it still matches the saved output.
print("Loading tweets from " + inputFile)


Loading tweets from data/test_format1.csv


In [5]:
# Read the CSV at `inputFile`: the first line is the header and column types
# are inferred from the data.
# NOTE: `input` shadows Python's builtin of the same name; the name is kept
# because later cells read it (`df = input`, the temp-table registration).
input = hiveCtx.read.csv(inputFile, header=True, inferSchema=True)
input.printSchema()



root
 |-- user_id: integer (nullable = true)
 |-- merchant_id: integer (nullable = true)
 |-- prob: string (nullable = true)



In [6]:
# Sample Query

In [7]:

# Expose the loaded DataFrame to SQL under the name "air_transit".
# createOrReplaceTempView is the supported replacement for the deprecated
# registerTempTable; it (re)binds the view name to this DataFrame.
# The view name is kept as-is because the SQL cells further down query it.
input.createOrReplaceTempView("air_transit")

# Fetch three rows as a sanity check of the load.
myair_transits = hiveCtx.sql("SELECT * FROM air_transit  LIMIT 3")
print('myair_transits:' )
for item in myair_transits.collect():
    print(item, '\n')

myair_transits:
Row(user_id=163968, merchant_id=4605, prob=None) 

Row(user_id=360576, merchant_id=1581, prob=None) 

Row(user_id=98688, merchant_id=1964, prob=None) 



## Q1
Compute the total number of records.

In [15]:
# Q1: total number of rows in the DataFrame currently bound to `input`.
# `df` stays a notebook-level alias because the Q2 cell groups on it.
#
# NOTE(review): this cell's execution number (15) is later than the cell that
# rebinds `input` to test_format2 (12), and the saved count equals the count
# printed for that second file -- the saved output was probably produced
# against the wrong file. Re-run the notebook top to bottom to confirm.
df = input
record_total = df.count()
print('total number of user_id-merchant.', record_total)



total number of user_id-merchant. 7027943




In [10]:
## Q2
# Number of rows per user_id, showing the ten users with the most rows.
from pyspark.sql.functions import col

print('Q2 group user_id and count:')

# One output row per user_id with its row count, largest count first.
# Rows with equal counts may come back in any order.
df1 = df.groupBy("user_id").count().sort(col("count").desc())

# take(10) asks Spark for only ten rows. The previous `rdd.collect()[:10]`
# shipped every group to the driver and then threw all but ten away.
for item in df1.take(10):
    print(item['user_id'], item['count'])



Q2 group user_id and count:


                                                                                

265990 25
246371 13
216131 12
262623 11
50682 11
11501 11
72099 11
147143 11
55305 10
351748 10


In [12]:
# Switch to the second dataset. This rebinds both `inputFile` and `input`,
# so every later cell that reads `input` now sees this file.
inputFile = 'data/big/test_format2.csv'
input = hiveCtx.read.csv(inputFile, header=True, inferSchema=True)
input.printSchema()




root
 |-- user_id: integer (nullable = true)
 |-- age_range: integer (nullable = true)
 |-- gender: integer (nullable = true)
 |-- merchant_id: integer (nullable = true)
 |-- label: integer (nullable = true)
 |-- activity_log: string (nullable = true)



                                                                                

In [16]:
# Row count of the DataFrame currently bound to `input`.
# `df` is rebound here as well, so it follows the most recently loaded file.
df = input
activity_total = df.count()
print('total number of activity_log ', activity_total)



total number of activity_log  7027943


                                                                                

In [13]:
# Point the "air_transit" SQL view at the DataFrame currently bound to `input`,
# replacing any earlier binding of that view name.
# createOrReplaceTempView is the supported replacement for the deprecated
# registerTempTable.
input.createOrReplaceTempView("air_transit")

# Fetch three rows as a sanity check of the load.
myair_transits = hiveCtx.sql("SELECT * FROM air_transit  LIMIT 3")
print('myair_transits:' )
for item in myair_transits.collect():
    print(item, '\n')

myair_transits:
Row(user_id=163968, age_range=0, gender=0, merchant_id=4378, label=-1, activity_log='101206:812:6968:0614:0') 

Row(user_id=163968, age_range=0, gender=0, merchant_id=2300, label=-1, activity_log='588758:844:3833:0618:0#71782:844:3833:1111:2#71782:844:3833:1111:0#71782:844:3833:1111:0#71782:844:3833:1102:0#702201:844:3833:1102:0#71782:844:3833:1102:0#1009809:844:3833:1102:0#71782:844:3833:1110:0#71782:844:3833:1110:0#588758:844:3833:0618:2') 

Row(user_id=163968, age_range=0, gender=0, merchant_id=1551, label=-1, activity_log='312747:243:1954:0627:0#312747:243:1954:0627:0#312747:243:1954:0627:0#312747:243:1954:0627:2#312747:243:1954:0627:0') 



In [14]:
# Row count per age_range value, ordered by age_range.
# A NULL age_range is printed as None.
age_range_sql = "select age_range, count(*) as mycount from air_transit group by age_range order by age_range LIMIT 100"
mymonth = hiveCtx.sql(age_range_sql)

for row in mymonth.collect():
    print(row['age_range'], 'age_range', row['mycount'])
    



None age_range 19420
0 age_range 1345565
1 age_range 260
2 age_range 733323
3 age_range 1916611
4 age_range 1460542
5 age_range 752608
6 age_range 650358
7 age_range 128644
8 age_range 20612


                                                                                

In [17]:
# Row count per gender value, ordered by gender.
# A NULL gender is printed as None.
gender_sql = "select gender, count(*) as mycount from air_transit group by gender order by gender LIMIT 100"
mymonth = hiveCtx.sql(gender_sql)

for row in mymonth.collect():
    print(row['gender'], 'gender', row['mycount'])
    



None gender 63250
0 gender 5062667
1 gender 1643382
2 gender 258644


                                                                                

In [18]:
# Row count per label value, ordered by label.
# The printed legend says: label 1 = repeat buyer, label 0 = non-repeat buyer.
label_sql = "select label, count(*) as mycount from air_transit group by label order by label LIMIT 100"
mymonth = hiveCtx.sql(label_sql)
print('label 1 is 重复买家  ，label 0 是非重复买家')

for row in mymonth.collect():
    print(row['label'], 'label', row['mycount'])
    

label 1 is 重复买家  ，label 0 是非重复买家




None label 261477
-1 label 6766466


                                                                                

In [19]:
# Part 3: switch to the user activity log. This rebinds `inputFile` and
# `input` again, so later cells that read `input` operate on this file.
print('part 3')
inputFile = 'data/big/user_log_format1.csv'
input = hiveCtx.read.csv(inputFile, header=True, inferSchema=True)
input.printSchema()

part 3




root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- cat_id: integer (nullable = true)
 |-- seller_id: integer (nullable = true)
 |-- brand_id: integer (nullable = true)
 |-- time_stamp: integer (nullable = true)
 |-- action_type: integer (nullable = true)



                                                                                

In [20]:
# Row count of the DataFrame currently bound to `input`.
# `df` is rebound here as well, so it follows the most recently loaded file.
df = input
activity_total = df.count()
print('total number of activity_log ', activity_total)



total number of activity_log  54925330




In [21]:
# Point the "air_transit" SQL view at the DataFrame currently bound to `input`,
# replacing any earlier binding of that view name.
# createOrReplaceTempView is the supported replacement for the deprecated
# registerTempTable.
input.createOrReplaceTempView("air_transit")

# Fetch three rows as a sanity check of the load.
myair_transits = hiveCtx.sql("SELECT * FROM air_transit  LIMIT 3")
print('myair_transits:' )
for item in myair_transits.collect():
    print(item, '\n')

myair_transits:
Row(user_id=328862, item_id=323294, cat_id=833, seller_id=2882, brand_id=2661, time_stamp=829, action_type=0) 

Row(user_id=328862, item_id=844400, cat_id=1271, seller_id=2882, brand_id=2661, time_stamp=829, action_type=0) 

Row(user_id=328862, item_id=575153, cat_id=1271, seller_id=2882, brand_id=2661, time_stamp=829, action_type=0) 



In [24]:
# The ten brand_id values with the most rows, largest count first.
top_brand_sql = "select brand_id, count(*) as mycount from air_transit group by brand_id order by mycount desc LIMIT 10"
mymonth = hiveCtx.sql(top_brand_sql)

for row in mymonth.collect():
    print(row['brand_id'], 'brand_id', row['mycount'])
    



3738 brand_id 763345
1360 brand_id 737545
1446 brand_id 729555
1214 brand_id 541075
5376 brand_id 528003
82 brand_id 503911
2276 brand_id 491738
8235 brand_id 400024
4705 brand_id 363417
1662 brand_id 332633




In [25]:
# Row count per action_type value, largest count first (at most ten rows).
action_sql = "select action_type, count(*) as mycount from air_transit group by action_type order by mycount desc LIMIT 10"
mymonth = hiveCtx.sql(action_sql)

for row in mymonth.collect():
    print(row['action_type'], 'action_type', row['mycount'])
    



0 action_type 48550713
2 action_type 3292144
3 action_type 3005723
1 action_type 76750


                                                                                