In [1]:
import pyspark
myConf=pyspark.SparkConf()
spark = pyspark.sql.SparkSession\
    .builder\
    .master("local")\
    .appName("myApp")\
    .config(conf=myConf)\
    .getOrCreate()

# 데이터 프레임 생성

## DF의 생성방법
- 내부: spark.createDataFrame() // python list or dict 또는 pyspa가.sql.Row
- 외부: spark.createDataFrame()또는 spark.read() // RDD나 JSON(csv)

## DF schema
- 정해주기 : schema 정해서 DF 생성
- 안정해주기 : spark가 자동으로 유추하여 임의로 결정

## DF API
- spark.read.json('employee.json')
- df.show()
- df.columns()
- df.printSchema()
- df.filter(dfs("age")>23).show()
- df.groupBy("age").count().show()
- df.dropna()/df.na.drop()
- df.count() #행세기
- df.drop("name")

## schema
- columns 정해서 spark.createDataFrame(list,col)
- **row 객체를 사용하여 생성**

In [4]:
myList=[('1','kim, js', 170),
        ('1','lee, sm', 175),
        ('2','lim, yg',180),
        ('2','lee', 170)]
myDf=spark.createDataFrame(myList)

In [5]:
myDf.columns

['_1', '_2', '_3']

In [6]:
myDf

DataFrame[_1: string, _2: string, _3: bigint]

In [7]:
myDf.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: long (nullable = true)



In [8]:
print(myDf.take(1))

[Row(_1='1', _2='kim, js', _3=170)]


### 컬럼명 설정

In [9]:
cols = ['year','name','height']
_myDf = spark.createDataFrame(myList, cols)

In [10]:
_myDf.columns

['year', 'name', 'height']

In [11]:
_myDf.columns

['year', 'name', 'height']

In [12]:
names = ["kim","lee","lee","lim"]
#4개의 문자열을 list로
items = ["espresso","latte","americano","affocato","long black","macciato"]
coffeeDf = spark.createDataFrame([(names[i%4], items[i%6]) for i in range(100)],\
                                  ["name","coffee"])

In [13]:
coffeeDf.printSchema()

root
 |-- name: string (nullable = true)
 |-- coffee: string (nullable = true)



In [15]:
coffeeDf.show(10)

+----+----------+
|name|    coffee|
+----+----------+
| kim|  espresso|
| lee|     latte|
| lee| americano|
| lim|  affocato|
| kim|long black|
| lee|  macciato|
| lee|  espresso|
| lim|     latte|
| kim| americano|
| lee|  affocato|
+----+----------+
only showing top 10 rows



## ROW 객체를 사용하여 생성

In [16]:
from pyspark.sql import Row
Person = Row('year','name', 'height') #column 명 ; 행 명을 Person 이라고
row1=Person('1','kim, js',170) 

In [23]:
print ("row1: ", row1.year, row1.name, row1.height)

row1:  1 kim, js 170


### row를 Dictionary로 저장
#### row는 속성명과 값을 가지고 있기 때문에 dictionary 값으로 쉽게 변환가능

In [18]:
row1.asDict()

{'year': '1', 'name': 'kim, js', 'height': 170}

In [20]:
row1.asDict().keys() #.values() 가능

dict_keys(['year', 'name', 'height'])

In [21]:
Person = Row('year','name', 'height')
row1=Person('1','kim, js',170) 
myRows = [row1,
          Person('1','lee, sm', 175),
          Person('2','lim, yg',180),
          Person('2','lee',170)]
print(type(myRows))

<class 'list'>


In [24]:
myDf=spark.createDataFrame(myRows)

In [25]:
print (myDf.printSchema())
myDf.show()

root
 |-- year: string (nullable = true)
 |-- name: string (nullable = true)
 |-- height: long (nullable = true)

None
+----+-------+------+
|year|   name|height|
+----+-------+------+
|   1|kim, js|   170|
|   1|lee, sm|   175|
|   2|lim, yg|   180|
|   2|    lee|   170|
+----+-------+------+



-------------------------------------------------------------------
# schema 정의하고 생성

- 모델 schema 를 정하고, 데이터 타잎을 정의해 DataFrame을 생성해본다
- StructType으로 구조체를 선언하고, 컬럼에 대해 StructField를 설정한다
<br><br>
* **컬럼**의 명칭
* 앞서 소개했던 **데이터 타잎**
* 마지막은 **NULL**이 허용되는지 여부
<br><br>

```python
StructType([
    StructField(컬럼명, StringType(), True),
    ...
])
```

In [26]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, IntegerType
mySchema=StructType([
    StructField("year", StringType(), True),
    StructField("name", StringType(), True),
    StructField("height", IntegerType(), True)
])
print(type(mySchema))

<class 'pyspark.sql.types.StructType'>


In [27]:
myDf=spark.createDataFrame(myRows, mySchema)
#myRows 를 데이터로, mySchema에서는 컬럼명과 데이터 타잎을 정의하여 createDataFrame() 함수의 인자로 넘겨주고 있음

In [28]:
myDf.printSchema()

root
 |-- year: string (nullable = true)
 |-- name: string (nullable = true)
 |-- height: integer (nullable = true)



In [29]:
myDf.take(1)

[Row(year='1', name='kim, js', height=170)]

##  RDD에서 생성하기
RDD는 schema가 정해지지 않은 비구조적 데이터이다. 이와 같이 schema를 정의하지 않으면, Spark는 schema를 유추하게 된다.
- 자동인식
- Row 이용하여 Schema 
- Schema 직접 정의하고 생성

In [31]:
myList=[('1','kim, js',170), ('1','lee, sm', 175), 
        ('2','lim, yg',180), ('2','lee',170)]
#rdd 로 만듦
myRdd = spark.sparkContext.parallelize(myList)

In [None]:
# rdd - > dataframe
# 1. toDF()
# 2. 직접 createDataFrame()함수를 사용

In [34]:
# 1.
rddDf=myRdd.toDF()

In [35]:
rddDf.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: long (nullable = true)



In [36]:
# 2.
rddDf=spark.createDataFrame(myRdd)

In [37]:
rddDf.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: long (nullable = true)



## row를 이용하여 변환
학년 year는 앞에서 string 으로 인식되어있다. 이번예제에서는 형변환을 해본다. <br>
RDD 의 map() 함수를 사용하여 각 속성을 읽고 init() 함수로 형변환을 한다.
<br>
각 속성에 명칭, year. name, height를 설정한다.

In [38]:
from pyspark.sql import Row
#바꿀라면 map함수 사용
_myRdd=myRdd.map(lambda x:Row(year=int(x[0]), name=x[1], height=int(x[2])))

'''원래 row 사용할때
Person = Row('year','name', 'height')
'''
print(type(_myRdd))

<class 'pyspark.rdd.PipelinedRDD'>


In [39]:
_myDf=spark.createDataFrame(_myRdd)

_myDf.printSchema()

root
 |-- year: long (nullable = true)
 |-- name: string (nullable = true)
 |-- height: long (nullable = true)



In [40]:
_myDf.take(1)

[Row(year=1, name='kim, js', height=170)]

## Row() 사용하여 RDD 생성

In [41]:
from pyspark.sql import Row

r1=Row(name="js1", age=10)
r2=Row(name="js2", age=20)
_myRdd=spark.sparkContext.parallelize([r1,r2])

_myRdd.collect()

[Row(name='js1', age=10), Row(name='js2', age=20)]

## schema를 정의하고 생성
#### schema 정의하고 RDD 에서 DataFrame 을 생성할 수 있음
#### StructType을 선언하고, 컬럼에 대해 StructField를 컬럼명, 데이터 타잎, NULL 이 허용되는지 여부를 설정
#### 컬럼명을 정렬하지 않으므로, 순서대로 아래와 같이 생성하면 됨

In [42]:
#schema를 정해서 RDD 로부터 DataFrame 을 생성
from pyspark.sql.types import *

myRdd=spark.sparkContext.parallelize([(1, 'kim', 50.0), (2, 'lee', 60.0), (3, 'park', 70.0)])

schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("height", DoubleType(), True)
])
_myDf = spark.createDataFrame(myRdd, schema)

In [43]:
_myDf.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- height: double (nullable = true)



In [44]:
_myDf.show()

+---+----+------+
| id|name|height|
+---+----+------+
|  1| kim|  50.0|
|  2| lee|  60.0|
|  3|park|  70.0|
+---+----+------+



## Pandas

#### spark 는 분산처리 할때, 데이터 크기가 클때 사용.
#### pandas 는 데이터 크기가 작을때

- csv 파일 읽기 : DataFrame : read.json()
                 pandas : read_csv()
- 데이터 타입 : DataFrame : inferschema = True 설정하면 추정
                 pandas : 모두 strings

In [45]:
#Spark Dataframe을 toPandas() 함수를 사용하여 Pandas로 변환할 수 있다

myDf.toPandas()

Unnamed: 0,year,name,height
0,1,"kim, js",170
1,1,"lee, sm",175
2,2,"lim, yg",180
3,2,lee,170


In [46]:
#판다스에서 csv 쓰기
import os
myDf.write.format('com.databricks.spark.csv').save(os.path.join('data','_myDf.csv'))

In [47]:
!dir data\_myDf.csv\

 C 드라이브의 볼륨: Windows-SSD
 볼륨 일련 번호: C6A4-826F

 C:\Users\woojung\Desktop\AI_WORKS\02. SelfStudy\2. DL\BigData_실습\data\_myDf.csv 디렉터리

2020-10-19  오후 11:33    <DIR>          .
2020-10-19  오후 11:33    <DIR>          ..
2020-10-19  오후 11:33                12 .part-00000-fa85dcd7-2fe0-4a25-9568-dee2d94c764f-c000.csv.crc
2020-10-19  오후 11:33                 8 ._SUCCESS.crc
2020-10-19  오후 11:33                62 part-00000-fa85dcd7-2fe0-4a25-9568-dee2d94c764f-c000.csv
2020-10-19  오후 11:33                 0 _SUCCESS
               4개 파일                  82 바이트
               2개 디렉터리  53,928,091,648 바이트 남음


In [48]:
myDf.toPandas().to_csv(os.path.join('data','myDf.csv'))

In [49]:
# json 형식에서 pandas 만드는것
import pandas as pd
icc = pd.DataFrame( { 'country': ['South Korea','Japan','Hong Kong'],'codes': [81, 82, 852] })

In [50]:
icc

Unnamed: 0,country,codes
0,South Korea,81
1,Japan,82
2,Hong Kong,852


In [51]:
icc[icc['codes']==81]

Unnamed: 0,country,codes
0,South Korea,81


# A. csv 파일
- 1) RDD 에서 csv 파일 읽고 DataFrame
- 2) DataFrame으로 csv 파일 직접읽기

# B. tsv 파일 
- 1) RDD로 읽기 
- 2) DataFrame 읽기 
- split

# C. json 파일
- 1) pandas에서 읽기 
- 2) dataframe에서 읽기

In [52]:
# A. csv 파일
# 1) RDD로 csv 파일 읽기

In [55]:
from pyspark.sql import Row
cfile= os.path.join("s-master/data", "ds_spark_2cols.csv") #csv 파일 읽기 <class 'str'>

lines = spark.sparkContext.textFile(cfile) #lines 가 RDD 가되는것임 <class 'RDD'>
print(lines.collect())

['35, 2', '40, 27', '12, 38', '15, 31', '21, 1', '14, 19', '46, 1', '10, 34', '28, 3', '48, 1', '16, 2', '30, 3', '32, 2', '48, 1', '31, 2', '22, 1', '12, 3', '39, 29', '19, 37']


In [56]:
_col12 = lines.map(lambda l: l.split(","))
print(_col12.collect())

[['35', ' 2'], ['40', ' 27'], ['12', ' 38'], ['15', ' 31'], ['21', ' 1'], ['14', ' 19'], ['46', ' 1'], ['10', ' 34'], ['28', ' 3'], ['48', ' 1'], ['16', ' 2'], ['30', ' 3'], ['32', ' 2'], ['48', ' 1'], ['31', ' 2'], ['22', ' 1'], ['12', ' 3'], ['39', ' 29'], ['19', ' 37']]


In [57]:
col12 = _col12.map(lambda p: Row(col1=int(p[0].strip()), col2=int(p[1].strip())))
#strip() : , 빼줘야함 앞뒤 whitespace 빼줘야함
print(col12.collect())
_myDf = spark.createDataFrame(col12)

[Row(col1=35, col2=2), Row(col1=40, col2=27), Row(col1=12, col2=38), Row(col1=15, col2=31), Row(col1=21, col2=1), Row(col1=14, col2=19), Row(col1=46, col2=1), Row(col1=10, col2=34), Row(col1=28, col2=3), Row(col1=48, col2=1), Row(col1=16, col2=2), Row(col1=30, col2=3), Row(col1=32, col2=2), Row(col1=48, col2=1), Row(col1=31, col2=2), Row(col1=22, col2=1), Row(col1=12, col2=3), Row(col1=39, col2=29), Row(col1=19, col2=37)]


In [58]:
_myDf.printSchema()
#_myDf.collect()

root
 |-- col1: long (nullable = true)
 |-- col2: long (nullable = true)



In [59]:
%%writefile data/ds_spark.csv
1,2,3,4
11,22,33,44
111,222,333,444

Writing data/ds_spark.csv


In [60]:
#formatload
## format("csv").load("path")라고하고, options()설정을 미리 넣을 수 있다.
df = spark\
        .read\
        .format('com.databricks.spark.csv')\
        .options(header='true', inferschema='true', delimiter=',')\
        .load(os.path.join('data','ds_spark.csv'))

In [61]:
df.show()

+---+---+---+---+
|  1|  2|  3|  4|
+---+---+---+---+
| 11| 22| 33| 44|
|111|222|333|444|
+---+---+---+---+



In [62]:
df.printSchema()

root
 |-- 1: integer (nullable = true)
 |-- 2: integer (nullable = true)
 |-- 3: integer (nullable = true)
 |-- 4: integer (nullable = true)



In [63]:
#inferschema를 제외하면, string 으로 자동인식됨
df = spark\
        .read\
        .format('com.databricks.spark.csv')\
        .options(header='true', delimiter=',')\
        .load(os.path.join('data','ds_spark.csv'))

In [64]:
df.printSchema()

root
 |-- 1: string (nullable = true)
 |-- 2: string (nullable = true)
 |-- 3: string (nullable = true)
 |-- 4: string (nullable = true)



In [65]:
##csv 파일
df = spark\
        .read\
        .options(header='true', inferschema='true', delimiter=',')\
        .csv(os.path.join('data', 'ds_spark.csv'))
df.show()

+---+---+---+---+
|  1|  2|  3|  4|
+---+---+---+---+
| 11| 22| 33| 44|
|111|222|333|444|
+---+---+---+---+



In [66]:
##tsv 파일

##TAB은 whitespace이므로 그냥 split()을 해도 된다.
import numpy as np
np.array([float(x) for x in '1.658985	4.285136'.split()])

array([1.658985, 4.285136])

In [67]:
%%writefile data/ds_spark_heightweight.txt
1	65.78	112.99
2	71.52	136.49
3	69.40	153.03
4	68.22	142.34
5	67.79	144.30
6	68.70	123.30
7	69.80	141.49
8	70.01	136.46
9	67.90	112.37
10	66.78	120.67
11	66.49	127.45
12	67.62	114.14
13	68.30	125.61
14	67.12	122.46
15	68.28	116.09
16	71.09	140.00
17	66.46	129.50
18	68.65	142.97
19	71.23	137.90
20	67.13	124.04
21	67.83	141.28
22	68.88	143.54
23	63.48	97.90
24	68.42	129.50
25	67.63	141.85
26	67.21	129.72
27	70.84	142.42
28	67.49	131.55
29	66.53	108.33
30	65.44	113.89
31	69.52	103.30
32	65.81	120.75
33	67.82	125.79
34	70.60	136.22
35	71.80	140.10
36	69.21	128.75
37	66.80	141.80
38	67.66	121.23
39	67.81	131.35
40	64.05	106.71
41	68.57	124.36
42	65.18	124.86
43	69.66	139.67
44	67.97	137.37
45	65.98	106.45
46	68.67	128.76
47	66.88	145.68
48	67.70	116.82
49	69.82	143.62
50	69.09	134.93

Writing data/ds_spark_heightweight.txt


In [68]:
from pyspark.sql.types import *
_tRdd=spark.sparkContext\
    .textFile(os.path.join('data','ds_spark_heightweight.txt'))

In [69]:
#tRdd=rdd.map(lambda x:x.split('\t'))
_tRddSplitted = _tRdd.map(lambda x:x.split())

In [70]:
#import numpy as np
#myRdd=rdd.map(lambda line:np.array([float(x) for x in line.split('\t')]))
tRdd=_tRdd.map(lambda line:[float(x) for x in line.split('\t')])
tRdd.take(1)

[[1.0, 65.78, 112.99]]

In [71]:
#DataFrame
tDfNamed = spark.createDataFrame(tRdd, ["id","weight","height"])
tDfNamed.printSchema()

root
 |-- id: double (nullable = true)
 |-- weight: double (nullable = true)
 |-- height: double (nullable = true)



In [72]:
tDfNamed.take(1)

[Row(id=1.0, weight=65.78, height=112.99)]


pyspark.sql.functions은 함수이므로, import할 경우

- import pyspark.sql.functions.split 이렇게 하지 않고,
- from pyspark.sql.functions import split 이렇게 한다.

In [73]:
tDftxt = spark.read.text(os.path.join('data','ds_spark_heightweight.txt'))

In [74]:
tDftxt.printSchema()

root
 |-- value: string (nullable = true)



In [75]:
from pyspark.sql.functions import split

split_col = split(tDftxt['value'], '\t')

In [None]:
분리된 컬럼은 getItem() 함수로 가져와서 각 각 weight, height 컬럼이 된다.

In [76]:
split_col.getItem(1)

Column<b'split(value, \t, -1)[1]'>

In [77]:
tDftxt = tDftxt.withColumn('weight', split_col.getItem(1))
tDftxt = tDftxt.withColumn('height', split_col.getItem(2))

In [78]:
tDftxt.show()

+---------------+------+------+
|          value|weight|height|
+---------------+------+------+
| 1	65.78	112.99| 65.78|112.99|
| 2	71.52	136.49| 71.52|136.49|
| 3	69.40	153.03| 69.40|153.03|
| 4	68.22	142.34| 68.22|142.34|
| 5	67.79	144.30| 67.79|144.30|
| 6	68.70	123.30| 68.70|123.30|
| 7	69.80	141.49| 69.80|141.49|
| 8	70.01	136.46| 70.01|136.46|
| 9	67.90	112.37| 67.90|112.37|
|10	66.78	120.67| 66.78|120.67|
|11	66.49	127.45| 66.49|127.45|
|12	67.62	114.14| 67.62|114.14|
|13	68.30	125.61| 68.30|125.61|
|14	67.12	122.46| 67.12|122.46|
|15	68.28	116.09| 68.28|116.09|
|16	71.09	140.00| 71.09|140.00|
|17	66.46	129.50| 66.46|129.50|
|18	68.65	142.97| 68.65|142.97|
|19	71.23	137.90| 71.23|137.90|
|20	67.13	124.04| 67.13|124.04|
+---------------+------+------+
only showing top 20 rows



In [79]:
#csv 함수로 tsv 읽기
## 앞서 복잡한거 없이 '\t' 대신 이거 필요 이렇게 잘 쓰임
tDf = spark\
    .read\
    .options(header='false', inferschema='true', delimiter='\t')\
    .csv(os.path.join('data', 'ds_spark_heightweight.txt'))
tDf.show()

+---+-----+------+
|_c0|  _c1|   _c2|
+---+-----+------+
|  1|65.78|112.99|
|  2|71.52|136.49|
|  3| 69.4|153.03|
|  4|68.22|142.34|
|  5|67.79| 144.3|
|  6| 68.7| 123.3|
|  7| 69.8|141.49|
|  8|70.01|136.46|
|  9| 67.9|112.37|
| 10|66.78|120.67|
| 11|66.49|127.45|
| 12|67.62|114.14|
| 13| 68.3|125.61|
| 14|67.12|122.46|
| 15|68.28|116.09|
| 16|71.09| 140.0|
| 17|66.46| 129.5|
| 18|68.65|142.97|
| 19|71.23| 137.9|
| 20|67.13|124.04|
+---+-----+------+
only showing top 20 rows



# Json
```python
{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}
```

In [82]:
#%%writefile src/ds_twitter_seoul_3.json
#{"contributors": null, "truncated": false, "text": "RT @soompi: #SEVENTEEN’s Mingyu, Jin Se Yeon, And Leeteuk To MC For 2016 Super Seoul Dream Concert \nhttps://t.co/1XRSaRBbE0 https://t.co/fi…", "is_quote_status": false, "in_reply_to_status_id": null, "id": 801657325836763136, "favorite_count": 0, "entities": {"symbols": [], "user_mentions": [{"id": 17659206, "indices": [3, 10], "id_str": "17659206", "screen_name": "soompi", "name": "Soompi"}], "hashtags": [{"indices": [12, 22], "text": "SEVENTEEN"}], "urls": [{"url": "https://t.co/1XRSaRBbE0", "indices": [100, 123], "expanded_url": "http://www.soompi.com/2016/11/20/seventeens-mingyu-jin-se-yeon-leeteuk-mc-dream-concert/", "display_url": "soompi.com/2016/11/20/sev…"}]}, "retweeted": false, "coordinates": null, "source": "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter for Android</a>", "in_reply_to_screen_name": null, "in_reply_to_user_id": null, "retweet_count": 1487, "id_str": "801657325836763136", "favorited": false, "retweeted_status": {"contributors": null, "truncated": false, "text": "#SEVENTEEN’s Mingyu, Jin Se Yeon, And Leeteuk To MC For 2016 Super Seoul Dream Concert \nhttps://t.co/1XRSaRBbE0 https://t.co/fifXHpF8or", "is_quote_status": false, "in_reply_to_status_id": null, "id": 800593781586132993, "favorite_count": 1649, "entities": {"symbols": [], "user_mentions": [], "hashtags": [{"indices": [0, 10], "text": "SEVENTEEN"}], "urls": [{"url": "https://t.co/1XRSaRBbE0", "indices": [88, 111], "expanded_url": "http://www.soompi.com/2016/11/20/seventeens-mingyu-jin-se-yeon-leeteuk-mc-dream-concert/", "display_url": "soompi.com/2016/11/20/sev…"}], "media": [{"expanded_url": "https://twitter.com/soompi/status/800593781586132993/photo/1", "display_url": "pic.twitter.com/fifXHpF8or", "url": "https://t.co/fifXHpF8or", "media_url_https": "https://pbs.twimg.com/media/CxxHMk8UsAA4cUT.jpg", "id_str": "800593115165798400", "sizes": {"small": {"h": 382, "resize": "fit", "w": 680}, "large": {"h": 449, "resize": "fit", "w": 800}, "medium": {"h": 449, "resize": "fit", "w": 800}, "thumb": {"h": 150, "resize": "crop", "w": 150}}, "indices": [112, 135], "type": "photo", "id": 800593115165798400, "media_url": "http://pbs.twimg.com/media/CxxHMk8UsAA4cUT.jpg"}]}, "retweeted": false, "coordinates": null, "source": "<a href=\"https://about.twitter.com/products/tweetdeck\" rel=\"nofollow\">TweetDeck</a>", "in_reply_to_screen_name": null, "in_reply_to_user_id": null, "retweet_count": 1487, "id_str": "800593781586132993", "favorited": false, "user": {"follow_request_sent": false, "has_extended_profile": true, "profile_use_background_image": true, "default_profile_image": false, "id": 17659206, "profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/699864769/1cdde0a85f5c0a994ae1fb06d545a5ec.png", "verified": true, "translator_type": "none", "profile_text_color": "999999", "profile_image_url_https": "https://pbs.twimg.com/profile_images/792117259489583104/4khJk3zz_normal.jpg", "profile_sidebar_fill_color": "000000", "entities": {"url": {"urls": [{"url": "http://t.co/3evT80UlR9", "indices": [0, 22], "expanded_url": "http://www.soompi.com", "display_url": "soompi.com"}]}, "description": {"urls": []}}, "followers_count": 987867, "profile_sidebar_border_color": "000000", "id_str": "17659206", "profile_background_color": "1E1E1E", "listed_count": 3982, "is_translation_enabled": true, "utc_offset": -28800, "statuses_count": 80038, "description": "The original K-pop community. We take gifs, OTPs, and reporting on your bias' fashion choices seriously. But not rumors. Ain't nobody got time for that.", "friends_count": 3532, "location": "Worldwide", "profile_link_color": "31B6F4", "profile_image_url": "http://pbs.twimg.com/profile_images/792117259489583104/4khJk3zz_normal.jpg", "following": false, "geo_enabled": false, "profile_banner_url": "https://pbs.twimg.com/profile_banners/17659206/1478803767", "profile_background_image_url": "http://pbs.twimg.com/profile_background_images/699864769/1cdde0a85f5c0a994ae1fb06d545a5ec.png", "screen_name": "soompi", "lang": "en", "profile_background_tile": true, "favourites_count": 1493, "name": "Soompi", "notifications": false, "url": "http://t.co/3evT80UlR9", "created_at": "Wed Nov 26 20:48:27 +0000 2008", "contributors_enabled": false, "time_zone": "Pacific Time (US & Canada)", "protected": false, "default_profile": false, "is_translator": false}, "geo": null, "in_reply_to_user_id_str": null, "possibly_sensitive": false, "lang": "en", "created_at": "Mon Nov 21 06:56:46 +0000 2016", "in_reply_to_status_id_str": null, "place": null, "extended_entities": {"media": [{"expanded_url": "https://twitter.com/soompi/status/800593781586132993/photo/1", "display_url": "pic.twitter.com/fifXHpF8or", "url": "https://t.co/fifXHpF8or", "media_url_https": "https://pbs.twimg.com/media/CxxHMk8UsAA4cUT.jpg", "id_str": "800593115165798400", "sizes": {"small": {"h": 382, "resize": "fit", "w": 680}, "large": {"h": 449, "resize": "fit", "w": 800}, "medium": {"h": 449, "resize": "fit", "w": 800}, "thumb": {"h": 150, "resize": "crop", "w": 150}}, "indices": [112, 135], "type": "photo", "id": 800593115165798400, "media_url": "http://pbs.twimg.com/media/CxxHMk8UsAA4cUT.jpg"}]}, "metadata": {"iso_language_code": "en", "result_type": "recent"}}, "user": {"follow_request_sent": false, "has_extended_profile": false, "profile_use_background_image": true, "default_profile_image": true, "id": 791090169818521600, "profile_background_image_url_https": null, "verified": false, "translator_type": "none", "profile_text_color": "333333", "profile_image_url_https": "https://abs.twimg.com/sticky/default_profile_images/default_profile_6_normal.png", "profile_sidebar_fill_color": "DDEEF6", "entities": {"description": {"urls": []}}, "followers_count": 0, "profile_sidebar_border_color": "C0DEED", "id_str": "791090169818521600", "profile_background_color": "F5F8FA", "listed_count": 0, "is_translation_enabled": false, "utc_offset": null, "statuses_count": 96, "description": "", "friends_count": 7, "location": "", "profile_link_color": "1DA1F2", "profile_image_url": "http://abs.twimg.com/sticky/default_profile_images/default_profile_6_normal.png", "following": false, "geo_enabled": false, "profile_background_image_url": null, "screen_name": "enriquesanq", "lang": "es", "profile_background_tile": false, "favourites_count": 161, "name": "Enrique santos", "notifications": false, "url": null, "created_at": "Wed Oct 26 01:32:49 +0000 2016", "contributors_enabled": false, "time_zone": null, "protected": false, "default_profile": true, "is_translator": false}, "geo": null, "in_reply_to_user_id_str": null, "possibly_sensitive": false, "lang": "en", "created_at": "Thu Nov 24 05:22:55 +0000 2016", "in_reply_to_status_id_str": null, "place": null, "metadata": {"iso_language_code": "en", "result_type": "recent"}

In [84]:
# json 파일은 Json 이 아니라 문자열임
# 파일에서 읽은 후 json 으로 변형을 해주어야함
#파싱을 해줘야함
import os
_jfname=os.path.join('s-master/src','ds_twitter_seoul_3.json')
with open(_jfname, 'rb') as f:
    data = f.readlines()

In [85]:
#파싱 #json.loads(string의 s)
import json
data_json_str = json.loads(data[0])

In [87]:
jfile= os.path.join('s-master/src','ds_twitter_seoul_3.json')

tweetDf= spark.read.json(jfile)

In [88]:
tweetDf.printSchema()

root
 |-- contributors: string (nullable = true)
 |-- coordinates: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-- symbols: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- urls: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- display_url: string (nullable = true)
 |    |    |    |-- expanded_url: string (nullable = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- url: string (nullable = true)
 |    |-- user_mentions: array (nullable = true)
 |    |    |-- element: struct (containsNull 

#### tsv RDD로 data load 하기

In [89]:
#tsv의 RDD읽어보기
from pyspark.sql.types import *

_tRdd=spark.sparkContext\
    .textFile(os.path.join('s-master/data','ds_spark_heightweight.txt'))

tab 별 split | schema 설정 안해주면 string 으로 읽힘 <br>
schema 설정한다고해도 string  -> integer, double 로 형변환은 이루어지지 않는다.<br>
string 의 형변환을 명시적으로 해줘야한다.

In [90]:
#.map(lambda x:x.split('\t'))
_tRddSplitted = _tRdd.map(lambda x:x.split())
_tRddSplitted.take(1)

[['1', '65.78', '112.99']]