In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('mushroom').master('local[1]').getOrCreate()

### 导入数据并确定数据类型

In [10]:
df0 = spark.read.csv('file:///home/ffzs/python-projects/learn_spark/mushrooms.csv', header=True, inferSchema=True, encoding='utf-8')
len(df0.columns)

23

In [14]:
df0.select('cap-shape').distinct().show()

+---------+
|cap-shape|
+---------+
|        x|
|        f|
|        k|
|        c|
|        b|
|        s|
+---------+



In [5]:
label = df0.rdd.map(lambda row: row[0])
row = df0.rdd.map(lambda row: row[1:])

In [6]:
dfi = label.map(lambda m: 0.0 if m=='p' else 1.0).zip(row.map(lambda x: list(x))).toDF(schema=['label','row'])

In [7]:
dfi.first()

Row(label=0.0, row=['b', 'y', 'y', 't', 'l', 'f', 'c', 'b', 'n', 'e', 'c', 's', 's', 'w', 'w', 'p', 'w', 'o', 'p', 'n', 's', 'm'])

In [15]:
# from pyspark.ml.feature import VectorAssembler
# vecAss = VectorAssembler(inputCols=df0.columns[1:], outputCol='feature')
# df0 = vecAss.transform(df0)

In [16]:
from pyspark.ml.feature import CountVectorizer
import numpy as np
from numpy import allclose
cv = CountVectorizer(inputCol='row', outputCol='vectors')
model = cv.fit(dfi)
tf = model.transform(dfi)

In [17]:
tf.take(1)

[Row(label=0.0, row=['x', 's', 'n', 't', 'p', 'f', 'c', 'n', 'k', 'e', 'e', 's', 's', 'w', 'w', 'p', 'w', 'o', 'p', 'k', 's', 'u'], vectors=SparseVector(24, {0: 3.0, 1: 1.0, 2: 3.0, 3: 4.0, 4: 2.0, 6: 2.0, 7: 1.0, 8: 2.0, 9: 1.0, 10: 1.0, 15: 1.0, 20: 1.0}))]

In [19]:
(train_data, test_data) = tf.randomSplit([0.8, 0.2])

In [20]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(numTrees=40, maxDepth=20, labelCol="label", featuresCol='vectors')
model = rf.fit(train_data)
model.featureImportances

SparseVector(24, {0: 0.0532, 1: 0.0375, 2: 0.0577, 3: 0.0947, 4: 0.064, 5: 0.0519, 6: 0.0436, 7: 0.022, 8: 0.0487, 9: 0.0411, 10: 0.0427, 11: 0.0299, 12: 0.0552, 13: 0.0683, 14: 0.0247, 15: 0.0164, 16: 0.0247, 17: 0.072, 18: 0.0844, 19: 0.0326, 20: 0.0135, 21: 0.0045, 22: 0.0132, 23: 0.0033})

In [32]:
result = model.transform(test_data)

In [43]:
result.select('prediction').show(5)

+----------+
|prediction|
+----------+
|       0.0|
|       0.0|
|       1.0|
|       1.0|
|       1.0|
+----------+
only showing top 5 rows



In [34]:
result.show(3)

+-----+--------------------+--------------------+--------------------+--------------------+----------+
|label|                 row|             vectors|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+--------------------+----------+
|  0.0|[b, e, e, ?, s, s...|(24,[0,1,3,5,6,7,...|[28.4161036920659...|[0.71040259230164...|       0.0|
|  0.0|[b, f, y, f, f, f...|(24,[0,1,2,5,6,7,...|[37.1750915750915...|[0.92937728937728...|       0.0|
|  0.0|[b, n, w, f, n, f...|(24,[0,1,2,4,5,6,...|[4.02235172235172...|[0.10055879305879...|       1.0|
+-----+--------------------+--------------------+--------------------+--------------------+----------+
only showing top 3 rows



In [36]:
result.rdd.map(lambda row:1 if row.label == row.prediction else 0).sum()

1287

In [45]:
result.rdd.map(lambda row:1 if row.label == row.prediction else 0).sum()/result.count()

0.8880822746521476

In [14]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn import cross_validation
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import cross_val_score



In [15]:
dfp = tf.toPandas()

In [16]:
dfp.head(2)

Unnamed: 0,label,row,vectors
0,0.0,"[�, s, �, t, �, f, c, n, �, e, �, �, s, �, �, ...","(0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 1.0, 1.0, 0.0, ..."
1,1.0,"[x, s, y, t, a, f, c, b, k, e, c, s, s, w, w, ...","(3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 1.0, 1.0, 1.0, ..."


In [17]:
clf = RandomForestClassifier(random_state=22, n_estimators = 30, min_samples_split=3, min_samples_leaf=2)

In [18]:
X = dfp['vectors'].tolist()

In [19]:
y = dfp['label'].tolist()

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

In [21]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=22, verbose=0, warm_start=False)

In [23]:
print(clf.score(X_test, y_test))

0.9218461538461539


In [32]:
scores = cross_val_score(clf, X, y, cv=10)

In [33]:
scores.mean()

0.8905588981998195

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('mushroom').getOrCreate()

In [32]:
df = spark.read.csv('file:///home/ffzs/python-projects/learn_spark/stock.csv',encoding='gbk',header=True, inferSchema=True)

In [33]:
df.dtypes

[('日期', 'timestamp'),
 ('股票代码', 'string'),
 ('名称', 'string'),
 ('收盘价', 'double'),
 ('最高价', 'double'),
 ('最低价', 'double'),
 ('开盘价', 'double'),
 ('前收盘', 'double'),
 ('涨跌额', 'string'),
 ('涨跌幅', 'string'),
 ('换手率', 'double'),
 ('成交量', 'int'),
 ('成交金额', 'double'),
 ('总市值', 'double'),
 ('流通市值', 'double')]

In [34]:
# from pyspark.sql.types import StructType, StructField, LongType, StringType, DateType ,DoubleType # 导入类型
# schema = StructType([
#     StructField("日期", DateType(), True),
#     StructField("收盘价", DoubleType(), True),
#     StructField("成交量", LongType(), True),
#     StructField("名称", StringType(), True)
# ])

In [35]:
df.write.csv(path='hdfs:///user/csv/stock.csv', header=True, sep=",", mode='overwrite')

In [49]:
df.columns[1]

'股票代码'

In [61]:
df0 = spark.read.jdbc(url="jdbc:mysql://localhost:3306/test?user=root&password=666666", table="mashroom")

In [63]:
df0.count()

8124

In [64]:
df0.write.jdbc(url="jdbc:mysql://localhost:3306/test?user=root&password=666666&useUnicode=true&characterEncoding=GBK",
              mode="overwrite",
              table="test",
              properties={"driver":'com.mysql.jdbc.Driver'})

In [65]:
spark.stop()

In [68]:
spark = SparkSession.builder.enableHiveSupport().master("local[*]").appName("read_hive").getOrCreate()

df=spark.sql("select * from age")
df.show()

+--------------+------+
|       country|median|
+--------------+------+
|   New Zealand|  39.0|
|         Spain|  37.0|
|       Ireland|  35.0|
|        Sweden|  34.0|
|         Italy|  34.0|
|        Norway|  34.0|
|       Denmark|  34.0|
|        Israel|  34.0|
|     Australia|  34.0|
|   Netherlands|  34.0|
|     Argentina|  33.5|
|        Canada|  33.5|
|       Belgium|  33.0|
|   Switzerland|  33.0|
|         Japan|  33.0|
|United Kingdom|  33.0|
| United States|  32.0|
|      Portugal|  32.0|
|       Romania|  32.0|
|       Germany|  31.0|
+--------------+------+
only showing top 20 rows



In [87]:
spark.sql('create table if not exists age2(name string, num int)')
#df0.write.mode("overwrite").insertInto("age2")

DataFrame[]

In [80]:
spark.sql('show tables').show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
| default|      age|      false|
| default|     age2|      false|
| default|  country|      false|
| default|       qn|      false|
+--------+---------+-----------+



In [81]:
df.write.mode("overwrite").insertInto("age2")

In [86]:
spark.sql('select * from age2 sort by num limit 10 ').show()

+-----------+---+
|       name|num|
+-----------+---+
|New Zealand| 39|
|      Spain| 37|
|    Ireland| 35|
|     Sweden| 34|
|      Italy| 34|
|     Norway| 34|
|    Denmark| 34|
|     Israel| 34|
|  Australia| 34|
|Netherlands| 34|
+-----------+---+



In [18]:
spark.stop()