In [1]:
import getpass
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
df_iris = pd.read_csv("data/iris.csv")
df_iris.shape

(150, 5)

In [3]:
df_iris.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [4]:
Label = LabelEncoder()

Label.fit(df_iris['variety'])
df_iris['variety'] = Label.transform(df_iris['variety'])

In [5]:
df_iris.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [6]:
df_iris.variety.value_counts()

0    50
1    50
2    50
Name: variety, dtype: int64

In [7]:
df_iris.dtypes

sepal.length    float64
sepal.width     float64
petal.length    float64
petal.width     float64
variety           int64
dtype: object

In [None]:
#df_iris['variety'] = df_iris.variety.apply(lambda x: "'"+x+"'")

In [8]:
import mysql.connector

In [9]:
# input password

print("Enter password:")
password = getpass.getpass()

Enter password:
········


In [10]:
db_connection = mysql.connector.connect(host = '127.0.0.1',port = 3306, user = 'root', 
                                        passwd = password,db = 'mysql')
#db_cursor = db_connection.cursor()
#db_cursor.execute("CREATE DATABASE TestDB;")
#db_cursor.execute("USE TestDB;")

In [11]:
db_cursor = db_connection.cursor()
db_cursor.execute("CREATE DATABASE TestDB;")
db_cursor.execute("USE TestDB;")

In [12]:
# First time create it

db_cursor.execute("CREATE TABLE iris(sepal_length DECIMAL(2,1) NOT NULL, \
                   sepal_width DECIMAL(2,1) NOT NULL, petal_length DECIMAL(2,1) NOT NULL, \
                   petal_width DECIMAL(2,1), species INT);")

In [13]:
iris_tuples = list(df_iris.itertuples(index=False, name=None))
iris_tuples_string = ",".join(["(" + ",".join([str(w) for w in wt]) + ")" for wt in iris_tuples])

In [14]:
db_cursor.execute("INSERT INTO iris(sepal_length, sepal_width, petal_length, \
                   petal_width, species) VALUES " + iris_tuples_string + ";")


db_cursor.execute("FLUSH TABLES;")

In [15]:
db_cursor

<mysql.connector.cursor.MySQLCursor at 0x7f872c27e190>

In [16]:
# Execute query
sql = "SELECT * FROM iris"
db_cursor.execute(sql)

# Fetch all the records
tuples = db_cursor.fetchall()

# Preparing list of columns for dataframe

cols = list(df_iris.columns)
df_temp = pd.DataFrame(tuples,columns=cols)

print(df_temp.head(10))

  sepal.length sepal.width petal.length petal.width  variety
0          5.1         3.5          1.4         0.2        0
1          4.9         3.0          1.4         0.2        0
2          4.7         3.2          1.3         0.2        0
3          4.6         3.1          1.5         0.2        0
4          5.0         3.6          1.4         0.2        0
5          5.4         3.9          1.7         0.4        0
6          4.6         3.4          1.4         0.3        0
7          5.0         3.4          1.5         0.2        0
8          4.4         2.9          1.4         0.2        0
9          4.9         3.1          1.5         0.1        0


In [17]:
from pyspark.sql import SparkSession

In [18]:
spark = SparkSession.builder.config("spark.jars", "/usr/share/java/mysql-connector-java-8.0.22.jar") \
    .master("local").appName("PySpark_MySQL_test").getOrCreate()

In [19]:
iris_df = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/TestDB") \
         .option("driver", "com.mysql.jdbc.Driver").option("dbtable", "iris") \
         .option("user", "root").option("password", password).load()

In [20]:
iris_df

DataFrame[sepal_length: decimal(2,1), sepal_width: decimal(2,1), petal_length: decimal(2,1), petal_width: decimal(2,1), species: int]

In [21]:
from pyspark.ml.feature import VectorAssembler

train_df, test_df = iris_df.randomSplit([.8, .2], seed=12345)
predictors = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
vec_assembler = VectorAssembler(inputCols=predictors, outputCol="features")
vec_train_df = vec_assembler.transform(train_df)
vec_train_df.select("features", "species").show(5)

+-----------------+-------+
|         features|species|
+-----------------+-------+
|[4.3,3.0,1.1,0.1]|      0|
|[4.4,2.9,1.4,0.2]|      0|
|[4.4,3.0,1.3,0.2]|      0|
|[4.4,3.2,1.3,0.2]|      0|
|[4.5,2.3,1.3,0.3]|      0|
+-----------------+-------+
only showing top 5 rows



In [22]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="species", featuresCol="features")
lr_model = lr.fit(vec_train_df)
vec_test_df = vec_assembler.transform(test_df)
predictions = lr_model.transform(vec_test_df)

In [23]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[vec_assembler, lr])
pipeline_model = pipeline.fit(train_df)
predictions = pipeline_model.transform(test_df)

In [26]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="species")
evaluator.evaluate(predictions)

1.0