**Column Description**


| Column     | Type       | Description |
|--------  |---------  |: --------- |
| **BibNum** | Integer | Customer ID |
| **Title** | String | Whether the customer is a male or a female |
| **Author** | String | Whether the customer is a senior citizen or not (1, 0) |
| **ISBN** | String | Whether the customer has a partner or not (Yes, No) |
| **PublicationYear** | String | Whether the customer has dependents or not (Yes, No) |
| **Publisher** | String | Number of months the customer has stayed with the company |
| **Subjects** | String | Whether the customer has a phone service or not (Yes, No) |
| **ItemType** | String | Whether the customer has multiple lines or not (Yes, No, No phone service) |
| **ItemCollection** | String | Customer’s internet service provider (DSL, Fiber optic, No) |
| **FloatingItem** | String | Whether the customer has online security or not (Yes, No, No internet service) |
| **ItemLocation** | String | Whether the customer has online backup or not (Yes, No, No internet service) |
| **ReportDate** | String | Whether the customer has device protection or not (Yes, No, No internet service) |
| **ItemCount** | String | Whether the customer has tech support or not (Yes, No, No internet service) |



In [13]:
!pip install pandas

/usr/bin/sh: 1: pip: not found


In [49]:
from pyspark.sql.types import *
import numpy as np
from pyspark.sql.functions import isnan, when, count, col
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
#import pandas as pd

In [5]:
customSchema = StructType([
  StructField("BibNum", IntegerType(), True),
  StructField("Title", StringType(), True),
  StructField("Author", StringType(), True),
  StructField("ISBN", StringType(), True),
  StructField("PublicationYear", StringType(), True),
  StructField("Publisher", StringType(), True),
  StructField("Subjects", StringType(), True),
  StructField("ItemType", StringType(), True),
  StructField("ItemCollection", StringType(), True),
  StructField("FloatingItem", StringType(), True),
  StructField("ItemLocation", StringType(), True),
  StructField("ReportDate", StringType(), True),
  StructField("ItemCount", StringType(), True)]
)

In [8]:
df = spark.read.format("csv")\
.schema(customSchema)\
.load("library-collection-inventory-sample.csv")
df.dataframeName='seattle-library-sample'
df.cache()

DataFrame[BibNum: int, Title: string, Author: string, ISBN: string, PublicationYear: string, Publisher: string, Subjects: string, ItemType: string, ItemCollection: string, FloatingItem: string, ItemLocation: string, ReportDate: string, ItemCount: string]

In [9]:
df.printSchema()
cols = df.columns
df.count()

root
 |-- BibNum: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Author: string (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- PublicationYear: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- Subjects: string (nullable = true)
 |-- ItemType: string (nullable = true)
 |-- ItemCollection: string (nullable = true)
 |-- FloatingItem: string (nullable = true)
 |-- ItemLocation: string (nullable = true)
 |-- ReportDate: string (nullable = true)
 |-- ItemCount: string (nullable = true)



355807

In [10]:
df.show(2)

+------+--------------------+------------------+----+---------------+-----------+-----------+--------+--------------+------------+------------+--------------------+---------+
|BibNum|               Title|            Author|ISBN|PublicationYear|  Publisher|   Subjects|ItemType|ItemCollection|FloatingItem|ItemLocation|          ReportDate|ItemCount|
+------+--------------------+------------------+----+---------------+-----------+-----------+--------+--------------+------------+------------+--------------------+---------+
|  4750|My maternal ances...|Wood, Melba, 1913-|null|   [1968]-1987.|   M. Wood,|Hale family|    arbk|         caref|          NA|         cen|2017-09-01T00:00:...|        1|
|  6720|The White House m...|    Feiffer, Jules|null|         [1970]|Grove Press|       null|    acbk|          canf|          NA|         cen|2017-09-01T00:00:...|        1|
+------+--------------------+------------------+----+---------------+-----------+-----------+--------+--------------+--------

In [11]:
pd.DataFrame(df.take(5), columns=cols)

NameError: name 'pd' is not defined

In [14]:
df.take(5)

[Row(BibNum=4750, Title='My maternal ancestry : Dean, Matlock, Hale, Gahr families (in Tennessee and Missouri) / by Melba Wood.', Author='Wood, Melba, 1913-', ISBN=None, PublicationYear='[1968]-1987.', Publisher='M. Wood,', Subjects='Hale family', ItemType='arbk', ItemCollection='caref', FloatingItem='NA', ItemLocation='cen', ReportDate='2017-09-01T00:00:00.000', ItemCount='1'),
 Row(BibNum=6720, Title='The White House murder case: a play in two acts & Dick and Jane: a one-act play.', Author='Feiffer, Jules', ISBN=None, PublicationYear='[1970]', Publisher='Grove Press', Subjects=None, ItemType='acbk', ItemCollection='canf', FloatingItem='NA', ItemLocation='cen', ReportDate='2017-09-01T00:00:00.000', ItemCount='1'),
 Row(BibNum=6845, Title='Autumn of glory; the Army of Tennessee, 1862-1865.', Author='Connelly, Thomas Lawrence', ISBN='0807104450', PublicationYear='[1971]', Publisher='Louisiana State University Press', Subjects='Confederate States of America Army Department of Tennessee, 

In [16]:
featureColumns = ['Author','Publisher','ItemType','ItemCollection',
 'ItemLocation']

In [17]:
responses = df.groupBy('Subjects').count().collect()

In [22]:
categories = [i[0] for i in responses]
counts = [i[1] for i in responses]
 
ind = np.array(range(len(categories)))
width = 0.35
plt.bar(ind, counts, width=width, color='r')
 
plt.ylabel('counts')
plt.title('Stroke')
plt.xticks(ind + width/2., categories)

NameError: name 'plt' is not defined

In [27]:
df.select("Subjects").distinct().count()

197066

In [28]:
df.take(50)

[Row(BibNum=4750, Title='My maternal ancestry : Dean, Matlock, Hale, Gahr families (in Tennessee and Missouri) / by Melba Wood.', Author='Wood, Melba, 1913-', ISBN=None, PublicationYear='[1968]-1987.', Publisher='M. Wood,', Subjects='Hale family', ItemType='arbk', ItemCollection='caref', FloatingItem='NA', ItemLocation='cen', ReportDate='2017-09-01T00:00:00.000', ItemCount='1'),
 Row(BibNum=6720, Title='The White House murder case: a play in two acts & Dick and Jane: a one-act play.', Author='Feiffer, Jules', ISBN=None, PublicationYear='[1970]', Publisher='Grove Press', Subjects=None, ItemType='acbk', ItemCollection='canf', FloatingItem='NA', ItemLocation='cen', ReportDate='2017-09-01T00:00:00.000', ItemCount='1'),
 Row(BibNum=6845, Title='Autumn of glory; the Army of Tennessee, 1862-1865.', Author='Connelly, Thomas Lawrence', ISBN='0807104450', PublicationYear='[1971]', Publisher='Louisiana State University Press', Subjects='Confederate States of America Army Department of Tennessee, 

In [41]:
#Count nulls
df.select([count(when(isnan('Subjects') | col('Subjects').isNull() , True))]).show()
df.select([count(when(isnan('ItemType') | col('ItemType').isNull() , True))]).show()
df.select([count(when(isnan('ItemCollection') | col('ItemCollection').isNull() , True))]).show()

+----------------------------------------------------------------------+
|count(CASE WHEN (isnan(Subjects) OR (Subjects IS NULL)) THEN true END)|
+----------------------------------------------------------------------+
|                                                                     0|
+----------------------------------------------------------------------+

+----------------------------------------------------------------------+
|count(CASE WHEN (isnan(ItemType) OR (ItemType IS NULL)) THEN true END)|
+----------------------------------------------------------------------+
|                                                                   121|
+----------------------------------------------------------------------+

+----------------------------------------------------------------------------------+
|count(CASE WHEN (isnan(ItemCollection) OR (ItemCollection IS NULL)) THEN true END)|
+----------------------------------------------------------------------------------+
|            

In [36]:
x = df.count()

In [37]:
x - 8322

347485

In [42]:
df = df.na.drop(subset=["Subjects"]) ## Drop null values
df = df.na.drop(subset=["ItemType"]) ## Drop null values
df = df.na.drop(subset=["ItemCollection"]) ## Drop null values

In [43]:
df.count()

347352

In [47]:
item_collection_indexer = StringIndexer(inputCol="ItemCollection", outputCol="ItemCollectionIndex")#Fits a model to the input dataset with optional parameters.
df1 = item_collection_indexer.fit(df).transform(df)
df1.show()

+------+--------------------+--------------------+-------------+----------------+--------------------+--------------------+--------+--------------+------------+------------+--------------------+---------+-------------------+
|BibNum|               Title|              Author|         ISBN| PublicationYear|           Publisher|            Subjects|ItemType|ItemCollection|FloatingItem|ItemLocation|          ReportDate|ItemCount|ItemCollectionIndex|
+------+--------------------+--------------------+-------------+----------------+--------------------+--------------------+--------+--------------+------------+------------+--------------------+---------+-------------------+
|  4750|My maternal ances...|  Wood, Melba, 1913-|         null|    [1968]-1987.|            M. Wood,|         Hale family|    arbk|         caref|          NA|         cen|2017-09-01T00:00:...|        1|                7.0|
|  6845|Autumn of glory; ...|Connelly, Thomas ...|   0807104450|          [1971]|Louisiana State U..

In [45]:
#encoder = OneHotEncoder()\
#.setInputCol("ItemCollection")\
#.setOutputCol("ItemCollection")
#encoder.transform(df.select("ItemCollection")).show()

In [48]:
#onehotencoder to qualificationIndex
onehotencoder_item_collection_vector = OneHotEncoder(inputCol="ItemCollectionIndex", outputCol="ItemCollection_vec")
df11 = onehotencoder_item_collection_vector.fit(df1).transform(df1)
df11.show()

+------+--------------------+--------------------+-------------+----------------+--------------------+--------------------+--------+--------------+------------+------------+--------------------+---------+-------------------+------------------+
|BibNum|               Title|              Author|         ISBN| PublicationYear|           Publisher|            Subjects|ItemType|ItemCollection|FloatingItem|ItemLocation|          ReportDate|ItemCount|ItemCollectionIndex|ItemCollection_vec|
+------+--------------------+--------------------+-------------+----------------+--------------------+--------------------+--------+--------------+------------+------------+--------------------+---------+-------------------+------------------+
|  4750|My maternal ances...|  Wood, Melba, 1913-|         null|    [1968]-1987.|            M. Wood,|         Hale family|    arbk|         caref|          NA|         cen|2017-09-01T00:00:...|        1|                7.0|   (547,[7],[1.0])|
|  6845|Autumn of glory;

In [50]:
#Create pipeline and pass all stages
pipeline = Pipeline(stages=[item_collection_indexer,
                            onehotencoder_item_collection_vector
                    ])

In [51]:
df_transformed = pipeline.fit(df).transform(df)
df_transformed.show()

+------+--------------------+--------------------+-------------+----------------+--------------------+--------------------+--------+--------------+------------+------------+--------------------+---------+-------------------+------------------+
|BibNum|               Title|              Author|         ISBN| PublicationYear|           Publisher|            Subjects|ItemType|ItemCollection|FloatingItem|ItemLocation|          ReportDate|ItemCount|ItemCollectionIndex|ItemCollection_vec|
+------+--------------------+--------------------+-------------+----------------+--------------------+--------------------+--------+--------------+------------+------------+--------------------+---------+-------------------+------------------+
|  4750|My maternal ances...|  Wood, Melba, 1913-|         null|    [1968]-1987.|            M. Wood,|         Hale family|    arbk|         caref|          NA|         cen|2017-09-01T00:00:...|        1|                7.0|   (547,[7],[1.0])|
|  6845|Autumn of glory;

In [54]:
df_transformed.select("ItemCollectionIndex").show()

+-------------------+
|ItemCollectionIndex|
+-------------------+
|                7.0|
|                0.0|
|                7.0|
|               17.0|
|                0.0|
|                7.0|
|              103.0|
|               55.0|
|               15.0|
|                0.0|
|               20.0|
|                2.0|
|                2.0|
|               85.0|
|               15.0|
|               52.0|
|                7.0|
|                0.0|
|                7.0|
|               27.0|
+-------------------+
only showing top 20 rows



In [None]:
#from pyspark.ml.feature import RFormula
#supervised = RFormula(formula="label ~ . + Churn:TotalCharges + Churn:MonthlyCharges + Churn:SeniorCitizen")