# Analysis of C-Class dataset in PySpark

## Loading the data

In [1]:
#importing necessary packages
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, IntegerType, DecimalType

In [2]:
spark = SparkSession.builder.getOrCreate()

In [3]:
schema = StructType([
    StructField('model', StringType()),
    StructField('year', IntegerType()),
    StructField('price', IntegerType()),
    StructField('transmission', StringType()),
    StructField('mileage', IntegerType()),
    StructField('fuelType', StringType()),
    StructField('engineSize', DecimalType())
])

df = spark.read.csv('cclass.csv', header = True, schema = schema)

In [4]:
df.show()

+--------+----+-----+------------+-------+--------+----------+
|   model|year|price|transmission|mileage|fuelType|engineSize|
+--------+----+-----+------------+-------+--------+----------+
| C Class|2020|30495|   Automatic|   1200|  Diesel|         2|
| C Class|2020|29989|   Automatic|   1000|  Petrol|         2|
| C Class|2020|37899|   Automatic|    500|  Diesel|         2|
| C Class|2019|30399|   Automatic|   5000|  Diesel|         2|
| C Class|2019|29899|   Automatic|   4500|  Diesel|         2|
| C Class|2020|30999|   Automatic|   1000|  Diesel|         2|
| C Class|2020|35999|   Automatic|    500|  Diesel|         2|
| C Class|2019|37990|   Automatic|   1412|  Petrol|         3|
| C Class|2019|28990|   Automatic|   3569|  Diesel|         2|
| C Class|2019|28990|   Automatic|   3635|  Diesel|         2|
| C Class|2013| 9995|   Automatic|  44900|  Petrol|         2|
| C Class|2012| 6995|   Automatic|  88200|  Diesel|         2|
| C Class|2012| 7495|   Automatic| 115000|  Diesel|    

In [5]:
df.dtypes

[('model', 'string'),
 ('year', 'int'),
 ('price', 'int'),
 ('transmission', 'string'),
 ('mileage', 'int'),
 ('fuelType', 'string'),
 ('engineSize', 'decimal(10,0)')]

In [6]:
#examining Missing values
df.select([F.count(F.when(F.isnan(c), c)).alias(c)for c in df.columns]).show()

+-----+----+-----+------------+-------+--------+----------+
|model|year|price|transmission|mileage|fuelType|engineSize|
+-----+----+-----+------------+-------+--------+----------+
|    0|   0|    0|           0|      0|       0|         0|
+-----+----+-----+------------+-------+--------+----------+



In [7]:
df.count()

3899

In [8]:
df.summary().show()

+-------+--------+------------------+-----------------+------------+------------------+--------+-------------------+
|summary|   model|              year|            price|transmission|           mileage|fuelType|         engineSize|
+-------+--------+------------------+-----------------+------------+------------------+--------+-------------------+
|  count|    3899|              3899|             3899|        3899|              3899|    3899|               3899|
|   mean|    null|2017.3385483457296|23674.28699666581|        null|22395.709156193894|    null|             2.1036|
| stddev|    null|2.2134156573374724| 8960.21821842348|        null|22630.438425876873|    null|0.41648280841854707|
|    min| C Class|              1991|             1290|   Automatic|                 1|  Diesel|                  0|
|    25%|    null|              2016|            17690|        null|              6000|    null|                2.0|
|    50%|    null|              2018|            22980|        n

In [None]:
df2 = df.toPandas()

In [None]:
#importing neccesary packages


### Dropping model column

In [12]:
df = df.drop('model')

In [13]:
df.show()

+----+-----+------------+-------+--------+----------+
|year|price|transmission|mileage|fuelType|engineSize|
+----+-----+------------+-------+--------+----------+
|2020|30495|   Automatic|   1200|  Diesel|         2|
|2020|29989|   Automatic|   1000|  Petrol|         2|
|2020|37899|   Automatic|    500|  Diesel|         2|
|2019|30399|   Automatic|   5000|  Diesel|         2|
|2019|29899|   Automatic|   4500|  Diesel|         2|
|2020|30999|   Automatic|   1000|  Diesel|         2|
|2020|35999|   Automatic|    500|  Diesel|         2|
|2019|37990|   Automatic|   1412|  Petrol|         3|
|2019|28990|   Automatic|   3569|  Diesel|         2|
|2019|28990|   Automatic|   3635|  Diesel|         2|
|2013| 9995|   Automatic|  44900|  Petrol|         2|
|2012| 6995|   Automatic|  88200|  Diesel|         2|
|2012| 7495|   Automatic| 115000|  Diesel|         2|
|2011| 8995|   Automatic|  69250|  Diesel|         2|
|2015|14995|   Automatic|  49850|  Diesel|         2|
|2013| 8595|   Automatic|  8

### Examining Skewness

In [None]:
df.agg({'price':'skewness'}).show()

#If between -0.5 and 0.5 it is fairly symmetrical
# if between -1 and 1 it is moderately skwewd
#if between smaller than -1 and biger than 1 then highly skwewd


In [None]:
#if positively sweked, log transformation can help
df = df.withColumn('price_log', F.log10(F.col('price')))

In [None]:
df.show()

## Normalization of Data

In [None]:
#Min Max Scaler
from pyspark.ml.feature import MinMaxScaler

#First VectorAssembler
#Then MinMaxScaler
#pipelining
# Iterating over columns to be scaled
for i in ["Revenue","No_of_Days"]:
    # VectorAssembler Transformation - Converting column to vector type
    assembler = VectorAssembler(inputCols=[i],outputCol=i+"_Vect")

    # MinMaxScaler Transformation
    scaler = MinMaxScaler(inputCol=i+"_Vect", outputCol=i+"_Scaled")

    # Pipeline of VectorAssembler and MinMaxScaler
    pipeline = Pipeline(stages=[assembler, scaler])

    # Fitting pipeline on dataframe
    df = pipeline.fit(df).transform(df).withColumn(i+"_Scaled", unlist(i+"_Scaled")).drop(i+"_Vect")


In [None]:
#Fabi Normalization function

# Function to normalise dataframes
def standardize_train_test_data(train_df, test_df, columns):
    '''
    Add normalised columns to the input dataframe.
    formula = [(X - mean) / std_dev]
    Inputs : training dataframe, list of column name strings to be normalised
    Returns : dataframe with new normalised columns, averages and std deviation dataframes 
    '''
    # Find the Mean and the Standard Deviation for each column
    aggExpr = []
    aggStd = []
    for column in columns:
        aggExpr.append(mean(train_df[column]).alias(column))
        aggStd.append(stddev(train_df[column]).alias(column + '_stddev'))
    
    averages = train_df.agg(*aggExpr).collect()[0]
    std_devs = train_df.agg(*aggStd).collect()[0]
    
    # Standardise each dataframe, column by column
    for column in columns:            
        # Standardise the TRAINING data
        train_df = train_df.withColumn(column + '_norm', ((train_df[column] - averages[column]) / 
                                                              std_devs[column + '_stddev']))       
    
        # Standardise the TEST data (using the training mean and std_dev)     
        test_df = test_df.withColumn(column + '_norm', ((test_df[column] - averages[column]) / 
                                                              std_devs[column + '_stddev']))  
    return train_df, test_df, averages, std_devs


### One Hot Encoding of categorical variables

In [19]:
#importing necessary packages
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline


In [21]:
train, test = df.randomSplit([0.8, 0.2], seed = 42)

In [22]:
#Converting categorical strings to index values
indexer_trans = StringIndexer(inputCol = 'transmission', outputCol = 'trans_idx')
indexer_fuel = StringIndexer(inputCol = 'fuelType', outputCol = 'fuel_idx')


#One-Hot encode index values
onehot = OneHotEncoder(
    inputCols = ['trans,idx, fuel_idx'],
    outputCols = ['trans_dummy', 'fuel_dummy']
)

#Assemble predictors into a single column
assembler = VectorAssembler(inputCols = ['year', 'mileage', 'engineSize', 'trans_dummy', 'fuel_dummy'], outputCol = 'features')

#Linear regression object
regression = LinearRegression(labelCol = 'price')

#Constructing pipeline
pipeline = Pipeline(stages = [indexer_trans, indexer_fuel, onehot, assembler, regression])

#Training pipeline on training data
pipeline = pipeline.fit(train)

#Making predictions on testing data
predictions = pipeline.transform(test)




IllegalArgumentException: requirement failed: The number of input columns 1 must be the same as the number of output columns 2.