# Analysis of Events per Drive and Feature Engineering in Pandas
This project aims to turn the signals gathered from the vehicle into features and prepare the data for analysis. Finally, ML models shall be build and insights generated.

## Importing Data

In [None]:
#importing necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [30]:
#importing dataframe
df = pd.read_csv('Events_per_drive.csv')
df.head()



NameError: name 'pd' is not defined

In [None]:
df.isnull().any().any()

## Aggregating the data

In [None]:
#defining function to sum up values of the features in a dataframe grouped by a column and return an aggregated dataframe
def aggregate_sum(dataframe, column_name):
    df = dataframe.groupby([column_name]).sum()

    return df


#defining function to calculate ratios of values in features in a df by column -- I BUILD THIS MYSELF HAHA, TOOK 1 HOUR
def aggregate_ratio(dataframe, column_name):

    #creating placeholder list
    placeholder_lst = []

    #looping over unique values of the selected column
    for item in dataframe[column_name].unique():

        #filtering for unique value in df and storing in new df
        df = dataframe.loc[:,:][dataframe.loc[:, column_name] == item]

        placeholder_lst2 = []

        #looping over columns
        for column in df.columns:


            if column == column_name:
                placeholder_lst2.append(item)

            else:

                #calculating ratio for values that are higher than 0
                numerator = (df[column] > 0).value_counts()[1]
                denominator = df[column].count()
                ratio = round((numerator / denominator), 2)
                placeholder_lst2.append(ratio)

        placeholder_lst.append(placeholder_lst2)

    data_final = pd.DataFrame(placeholder_lst, columns = df.columns)

    return data_final





In [None]:
aggregate_ratio(df, 'VIN')

In [None]:
for item in df['VIN'].unique():
    print(item)

In [None]:
df.groupby(['VIN']).mean()

In [None]:
df.groupby(['VIN']).count()

In [None]:
df.groupby(['VIN']).sum()

In [None]:
aggregate_sum(df, 'VIN')

# Analysis of Events per Drive and Feature Engineering in PySpark

## Loading data

In [55]:
#importing necessary packages
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType

In [32]:
spark = SparkSession.builder.getOrCreate()

In [33]:
schema = StructType([
    StructField("VIN", StringType()),
    StructField("Navigation active", IntegerType()),
    StructField("Seat Heating on", IntegerType()),
    StructField("Audio muted", IntegerType())
])

df = spark.read.csv("Events_per_drive.csv", header = True, schema=schema)

In [34]:
df.show(5)

+-----------------+-----------------+---------------+-----------+
|              VIN|Navigation active|Seat Heating on|Audio muted|
+-----------------+-----------------+---------------+-----------+
|5J8TB4H38Fl002262|                1|              1|          0|
|5J8TB4H38Fl002262|                1|              0|          0|
|5J8TB4H38Fl002262|                1|              0|          1|
|5J8TB4H38Fl002262|                1|              0|          0|
|5J8TB4H38Fl002262|                0|              1|          1|
+-----------------+-----------------+---------------+-----------+
only showing top 5 rows



In [35]:
df.dtypes

[('VIN', 'string'),
 ('Navigation active', 'int'),
 ('Seat Heating on', 'int'),
 ('Audio muted', 'int')]

## User Defined functions for aggregation of Data

In [40]:
#function to sum the values of the rows per column and return a dataframe grouped by one column
def agg_sum(dataframe, column_name):
    
    df = dataframe.groupBy(column_name).sum()

    return df

#function to check if values in column are bigger than 0 and calculate ratio of those in respect to all entries of one column
def agg_ratio(dataframe, column_name):

    for column in dataframe.columns:

        if column == column_name:
            continue
        else:
            dataframe = dataframe.withColumn(column+'at all', (F.col(column) >= 1).cast("integer"))
            dataframe = dataframe.drop(column)

    
    df = dataframe.groupBy(column_name).mean()

    return df

    


In [37]:
agg_sum(df, 'VIN').show()

+-----------------+----------------------+--------------------+----------------+
|              VIN|sum(Navigation active)|sum(Seat Heating on)|sum(Audio muted)|
+-----------------+----------------------+--------------------+----------------+
|5J8TB4H38Fl002262|                    18|                  16|              43|
+-----------------+----------------------+--------------------+----------------+



In [38]:
#aggregate functions pyspark
df.groupBy('VIN').sum().show()



+-----------------+----------------------+--------------------+----------------+
|              VIN|sum(Navigation active)|sum(Seat Heating on)|sum(Audio muted)|
+-----------------+----------------------+--------------------+----------------+
|5J8TB4H38Fl002262|                    18|                  16|              43|
+-----------------+----------------------+--------------------+----------------+



In [39]:
df.groupBy('VIN').mean().show()

+-----------------+----------------------+--------------------+------------------+
|              VIN|avg(Navigation active)|avg(Seat Heating on)|  avg(Audio muted)|
+-----------------+----------------------+--------------------+------------------+
|5J8TB4H38Fl002262|                   0.6|  0.5333333333333333|1.4333333333333333|
+-----------------+----------------------+--------------------+------------------+



In [51]:
for item in df.select('VIN').distinct():
    print(item)

Column<'VIN'>


In [85]:
agg_ratio(df, 'VIN').show()

+-----------------+----------------------------+--------------------------+----------------------+
|              VIN|avg(Navigation activeat all)|avg(Seat Heating onat all)|avg(Audio mutedat all)|
+-----------------+----------------------------+--------------------------+----------------------+
|5J8TB4H38Fl002262|                         0.6|        0.5333333333333333|                   0.6|
+-----------------+----------------------------+--------------------------+----------------------+



In [60]:
df2.show()

+-----------------+-----------------+---------------+-----------+------------------+
|              VIN|Navigation active|Seat Heating on|Audio muted|Audio muted at all|
+-----------------+-----------------+---------------+-----------+------------------+
|5J8TB4H38Fl002262|                1|              1|          0|             false|
|5J8TB4H38Fl002262|                1|              0|          0|             false|
|5J8TB4H38Fl002262|                1|              0|          1|              true|
|5J8TB4H38Fl002262|                1|              0|          0|             false|
|5J8TB4H38Fl002262|                0|              1|          1|              true|
|5J8TB4H38Fl002262|                0|              0|          3|              true|
|5J8TB4H38Fl002262|                0|              0|          4|              true|
|5J8TB4H38Fl002262|                0|              1|          2|              true|
|5J8TB4H38Fl002262|                1|              1|          1|

In [62]:
for column in df.columns:
    print(column)

VIN
Navigation active
Seat Heating on
Audio muted
