<center style="font-size:48px;">Exploratory Data Analysis</center>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import folium
from folium.plugins import HeatMap, HeatMapWithTime, MousePosition
import plotly.express as pex

# Load the four raw tables used throughout this notebook.
cars = pd.read_csv('../Data/car-assignments.csv')
# Read with cp1252 — presumably the file contains non-UTF-8 characters; verify against the data.
cc = pd.read_csv('../Data/cc_data.csv', encoding='cp1252')
# Parse Timestamp while reading so the column is datetime-typed for later cells.
gps = pd.read_csv('../Data/gps.csv', parse_dates=['Timestamp'])
# Same cp1252 encoding as the credit card file.
loyalty = pd.read_csv('../Data/loyalty_data.csv', encoding='cp1252')

# Apply one matplotlib style to every static figure below.
plt.style.use('fivethirtyeight')

## Cars

In [None]:
# Preview the first rows of the car-assignment table.
cars.head()

In [None]:
# Column dtypes and non-null counts for the car-assignment table.
cars.info()

In [None]:
# Count rows whose (FirstName, LastName) pair repeats an earlier row
nameColumns = ['FirstName', 'LastName']
duplicateCount = cars[nameColumns].duplicated().sum()
print("There are {} duplicate entries for a person".format(duplicateCount))

In [None]:
# Head-count per employment type: bar chart followed by the matching table
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(x='CurrentEmploymentType', data=cars, ax=ax)
ax.set_title('Number of Employees with Job Types')
plt.show()
typeCounts = cars.groupby('CurrentEmploymentType')['LastName'].agg(['count'])
typeCounts.sort_values('count', ascending=False)

The Engineering group has the most employees (13). The Executive and IT groups have the fewest (5 each).

In [None]:
# Head-count per job title: bar chart followed by the matching table
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(x='CurrentEmploymentTitle', data=cars, ax=ax)
ax.set_title('Number of Employees per Job')
# Titles are long, so turn the tick labels vertical
plt.xticks(rotation=90)
plt.show()
titleCounts = cars.groupby('CurrentEmploymentTitle')['LastName'].agg(['count'])
titleCounts.sort_values('count', ascending=False)

Truck Driver is the most common job title, with 9 employees. Most job titles have only one employee.

In [None]:
# Distinct job titles that belong to each employment type.
# De-duplicate the (title, type) pairs first so each title is counted once per
# type rather than once per employee.
grouped = cars[['CurrentEmploymentTitle', 'CurrentEmploymentType']].drop_duplicates()
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(x='CurrentEmploymentType', data=grouped, ax=ax)
ax.set_title('Number of Jobs per Type')
plt.show()
# Build the table from the same de-duplicated frame as the chart; grouping the
# raw `cars` frame here would report employees per type, not titles per type.
grouped.groupby('CurrentEmploymentType').agg(['count'])

The Engineering group has the most job titles (6). Facilities has the fewest (2).

In [None]:
# Let's see which job titles are grouped into which type
typeColumn = cars['CurrentEmploymentType']
for types in typeColumn.unique():
    # Distinct titles held by employees of this type
    job = cars.loc[typeColumn == types, 'CurrentEmploymentTitle'].unique()
    print("{} : {}".format(types, job))

In [None]:
# Are there any commonalities among the rows with a missing CarID?
missingCar = cars['CarID'].isna()
cars.loc[missingCar]

It looks like all the null entries are in the CarID column, and they belong to the nine truck drivers. This makes sense: the problem statement says that truck drivers are not assigned company cars that can be used for personal use.

## GPS

In [None]:
# Preview the first rows of the GPS table.
gps.head()

In [None]:
# Column dtypes and non-null counts for the GPS table.
gps.info()

In [None]:
# Separate the time units into their own columns.
# The vectorised .dt accessor replaces a per-row Python lambda and the
# six-branch if chain; each unit name is also the accessor attribute name.
# The values are the same as before; the integer width of the new columns may
# differ depending on the pandas version.
timeUnit = ['year', 'month', 'day', 'hour', 'minute', 'second']
timeParts = gps['Timestamp'].dt
for unit in timeUnit:
    gps[unit] = getattr(timeParts, unit)
gps.head()

In [None]:
# Summary statistics for the GPS table, including the time-unit columns added above.
gps.describe()

The year and month are constants! They only contain the values 2014 and 1, respectively.

In [None]:
# Deviation of each car's average latitude, longitude and hour from the
# overall mean and the overall median of the whole GPS table.
# medianLat / medianLong are also used further down to centre the folium maps.
meanLat = gps['lat'].mean()
medianLat = gps['lat'].median()
meanLong = gps['long'].mean()
medianLong = gps['long'].median()
meanHour = gps['hour'].mean()
medianHour = gps['hour'].median()

# One groupby pass yields every car's mean lat / long / hour, instead of
# repeating the same aggregation for each output column.
carMeans = gps.groupby('id')[['lat', 'long', 'hour']].mean()

# Mean_*   : |car mean - overall mean|
# Median_* : |car mean - overall median|  (the per-car value is still a mean)
st = pd.DataFrame({
    'Mean_Lat': (carMeans['lat'] - meanLat).abs(),
    'Median_Lat': (carMeans['lat'] - medianLat).abs(),
    'Mean_Long': (carMeans['long'] - meanLong).abs(),
    'Median_Long': (carMeans['long'] - medianLong).abs(),
    'Mean_Hour': (carMeans['hour'] - meanHour).abs(),
    'Median_Hour': (carMeans['hour'] - medianHour).abs(),
})
# Colour each column independently so large deviations stand out per metric
st.style.background_gradient(cmap="winter", axis=0)

Car 104 seems to deviate from the average a lot. Car 104 is a truck used by the truck drivers.

In [None]:
# Correlations between the numeric GPS columns
fig, ax = plt.subplots(1, figsize=(15, 10))
# Restrict to numeric columns explicitly: Timestamp is datetime-typed, and
# pandas >= 2.0 no longer drops non-numeric columns from corr() by default.
gpsCorr = gps.select_dtypes(include='number').corr()
# Hide the upper triangle and the diagonal, which only mirror the lower triangle
upperMask = np.triu(np.ones_like(gpsCorr, dtype=bool))
sns.heatmap(gpsCorr, vmin=-1, vmax=1, annot=True, mask=upperMask, ax=ax)


Nothing looks highly correlated. Year and Month are blank because they are constants: a column with zero variance has no defined correlation.

In [None]:
# Interactive violin plot (with an embedded box plot) of every GPS longitude reading.
fig = pex.violin(gps, x= 'long', box= True, title='Longitude Distribution')
fig.show()

In [None]:
# Interactive violin plot (with an embedded box plot) of every GPS latitude reading.
fig = pex.violin(gps, x= 'lat', box= True, title='Latitude Distribution')
fig.show()

In [None]:
# Where are cars located over the entire dataset?
# Centre the map on the median GPS position, offset by +0.01 degrees of latitude.
m = folium.Map(location=[medianLat+.01, medianLong], zoom_start=14)
# Overlay the Kronos Island shapefile with a black fill.
kronos = gpd.read_file('../Geospatial/Kronos_Island.shp')
folium.features.GeoJson(kronos,  style_function= lambda feature: {'fillColor' : 'black'}).add_to(m)
# Overlay the Abila shapefile, drawn in white.
abila = gpd.read_file('../Geospatial/Abila.shp')
folium.features.GeoJson(abila, style_function= lambda feature: {'color' : 'white'}).add_to(m)
# One heat layer built from every (lat, long) reading in the GPS table.
HeatMap(gps[['lat', 'long']], radius =20).add_to(m)
# JavaScript formatter that shows mouse coordinates to 3 decimals; the hourly map cell below reuses it.
fmtr = "function(num) {return L.Util.formatNum(num, 3) + ' º ';};"
MousePosition(position='topleft', separator=' | ', prefix="Mouse:",
              lat_formatter=fmtr, lng_formatter=fmtr).add_to(m)
# The last expression renders the map inline.
m

In [None]:
# Let's now look at the location heatmap segmented by hour.
# `weights` holds one list of [lat, long, weight] rows per hour and `index`
# holds the matching frame labels (e.g. '01 PM') for the time slider.
weights, index = [], []

for hour in sorted(gps.hour.unique()):
    # Distinct (lat, long) pairs recorded during this hour
    hourPoints = (
        gps[gps.hour == hour][['lat', 'long']]
        .groupby(['lat', 'long'])
        .count()
        .reset_index()
        .values
        .tolist()
    )
    # Attach the constant 0.1 weight while building the rows, rather than
    # mutating them afterwards through a comprehension run for side effects
    weights.append([[lat, lon, 0.1] for lat, lon in hourPoints])
    index.append(pd.to_datetime(hour, format='%H').strftime('%I %p'))

# Same base map and overlays as the overall heat map
m = folium.Map(location=[medianLat+.01, medianLong], zoom_start=14)
kronos = gpd.read_file('../Geospatial/Kronos_Island.shp')
folium.features.GeoJson(kronos, style_function=lambda feature: {'fillColor' : 'black'}).add_to(m)
abila = gpd.read_file('../Geospatial/Abila.shp')
folium.features.GeoJson(abila, style_function=lambda feature: {'color' : 'white'}).add_to(m)
HeatMapWithTime(weights, index=index).add_to(m)
# fmtr is the coordinate formatter defined in the overall heat map cell above
MousePosition(position='topleft', separator=' | ', prefix="Mouse:",
              lat_formatter=fmtr, lng_formatter=fmtr).add_to(m)
m

In [None]:
# NOTE(review): only a cell's last expression is displayed, so this head() call shows nothing here.
gps.head()
# NOTE(review): resample() is not followed by an aggregation (e.g. .mean()), so this
# evaluates to a resampler object rather than 15-minute values — confirm the intended aggregation.
gps[['lat', 'long', 'Timestamp']].resample(rule='15Min', on='Timestamp')

## Credit Cards

In [None]:
# Preview the first rows of the credit card table.
cc.head()

In [None]:
# Column dtypes and non-null counts for the credit card table.
cc.info()

In [None]:
# Summary statistics for the numeric credit card columns.
cc.describe()

In [None]:
# Let's look at how often each location appears in the credit card data
ccVisits = cc.groupby('location')[['timestamp']].count()
locs = ccVisits.sort_values('timestamp', ascending=False)
# The aggregated column is a transaction count, so label it accordingly
locs.columns = ['Count']
locs

Food places look popular. There are a few locations with few visits. 

In [None]:
# Distribution of individual credit card charges, with every row's fields on hover
fig = pex.violin(
    cc,
    x='price',
    box=True,
    points='all',
    hover_data=cc.columns,
    title='Credit Card Spending Distribution',
)
fig.show()
# Mean and median charge per location, most expensive on average first
ccPriceStats = cc.groupby('location')['price'].agg(['mean', 'median'])
ccPriceStats.sort_values('mean', ascending=False)

Most prices are under $200. Some places tend to charge a lot, such as the airport. From the plot, it looks like we have outliers.

## Loyalty Data

In [None]:
# Preview the first rows of the loyalty card table.
loyalty.head()

In [None]:
# Column dtypes and non-null counts for the loyalty card table.
loyalty.info()

In [None]:
# Summary statistics for the numeric loyalty card columns.
loyalty.describe()

In [None]:
# Let's look at how often each location appears in the loyalty card data
loyaltyVisits = loyalty.groupby('location')[['timestamp']].count()
locsLoyalty = loyaltyVisits.sort_values('timestamp', ascending=False)
# The aggregated column is a transaction count, so label it accordingly
locsLoyalty.columns = ['Count']
locsLoyalty

The ordering is similar to the credit card data, with slightly lower frequencies.

In [None]:
# Distribution of individual loyalty card charges.
# hover_data must list columns of the frame being plotted (loyalty); the
# original passed cc.columns, copied over from the credit card cell.
fig = pex.violin(loyalty, x='price', box=True, points='all', hover_data=loyalty.columns, title='Loyalty Card Spending Distribution')
fig.show()
# Mean and median charge per location, most expensive on average first
loyalty.groupby('location')['price'].agg(['mean', 'median']).sort_values('mean', ascending=False)

Same story as the credit card prices. Prices do seem much lower with loyalty cards, though.

In [None]:
# Let's see if any location appears in only one of the two dataframes
ccLocations = set(cc['location'])
loyaltyLocations = set(loyalty['location'])

# Locations seen on credit cards but never on loyalty cards
uniqueCC = ccLocations - loyaltyLocations
print('Unique locations in the credit card dataframe : {}'.format(uniqueCC))
# Locations seen on loyalty cards but never on credit cards
uniqueLoyalty = loyaltyLocations - ccLocations
print('Unique locations in the loyalty dataframe : {}'.format(uniqueLoyalty))

<div>
    <span  style="width:600px;display:inline-block;text-align:left">
        <a href="./ReadIn.ipynb">&#60;&#60;Reading the Data</a>
    </span>
    <span style="width:600px;display:inline-block;text-align:right">
        <a href="./FurtherEDA.ipynb">Further Exploratory Data Analysis&#62;&#62;</a>
    </span>
</div>
<div>
    <center>
        <span style="width:200px;display:inline-block;text-align:center">
            <a href="./Master.ipynb">Master Notebook</a>
        </span>
        <span style="width:200px;display:inline-block;text-align:center">
            <a href="../README.md">Table of Contents</a>
        </span>
    </center>
</div>