In [1]:
# Dependencies
from matplotlib.figure import Figure
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
import numpy as np
import datetime as dt
import seaborn as sns
np.random.seed(sum(map(ord, "aesthetics")))
# https://en.wikipedia.org/wiki/Bubble_chart
# https://plot.ly/matplotlib/bubble-charts/

In [2]:
# Store filepath in a variable
city_file = "../raw_data/city_data.csv"

In [3]:
# Read our Data file with the pandas library

# Note: city_data.csv had two records for Port James, both type 
# Suburban, and one driver_count = 3 and one = 15.  I deleted one 
# record, and made the driver count 18.  This fixes two issues, 1, 
# the record count for the combined data set is 2375 as should be, 
# and later dataframes created from city_df have the correct 
# record count of 125.
city_df = pd.read_csv(city_file)
city_df.sort_values("city",inplace=True)
city_df.head(5)

Unnamed: 0,city,driver_count,type
0,Alvarezhaven,21,Urban
1,Alyssaberg,67,Urban
2,Anitamouth,16,Suburban
3,Antoniomouth,21,Urban
4,Aprilchester,49,Urban


In [4]:
#len(city_df.index)

In [5]:
#city_df.info()
#city_df.describe
# For data analysis: Export file as an XLSX, w/o index, w/ header
#city_df.to_excel("../output/city_source.xlsx", index=False, header=True)

In [6]:
# Store filepath in a variable
ride_file = "../raw_data/ride_data.csv"

In [7]:
# Read our Data file with the pandas library
# Not every CSV requires an encoding, but be aware this can come up
ride_df = pd.read_csv(ride_file, parse_dates=["date"])
ride_df.sort_values("city",inplace=True)
ride_df.head(5)

Unnamed: 0,city,date,fare,ride_id
125,Alvarezhaven,2016-08-01 00:39:48,6.42,8394540350728
1032,Alvarezhaven,2016-01-21 07:25:48,22.83,3565582370530
702,Alvarezhaven,2016-06-21 09:57:52,20.88,9456788060940
1580,Alvarezhaven,2016-05-16 15:33:14,6.45,8939751998750
674,Alvarezhaven,2016-06-25 22:46:06,24.16,9047320468692


In [8]:
#len(ride_df.index)
#ride_df.dtypes
# For data analysis: Export file as an XLSX, w/o index, w/ header
#ride_df.to_excel("../output/ride_source.xlsx", index=False, header=True)

In [9]:
# Merge the two dataframes on the "city" field
# re-order columns: city, date, fare, ride_id, driver_count, type and sort by "city"
pd.options.display.max_rows = 100
pyber_df = pd.merge(city_df, ride_df, on=("city"), how="inner", suffixes=("_x", "_y"))
pyber_df = pyber_df[["city", "date", "fare", "ride_id", "driver_count", "type"]].sort_values("city")
pyber_df.head(5)

Unnamed: 0,city,date,fare,ride_id,driver_count,type
0,Alvarezhaven,2016-08-01 00:39:48,6.42,8394540350728,21,Urban
30,Alvarezhaven,2016-05-15 20:43:44,40.04,1806812593131,21,Urban
29,Alvarezhaven,2016-02-07 02:46:18,35.22,5405756761666,21,Urban
28,Alvarezhaven,2016-08-18 07:12:06,20.74,357421158941,21,Urban
27,Alvarezhaven,2016-09-23 21:51:59,17.67,3829336915201,21,Urban


In [10]:
# Analyze combined data set
#len(pyber_df.index)
#pyber_df.info()
#pyber_df.describe
#pyber_df.dtypes
# For data analysis: Export file as an XLSX, w/o index, w/ header
pyber_df.to_excel("../output/pyber_source.xlsx", index=False, header=True)

In [11]:
#
# BEGIN DATA ORGANIZATION AND CHARTING
#

In [12]:
# Source Data Analysis: 
# When loading the data, there are 2375 rows in ride_data.csv, why then did I have 2407 rows when 
# I merge the city and ride data?  There were two records for Port James in city_data.csv, one 
# with 3 drivers and the other with 15 drivers.  Since, I was going to use this data set in future 
# processes, I made the decision to manually remove one record and change the driver count to 18 
# for the remaining record.  This change resulted reasonable records counts when mergiing, summing,
# or manipulating the data sets.

# Data Trend Analysis:
# 1)It might seem obvious but there are more people using Pyber in an urban setting.  It makes 
# sense that people may either may not have a car or don't want to use it in the city if using a 
# ride service is convenient and cost effective.  On the other side of usage of the ride service, 
# people in rural and suburban environments live there to not be in an non-urban environment, and 
# since public transportation services are limited to non-existent, inconvenient, and expensive 
# outside of cities, these people would be mostly self-sufficient in the transportation environment.  
# Drawing on my experience and knowledge, people in rural and suburban areas would use ride 
# services in special situations like going to the airport where you might not want to leave your 
# car, and since services that older people use are in urban areas they would tend to use a ride 
# service to get to those services.  And the difference between suburban and rural usage could be 
# related to how close one group is to a city and usage of urban service and thus the Pyber ride 
# service.

# 2)Cost surprised me a little.  I would have thought that urban users would do more shorter trips 
# resulting in more rides and more at a lower cost.  Not sure what is going on with the data here, 
# but I would look further into why that is.  I don't use a Pyber type ride service, so I don't 
# know the costs, but the rural users have a large number of rides but not too high in cost.  Maybe 
# short trips to the doctor or store for an older person?  Don't know because the data set doesn't 
# support a conclusion.  One of the data points has the highest number of rides and is a very low 
# average fair.  I would expect to look at this and a few other outliers to understand what is going 
# on with the data, and to understand and gain trust in the data; however, for the most part the 
# data has few outliers and is scattered in a reasonable manner.

# 3)As for the number of drivers, I was surprised by the size of some the gold bubbles because 
# these are related rural cities, and I would expect fewer drivers but that wasn't the case.  It 
# seems from looking at the color and size of the data that there are an adequete number of drivers 
# in a lot of cases for rural drivers.  Certainly, the suburban users had enough drivers to choose 
# from, and usage correlates or drives(supply and demand) the need for drivers which is also shown 
# by the urban bubbles.  I would expect more bubbles and bigger bubbles in a city and the bubble 
# chart supports this expectation.  From television, not experience, I have an idea that it is hard 
# for people to get a taxi, so if you look at the bubbles there is certainly more demand and supply 
# as the environment moves from rural to urban.  Is the real or perceived lack of taxis driving the 
# demand for a Pyber like service?  It might be good to look further into why people choose a Pyber 
# type service over a taxi.  In any case, the number of drivers looks adequete for all environments.


In [13]:
# Bubble Plot of Pyber Data
# Collect data for the bubble plot

## Average Fare ($) Per City is the y-axis
## group by ["city"] and get the fare.mean()
average_fare = pyber_df.groupby(["city"], as_index=False)["fare"].mean().rename(columns={
    "fare": "avg_fare"})
average_fare.head(5)
#average_fare.info()
#average_fare.decribe
#average_fare.dtypes
#print (average_fare)
#print(len(average_fare))   

Unnamed: 0,city,avg_fare
0,Alvarezhaven,23.92871
1,Alyssaberg,20.609615
2,Anitamouth,37.315556
3,Antoniomouth,23.625
4,Aprilchester,21.981579


In [14]:
## Total Number of Rides Per City is the x-axis
## group by ["city"] and get the ride_id.count()
total_rides = pyber_df.groupby(["city"], as_index=False)["ride_id"].count().rename(columns={
    "ride_id": "ride_count"})
total_rides.head(5)
#total_rides.info()
#total_rides.describe
#total_rides.dtypes
#print (total_rides) 
#print(len(total_rides))   

Unnamed: 0,city,ride_count
0,Alvarezhaven,31
1,Alyssaberg,26
2,Anitamouth,9
3,Antoniomouth,22
4,Aprilchester,19


In [15]:
# Total Number of Drivers Per City is correlates to bubble size

# Uses city_df
dcounts_df = city_df[["city","driver_count"]]
dcounts_df.head(5)
#dcounts_df.info()
#dcounts_df.dtypes
#dcounts_df.describe
#len(dcounts_df)

Unnamed: 0,city,driver_count
0,Alvarezhaven,21
1,Alyssaberg,67
2,Anitamouth,16
3,Antoniomouth,21
4,Aprilchester,49


In [16]:
# City Type (Urban, Suburban, Rural) is the color legend upper right

# Uses city_df, has 125 entries
city_type_df = city_df[["city","type"]] #.reset_index(drop=True)
city_type_df.head(5)
#city_type_df.info()
#city_type_df.describe
#city_type_df.dtypes
#len(city_type_df)

Unnamed: 0,city,type
0,Alvarezhaven,Urban
1,Alyssaberg,Urban
2,Anitamouth,Suburban
3,Antoniomouth,Urban
4,Aprilchester,Urban


In [17]:
# City Type (Urban, Suburban, Rural) for color legend
citytype = pyber_df["type"].unique()
citytype.sort()
print(citytype)

['Rural' 'Suburban' 'Urban']


In [18]:
# # from url: https://plot.ly/matplotlib/bubble-charts/
# # http://matplotlib.org/api/markers_api.html?highlight=marker
# #
# # Bubble Chart Relationships:
# # x axis is total rides, y axis is average fares 
# # colors are related to type (i.e. "Rural" = "gold")
# # driver count is related to bubble size = scatter_kws={"s": markersize_df}
# # Seaborn.implot was used to model the bubble chart.
# # http://seaborn.pydata.org/generated/seaborn.lmplot.html
# #--------------------------------------------------------------------------
# #
# # https://plot.ly/python/getting-started/
# # import plotly.plotly as py  # not installed

import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import seaborn as sns
import pandas as pd
# #--------------------------------------------------------------------------
# all lists should have 125 elements
x_fare = average_fare["avg_fare"]
y_ride = total_rides["ride_count"]
#print("x_fare is ", x_fare)
#print("y_ride is ", y_ride)

# Set the default color palette
mycolors = ["skyblue", "coral", "gold"]
sns.set_palette(mycolors)
#sns.palplot(sns.color_palette())

# color is related to "city type" 
colors = {"Rural":"gold", "Suburban":"coral", "Urban":"skyblue"}
#print("dataframe colors is ", colors)
color_keys = colors.keys()
#print("dictionary color_keys)
ctype = city_type_df["type"].apply(lambda x: colors[x])
#ctype

# bubble size modification of dcount_df
markersize_df = 10000* dcounts_df["driver_count"] / sum(dcounts_df["driver_count"])
#markersize_df

# #--------------------------------------------------------------------------

b_df = pd.DataFrame(dict(x=x_fare, y=y_ride, color=ctype))
#b_df
sns.lmplot("x", "y", data=b_df, hue="color", fit_reg=False, legend_out=False,
           scatter_kws={"s": markersize_df})
sns.set_style("darkgrid")
sns.plt.title("Pyber Ride Sharing Data (2016)", weight='bold').set_fontsize('12')
sns.plt.xlabel("Total Number of Rides (Per City)", weight='bold').set_fontsize('12')
sns.plt.ylabel("Average Fare ($)", weight='bold').set_fontsize('12')

#sns.plt.legend(["City Type"])
plt.savefig("pyber_bubble_chart.png")
plt.show()

NameError: name 'dcnt' is not defined

In [None]:
#average_fare.shape, total_rides.shape

In [None]:
# example bubble chart for help
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
np.random.seed(sum(map(ord, "aesthetics")))
## https://plot.ly/python/getting-started/
#import plotly.plotly as py  # not installed

n = 50
x, y, z, s, ew = np.random.rand(5, n)

c, ec = np.random.rand(2, n, 4)

area_scale, width_scale = 500, 5

fig, ax = plt.subplots()

sc = ax.scatter(x, y, s=np.square(s)*area_scale, c=c, edgecolor=ec, 
                linewidth=ew*width_scale)
#ax.grid()
sns.set_style("darkgrid")

plt.savefig("bubble_scatter.png")
plt.show()


In [None]:
# City Type (Urban, Suburban, Rural) is the color legend upper right
# groupby or unique list of type called "city type"
citytype = pyber_df["type"].unique()
citytype.sort()
print(citytype)

In [None]:
# Create data for pie chart % of Total Fares by City-Type.

total_fares = pyber_df.groupby(["type"], as_index=False)["fare"].sum()
#print(total_fares)
#total_fares.info()
#total_fares.columns

# 2375 total rides
sum_fares = total_fares["fare"].sum()
#print(sum_fares)
#sum_fares.info()
#sum_fares.columns

# append percentages into a list
fares_pct = []
for i in range(0, len(total_fares)):
    #print( round(((total_fares.iloc[i].fare/sum_fares)* 100),2))
    fares_pct.append(round(((total_fares.iloc[i].fare/sum_fares)* 100),2))
#print(rides_pct)

In [None]:
# hw4_pie_chart1.py
#

# Dependencies
import matplotlib.pyplot as plt
import numpy as np
 
# https://matplotlib.org/2.0.0b4/examples/color/named_colors.html

fig = plt.figure()
ax = fig.add_subplot(111)

types = citytype
f_pct = fares_pct

colors = ["yellowgreen", "lightcoral", "lightskyblue"]
explode = (0, 0, 0.08)

x_axis = np.arange(0, len(types))

ax.set_title("Total Fares by City-Type", weight='bold')
ax.pie(f_pct, explode=explode, labels=types, colors=colors,
       autopct="%1.1f%%", textprops={'weight': 'bold'}, 
       shadow=True, startangle=120)
ax.axis("equal")

plt.tight_layout()
plt.savefig("fares_city-type%_pie_chart.png")
plt.show()

In [None]:
# Analysis of total Fares by City-Type

In [None]:
# Create data for pie chart % of Total Rides by City-Type.

total_rides = pyber_df.groupby(["type"], as_index=False)["ride_id"].count()
#print (total_rides)
#total_rides.info()
#total_rides.columns

# 2407 total rides
sum_rides = total_rides["ride_id"].sum()
#sum_rides
#sum_rides.info()
#sum_rides.columns

# append percentages into a list
rides_pct = []
for i in range(0, len(total_rides)):
    #print( round(((total_rides.iloc[i].ride_id/sum_rides)* 100),2))
    rides_pct.append(round(((total_rides.iloc[i].ride_id/sum_rides)* 100),2))
#print(rides_pct)

In [None]:
# hw4_pie_chart2.py
#

# Dependencies
import matplotlib.pyplot as plt
import numpy as np
 
# https://matplotlib.org/2.0.0b4/examples/color/named_colors.html

fig = plt.figure()
ax = fig.add_subplot(111)

types = citytype
r_pct = rides_pct

colors = ["yellowgreen", "lightcoral", "lightskyblue"]
explode = (0, 0, 0.08)

x_axis = np.arange(0, len(types))

ax.set_title("Total Rides by City-Type", weight='bold')
ax.pie(r_pct, explode=explode, labels=types, colors=colors,
       autopct="%1.1f%%", textprops={'weight': 'bold'}, 
       shadow=True, startangle=120)
ax.axis("equal")

plt.tight_layout()
plt.savefig("rides_city-type%_pie_chart.png")
plt.show()

In [None]:
# Analysis of total Rides by City-Type

In [None]:
# Create datat for pie chart % of Total Drivers by City, Type.

# Create new dataframe with only 3 columns--note the double brackets
summary_df = pyber_df[["city", "type", "driver_count"]]
#summary_df.head()

# Remove the duplicates from the dataframe to sum driver_count
unique_rows = summary_df.drop_duplicates()
#unique_rows

total_drivers = unique_rows.groupby(["type"], as_index=False)["driver_count"].sum()
#print(total_drivers)
#total_drivers.describe()
#total_drivers.columns

# Total of Drivers = 3349
sum_drivers = total_drivers["driver_count"].sum()
#print(sum_drivers)

# append percentages into a list
driver_pct = []
for i in range(0, len(total_drivers)):
    #print( round(((total_drivers.iloc[i].driver_count/sum_drivers)* 100),2))
    driver_pct.append(round(((total_drivers.iloc[i].driver_count/sum_drivers)* 100),2))
#print(driver_pct)

# Validation check "Rural" adds up to the sum above
#unique_rows.loc[unique_rows["type"]== "Rural"]

In [None]:
# hw4_pie_chart3.py
#

# Dependencies
import matplotlib.pyplot as plt
import numpy as np
 
# https://matplotlib.org/2.0.0b4/examples/color/named_colors.html

fig = plt.figure()
ax = fig.add_subplot(111)

types = citytype
d_pct = driver_pct

colors = ["yellowgreen", "lightcoral", "lightskyblue"]
explode = (0, 0, 0.08)

x_axis = np.arange(0, len(types))

ax.set_title("Total Drivers by City-Type", weight='bold')
ax.pie(d_pct, explode=explode, labels=types, colors=colors,
       autopct="%1.1f%%", textprops={'weight': 'bold'}, 
       shadow=True, startangle=120)
ax.axis("equal")

plt.tight_layout()
plt.savefig("drivers_city-type%_pie_chart.png")
plt.show()

In [None]:
# Analysis of total Drivers by City-Type

In [None]:
# Create a README.md export of my notebook.