# Pyber Data Analysis

## This Jupyter Notebook will document the steps to create a bubble plot that showcases the relationship between four key variables:

* Average Fare ($) Per City
* Total Number of Rides Per City
* Total Number of Drivers Per City
* City Type (Urban, Suburban, Rural)

## Data Analysis
* TBD
* TBD
* TBD

In [1]:
# Select matplotlib's interactive "notebook" backend so figures render inside the notebook.
# NOTE(review): this backend targets the classic Notebook UI; on JupyterLab / Notebook 7
# `%matplotlib widget` (ipympl) is the usual replacement — confirm against the target environment.
%matplotlib notebook

In [2]:
# import dependencies
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import linregress

In [3]:
# locations of the two source CSV files (relative to the notebook)
city_data_csv = "Resources/city_data.csv"
ride_data_csv = "Resources/ride_data.csv"

# load each file into its own pandas dataframe
city_data_df, ride_data_df = map(pd.read_csv, (city_data_csv, ride_data_csv))

# show how many ride records were loaded
len(ride_data_df)

2375

In [4]:
# Remove cities whose driver_count is an outlier under the 1.5 * IQR rule:
# anything below Q1 - 1.5*IQR or above Q3 + 1.5*IQR is discarded.

# quartiles of the driver counts (25th, 50th and 75th percentiles)
city_lower_quartile, city_middle_quartile, city_upper_quartile = np.percentile(
    city_data_df["driver_count"], [25, 50, 75]
)
# interquartile range: the spread of the middle half of the data
city_interquartile_range = city_upper_quartile - city_lower_quartile
# The fences are measured from the outer quartiles, not from the median;
# anchoring them on the median makes the accepted band one full IQR narrower
# and throws away values that are not outliers.
city_upper_boundary = city_upper_quartile + (1.5 * city_interquartile_range)
city_lower_boundary = city_lower_quartile - (1.5 * city_interquartile_range)

# flag every row outside the fences and note its position in the dataframe
city_is_outlier = (city_data_df["driver_count"] < city_lower_boundary) | (
    city_data_df["driver_count"] > city_upper_boundary
)
city_outlier_arr = np.flatnonzero(city_is_outlier.to_numpy()).tolist()

# keep only the rows inside the fences (rebinding instead of dropping in place,
# so no other reference to the frame is mutated behind the reader's back)
city_data_df = city_data_df.loc[~city_is_outlier].copy()

In [7]:
# Remove rides whose fare is an outlier under the 1.5 * IQR rule:
# anything below Q1 - 1.5*IQR or above Q3 + 1.5*IQR is discarded.

# quartiles of the fares (25th, 50th and 75th percentiles)
ride_lower_quartile, ride_middle_quartile, ride_upper_quartile = np.percentile(
    ride_data_df["fare"], [25, 50, 75]
)
# interquartile range: the spread of the middle half of the data
ride_interquartile_range = ride_upper_quartile - ride_lower_quartile
# The fences are measured from the outer quartiles, not from the median;
# anchoring them on the median makes the accepted band one full IQR narrower
# and throws away fares that are not outliers.
ride_upper_boundary = ride_upper_quartile + (1.5 * ride_interquartile_range)
ride_lower_boundary = ride_lower_quartile - (1.5 * ride_interquartile_range)

# flag every row outside the fences and note its position in the dataframe
ride_is_outlier = (ride_data_df["fare"] < ride_lower_boundary) | (
    ride_data_df["fare"] > ride_upper_boundary
)
ride_outlier_arr = np.flatnonzero(ride_is_outlier.to_numpy()).tolist()

# keep only the rows inside the fences (rebinding instead of dropping in place,
# so no other reference to the frame is mutated behind the reader's back)
ride_data_df = ride_data_df.loc[~ride_is_outlier].copy()

In [34]:
# Build a per-city summary of the ride data (total number of rides and
# average fare per ride) and attach it to the city data.

# number of rides recorded for each city
count_rides = ride_data_df["city"].value_counts()
count_rides_df = count_rides.to_frame(name="Count of Rides")

# average fare per city; only the fare column is aggregated, because summing
# the other columns (the date strings, the ride ids) is meaningless and the
# string column makes a whole-frame sum / count fail on recent pandas
average_fare = (
    ride_data_df.groupby("city")["fare"]
    .mean()
    .to_frame(name="Average Ride Fare")
)

# combine both figures into one summary frame with "city" as a regular column;
# rename_axis names the index explicitly so the result does not depend on
# whether the joined index happened to carry a name
ride_summary_df = (
    count_rides_df.join(average_fare, how="outer")
    .rename_axis("city")
    .reset_index()
)

# merge the ride summary into the city data; how="outer" keeps cities that
# appear on only one side, leaving NaN in the columns they are missing
# (which also upcasts driver_count to float)
city_merge_df = pd.merge(city_data_df, ride_summary_df, on="city", how="outer")
city_merge_df.head()

Unnamed: 0,city,driver_count,type,Count of Rides,Average Ride Fare
0,Richardfort,38.0,Urban,28,22.373214
1,Williamsstad,59.0,Urban,23,24.362174
2,Port Angela,67.0,Urban,19,23.836842
3,Rodneyfort,34.0,Urban,23,28.616957
4,West Robert,39.0,Urban,31,25.123871


In [9]:
# preview the first five rows of the city data
city_data_df.head(n=5)

Unnamed: 0,city,driver_count,type
0,Richardfort,38,Urban
1,Williamsstad,59,Urban
2,Port Angela,67,Urban
3,Rodneyfort,34,Urban
4,West Robert,39,Urban


In [22]:
# preview the first five rows of the ride data
ride_data_df.head(n=5)

Unnamed: 0,city,date,fare,ride_id
0,Lake Jonathanshire,2018-01-14 10:14:22,13.83,5739410935873
1,South Michelleport,2018-03-04 18:24:09,30.24,2343912425577
2,Port Samanthamouth,2018-02-24 04:29:00,33.44,2005065760003
3,Rodneyfort,2018-02-10 23:22:03,23.44,5149245426178
4,South Jack,2018-03-06 04:28:35,34.58,3908451377344
