In [1]:
# Import dependencies
import pandas as pd
import numpy as np # linear algebra

import matplotlib.pyplot as plt
from builtins import list
import matplotlib
matplotlib.style.use('ggplot')
import scipy.stats as st
import datetime
from sklearn import datasets
from scipy.stats import linregress

%matplotlib inline

In [2]:
# Location of the Uber rides CSV; the path is relative, so it resolves against
# the notebook's current working directory
uber_csv = "Resources/uber-rides-dataset.csv"

In [3]:
# Load the rides CSV into a DataFrame (raises FileNotFoundError when the
# relative path in uber_csv does not resolve from the working directory)
uber_df = pd.read_csv(uber_csv)

FileNotFoundError: [Errno 2] File b'Resources/uber-rides-dataset.csv' does not exist: b'Resources/uber-rides-dataset.csv'

In [None]:
# Preview the first rows of the loaded data
uber_df.head()

In [None]:
# List the column names available in the dataset
uber_df.columns

In [None]:
# Select the columns we need, as an independent copy: a new column is added to
# this frame afterwards, and the copy guarantees that assignment can neither
# raise SettingWithCopyWarning nor write through to uber_df
distance_type = uber_df.loc[:, ["distance_kms", "price_usd"]].copy()

In [None]:
# Distance bands in km as (label, upper edge) pairs; each band starts where the
# previous one ends, and the first band starts at 0
distance_bands = [
    ("Very Short Trip", 5),
    ("Short Trip", 10),
    ("Medium Trip", 15),
    ("Long trip", 20),
    ("Very Long Trip", 47),
]
# Bin edges for the distances (one more edge than there are bands)
bins = [0] + [upper_km for _, upper_km in distance_bands]
# Names of the 5 bins, in the same order as the edges
distance_names = [label for label, _ in distance_bands]

In [None]:
# Label every ride with the distance band that its distance_kms falls into
distance_type["Type of Distance"] = pd.cut(
    x=distance_type["distance_kms"],
    bins=bins,
    labels=distance_names,
)
distance_type.head()

In [None]:
# Mean price (USD) of the rides within each distance band
avg_price = (
    distance_type
    .groupby("Type of Distance")["price_usd"]
    .mean()
)
avg_price

In [None]:
# Number of non-null values in every column, per distance band
count = distance_type.groupby("Type of Distance").count()
count

In [None]:
# Pie chart: share of rides falling into each distance band
colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#8c564b"]
explode = (0, 0.1, 0, 0, 0)  # only the second wedge is pulled out
pie_options = dict(
    labels=distance_names,
    explode=explode,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=140,
)
plt.figure(figsize=(10, 6))
plt.pie(count["distance_kms"], **pie_options)
plt.title("Distribution of distances")
plt.savefig('counttypesoftrip.png')
plt.show()

The pie chart above shows that the majority of Uber trips taken by Russian customers are short trips (more than 5 km and up to 10 km).

In [None]:
# Plot number of trips for each weather description.
# The Axes returned by pandas is kept under a descriptive name so the chart can
# be given a title and axis labels and stand on its own in the saved image.
weather_ax = uber_df['weather_desc'].value_counts().plot(kind='bar', figsize=(10, 6))
weather_ax.set_title("Number of trips per weather description")
weather_ax.set_xlabel("Weather description")
weather_ax.set_ylabel("Number of trips")
plt.tight_layout()
plt.savefig('weathercount.png')

The bar chart above shows the trip count for each type of weather. Most trips were taken on mostly cloudy days.

In [None]:
# Let's see if there is a correlation between temperature value and distance in kms.
x_values = uber_df['temperature_value']
y_values = uber_df['distance_kms']

# Pearson correlation coefficient between the two columns
correlation = st.pearsonr(x_values, y_values)
print(f"The correlation between both factors is {round(correlation[0],2)}")

# Least-squares line y = slope * x + intercept
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

plt.scatter(x_values,y_values)
plt.plot(x_values,regress_values,"r-")
# Place the equation in axes-fraction coordinates so it stays inside the plot
# area whatever the data ranges are
plt.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", fontsize=15, color="black")
plt.xlabel('Temperature')
plt.ylabel('Distance(kms)')
# linregress returns the correlation coefficient r; r-squared is its square
print(f"The r-squared is: {rvalue**2}")
plt.savefig('tempdistancecorr.png')
plt.show()

Based on the above graph, there is essentially no linear correlation between the temperature value and the distance in km of Uber trips:
the correlation coefficient r is only 0.08, very close to 0. Note that 0.078 is r itself, not r-squared; the r-squared is r² ≈ 0.006, which means that
only about 0.6% of the variance of the distance (km) can be explained by the temperature.

In [None]:
# Order the rides by weather description (ascending); row labels are kept as-is
uber_df = uber_df.sort_values("weather_desc")

In [None]:
# Number of rides recorded under each weather description
weather_count = uber_df.groupby("weather_desc").size()
weather_count

In [None]:
# Mean temperature value of the rides under each weather description
avg_temp = (
    uber_df
    .groupby("weather_desc")["temperature_value"]
    .mean()
)
avg_temp

In [None]:
# let's then calculate the correlation between the trip frequency and the average temperature value
# (one point per weather description: x = mean temperature, y = number of trips)
x_values = avg_temp
y_values = weather_count

# Pearson correlation coefficient between the two series
correlation = st.pearsonr(x_values, y_values)
print(f"The correlation between both factors is {round(correlation[0],2)}")

# Least-squares line y = slope * x + intercept
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

plt.scatter(x_values,y_values)
plt.plot(x_values,regress_values,"r-")
# Place the equation in axes-fraction coordinates so it stays inside the plot
# area whatever the data ranges are (trip counts need not be anywhere near y=10)
plt.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", fontsize=15, color="black")
plt.xlabel('Average temperature/type of weather')
plt.ylabel('Number of trips/type of weather')
# linregress returns the correlation coefficient r; r-squared is its square
print(f"The r-squared is: {rvalue**2}")
plt.savefig('avgtempweathercorr.png')
plt.show()

Based on the above graph, there is essentially no linear correlation between the trip frequency and the average temperature value:
the correlation coefficient r is only -0.05, very close to 0. The r-squared is r² ≈ 0.0025, which means that only about 0.25% of the
variance of the trip frequency can be explained by the temperature (the 5% figure is |r|, not r²).

In [None]:
# Let's see if there is a correlation between distance and pricing.
x_values = uber_df['distance_kms']
y_values = uber_df['price_usd']

# Pearson correlation coefficient between the two columns
correlation = st.pearsonr(x_values, y_values)
print(f"The correlation between both factors is {round(correlation[0],2)}")

# Least-squares line y = slope * x + intercept
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

plt.scatter(x_values,y_values)
plt.plot(x_values,regress_values,"r-")
# Place the equation in axes-fraction coordinates so it stays inside the plot
# area whatever the data ranges are
plt.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", fontsize=15, color="black")
plt.xlabel('Distance (kms)')
plt.ylabel('Trip Price')
# linregress returns the correlation coefficient r; r-squared is its square
print(f"The r-squared is: {rvalue**2}")
plt.savefig('distanceprice.png')
plt.show()

Based on the above scatter plot, there is a fairly strong positive correlation between the distance in km and the trip fare: the correlation coefficient r is 0.73 (0.726). The r-squared is r² ≈ 0.53, which means that about 53% of the
variance of the trip fare can be explained by the distance in km (the 72.6% figure is r, not r²). The regression line has a positive slope, which means that an increase in the distance in km corresponds to an increase in the trip fare.

In [None]:
# Per-column counts of non-null values for every vehicle make/model
# (starting point for the vehicle pie chart)
vehicle_count = uber_df.groupby("vehicle_make_model").count().reset_index()
vehicle_count.head()

In [None]:
# Keep only the model name and one count column, and call that column "Count"
vehicle_count = (
    vehicle_count
    .loc[:, ["vehicle_make_model", "driver_uid"]]
    .rename(columns={"driver_uid": "Count"})
)
vehicle_count.head()

In [None]:
# Most frequent vehicle models first
vehicle_count = vehicle_count.sort_values(by='Count', ascending=False)


In [None]:
# First five rows of the sorted table, i.e. the five most frequent models
top_five = vehicle_count.iloc[:5].copy()
top_five

In [None]:
# Everything after the first five rows is lumped into a single "others" row
others_total = vehicle_count['Count'].iloc[5:].sum()
other_vehicles = pd.DataFrame({
    'vehicle_make_model': ['others'],
    'Count': [others_total],
})
other_vehicles

In [None]:
# Combining top 5 with others.
# ignore_index gives the combined table a clean 0..5 index: the "others" row
# is created with label 0, which could otherwise duplicate the label of one of
# the top-five rows.
top_five_others = pd.concat([top_five, other_vehicles], ignore_index=True)
top_five_others

In [None]:
# Pie chart of the five most frequent vehicle models plus the "others" slice
plt.figure(figsize=(14, 10))
ax1 = plt.subplot(121, aspect='equal')
top_five_others.plot.pie(
    y="Count",
    ax=ax1,
    autopct='%1.1f%%',
    startangle=200,
    shadow=False,
    labels=top_five_others['vehicle_make_model'],
    legend=False,
    fontsize=16,
)
plt.savefig('vehicletop5.png')

The pie chart above shows the top 5 vehicle make/models used by Uber drivers in Russia.
The 1st one is the Hyundai Solaris, a South Korean car, which is sold as the Hyundai Accent in the USA. For the Russian market it is assembled by the TagAZ plant in Taganrog.
The 2nd one is the Volkswagen Polo, a car produced by the German manufacturer Volkswagen since 1975.
The 3rd one is the Kia Rio, another South Korean car.

In [None]:
# Set Index to vehicle make model, so that a bar plot of top_five uses the
# model names as its x tick labels
top_five = top_five.set_index("vehicle_make_model")


In [None]:
# Bar chart of the ride counts for the five most frequent vehicle models
top_five.plot.bar(figsize=(10, 3))
plt.title("Count per Top 5 Vehicle Make Model")
plt.tight_layout()

# Save before showing the figure
plt.savefig('vehiclecount.png')
plt.show()

In [None]:
# Select the columns needed, as an independent copy so the new column below is
# not written into a slice of uber_df
prices_box = uber_df.loc[:, ["city", "price_usd", "distance_kms"]].copy()

# Price per kilometre. A zero distance produces +/-inf; those are replaced by
# NaN so that summary statistics and plots skip them instead of being
# distorted by infinite values.
prices_box["price/km"] = (
    prices_box["price_usd"] / prices_box["distance_kms"]
).replace([np.inf, -np.inf], np.nan)
prices_box.head()

In [None]:
# Sort the rides by city, then by price per km within each city.
# All rides are kept (no per-city head()), so anything computed from df
# reflects the full price/km distribution of each city rather than only its
# five cheapest rides.
df = prices_box.sort_values(['city', 'price/km'], ascending=True).reset_index()

In [None]:
# Summary statistics of the numeric columns (describe must be called; the bare
# attribute only displays the bound method)
df.describe()

In [None]:
# One box per city, showing the distribution of price per km in df
df.boxplot(column="price/km", by="city", figsize=(20, 10))
plt.savefig('boxplots.png')

The 3 boxplots above show 3 different distributions of price/km in the 3 Russian cities.
The boxplot representing the distribution of price/km in Saint Petersburg is very short compared to the other 2, which shows less variability in the data, while the boxplot representing price/km in Moscow shows the highest variability.
While the Moscow boxplot shows a roughly symmetric distribution, as the median falls in the middle of the interquartile range, the other 2 boxplots show very skewed distributions: the Ekaterinburg boxplot is skewed to the right and the Saint Petersburg boxplot is skewed to the left.
The ends of the whiskers indicate the minimum and maximum of the data, excluding outliers.
The Saint Petersburg boxplot shows a low outlier.

In [None]:
# Ride prices (USD) split by city, one Series per city
city_of_ride = prices_box["city"]
group1 = prices_box.loc[city_of_ride == "Saint Petersburg", "price_usd"]
group2 = prices_box.loc[city_of_ride == "Moscow", "price_usd"]
group3 = prices_box.loc[city_of_ride == "Ekaterinburg", "price_usd"]

In [None]:

# Perform the one-way ANOVA on the ride prices of the three cities.
# scipy.stats is already imported as `st` in the notebook's import cell, so no
# second import is needed here.
st.f_oneway(group1, group2, group3)

Since the p-value is less than 0.05, there is a significant difference between the mean prices of UBER rides in the 3 Russian cities.

In [None]:
# Calculate IQR of prices in Saint Petersburg
quartiles = group1.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

print(f"The lower quartile of prices is: {lowerq}")
print(f"The upper quartile of prices is: {upperq}")
print(f"The interquartile range of prices is: {iqr}")
print(f"The median of prices is: {quartiles[0.5]} ")

# Outlier fences: 1.5 * IQR below the lower quartile / above the upper quartile
lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

In [None]:
# Calculate IQR of prices in Moscow
quartiles = group2.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

print(f"The lower quartile of prices is: {lowerq}")
print(f"The upper quartile of prices is: {upperq}")
print(f"The interquartile range of prices is: {iqr}")
print(f"The median of prices is: {quartiles[0.5]} ")

# Outlier fences: 1.5 * IQR below the lower quartile / above the upper quartile
lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

In [None]:
# Calculate IQR of prices in Ekaterinburg
quartiles = group3.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

print(f"The lower quartile of prices is: {lowerq}")
print(f"The upper quartile of prices is: {upperq}")
print(f"The interquartile range of prices is: {iqr}")
print(f"The median of prices is: {quartiles[0.5]} ")

# Outlier fences: 1.5 * IQR below the lower quartile / above the upper quartile
lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

# DRIVER GENDER 'driver_gender'

In [None]:
# Number of rows for each driver gender
gender_count = uber_df["driver_gender"].value_counts()

# Share of each gender as a percentage of all rows, rounded to 2 decimals
total_rows = len(uber_df['driver_gender'])
percent_count = (gender_count / total_rows * 100).round(2)

# Summary table with both the counts and the percentages
gender_table = pd.DataFrame({
    "Gender Count": gender_count,
    "Gender Percentage": percent_count,
})
gender_table

In [None]:
# Create a pie chart for genders.
# The wedge labels are taken from percent_count's own index, so every wedge is
# labelled with the gender value it actually represents; value_counts orders
# by frequency, so a fixed ('Male', 'Female') tuple could end up swapped.
sizes = percent_count
labels = [str(gender) for gender in percent_count.index]
colors = ["lightblue", "red"]

plt.pie(sizes, labels=labels, colors=colors, shadow=True, autopct='%1.1f%%', startangle=140)
plt.title("Percentage of Male and Female drivers")

#plt.savefig("../Images/percent_gender_pie_chart.png")
plt.show()

# DRIVER FIRST NAME 'driver_name_en'

In [None]:
# Number of rows on which each driver first name appears
drivers_names = uber_df.groupby('driver_name_en').size()

# Turn the name index into a regular column; the unnamed counts become column 0
names_df = drivers_names.reset_index()

# Table with all the first names and how often each occurs
names_table = names_df.rename(
    columns={'driver_name_en': "First Name", 0: "Name Count"}
)
names_table.head()

In [None]:
# The 15 first names with the highest counts (only the first rows are displayed)
top_names = names_table.nlargest(n=15, columns='Name Count')
top_names.head()

In [None]:
# Bar chart of the 15 most frequent driver first names
first_names1 = top_names['First Name']
popularity1 = top_names['Name Count']
total_names1 = np.arange(len(first_names1))  # one bar position per name

plt.bar(total_names1, popularity1, align='center', alpha=0.5, color='green')
# Label each bar with its name, written vertically
plt.xticks(total_names1, first_names1, rotation='vertical')
plt.title("Most popular drivers' first name")
plt.xlabel('Drivers First Name')
plt.ylabel('Popularity')

plt.show()
#plt.savefig("../Images/popular_names_bar_chart.png")

# WAIT TIME 'wait_time'

In [None]:
# Parse the wait_time column as datetimes and preview the result
wait_df = pd.to_datetime(uber_df['wait_time'])
wait_df.to_frame().head()

In [None]:
# Parse the trip start timestamps; reset_index moves the original row label
# into an "index" column, which is used as the trip number
start_df = pd.to_datetime(uber_df['trip_start_time']).reset_index()

trip_number = start_df['index']
trip_date = start_df['trip_start_time']

# Table with all the trips and dates
trip_dates_df = start_df.rename(
    columns={'index': "Trip Number", 'trip_start_time': "Trip Date"}
)
trip_dates_df.head()

In [None]:
# Year and month of each trip, kept as integers so that grouping and sorting
# by them is chronological (string months sort as '1', '10', '11', '12', '2', ...)
trip_dates_df['Year'] = trip_dates_df['Trip Date'].dt.year
trip_dates_df['Month'] = trip_dates_df['Trip Date'].dt.month

# wait_df still carries uber_df's row labels, while the trip columns use the
# fresh 0..n-1 index produced by reset_index. Both were built from uber_df in
# the same row order, so the wait minutes are passed as a plain array and
# matched by position; aligning on labels would pair trips with the wrong
# wait times once uber_df has been re-ordered.
# NOTE(review): .dt.minute keeps only the minute component, so hours and
# seconds of the wait are dropped - confirm that is intended.
wait_minutes = wait_df.dt.minute.to_numpy()

new_trip_table = pd.DataFrame({"Trip Number": trip_number,
                               "Trip Date": trip_date,
                               "Month": trip_dates_df['Month'],
                               "Year": trip_dates_df['Year'],
                               "Wait Time by Minutes": wait_minutes})

new_trip_table.head()


In [None]:
# Mean wait time (minutes) for every (year, month) pair
wait_df_mean = (
    new_trip_table
    .groupby(['Year', 'Month'])["Wait Time by Minutes"]
    .mean()
)
wait_df_mean.to_frame().head()

In [None]:
# Longest wait time (minutes) for every (year, month) pair
wait_df_by_max = (
    new_trip_table
    .groupby(['Year', 'Month'])["Wait Time by Minutes"]
    .max()
)
wait_df_by_max.to_frame().head()

In [None]:
# Shortest wait time (minutes) for every (year, month) pair
wait_df_by_min = (
    new_trip_table
    .groupby(['Year', 'Month'])["Wait Time by Minutes"]
    .min()
)
wait_df_by_min.to_frame().head()

In [None]:
# Combine the three monthly statistics into one table (column order matters
# for plotting: maximum, average, minimum)
wait_columns = {
    "Maximum Wait": wait_df_by_max,
    "Average Wait": wait_df_mean,
    "Minimum Wait": wait_df_by_min,
}
all_wait_df = pd.DataFrame(wait_columns)
all_wait_df.head()

In [None]:
# Line chart of the maximum, average and minimum wait time for each month
wait_ax = all_wait_df.plot(style='-', color=('tab:red', 'orange', 'tab:blue'))

wait_ax.set_title("Evolution of Waiting Time over the Years")
wait_ax.set_xlabel("Years (by month)")
wait_ax.set_ylabel("Wait Time (in minutes)")
wait_ax.legend(loc="best")
wait_ax.grid(axis='y')  # horizontal grid lines only
plt.xticks(rotation=45)

plt.show()
#plt.savefig("../Images/wait_time_line_chart.png")