### Linear Regression with Python ###

1. Load the required libraries

In [5]:

import io
import os
import numpy as np
import pandas as pd

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns


2. Load the dataset

In [10]:
# Read the Chicago taxi training CSV (relative path) into a DataFrame and
# preview its first two rows as the cell output.
taxi_csv_path = "datasets/chicago_taxi_train.csv"
chicago_taxi_dataset = pd.read_csv(taxi_csv_path)
chicago_taxi_dataset.head(2)

Unnamed: 0,TRIP_START_TIMESTAMP,TRIP_END_TIMESTAMP,TRIP_START_HOUR,TRIP_SECONDS,TRIP_MILES,TRIP_SPEED,PICKUP_CENSUS_TRACT,DROPOFF_CENSUS_TRACT,PICKUP_COMMUNITY_AREA,DROPOFF_COMMUNITY_AREA,FARE,TIPS,TIP_RATE,TOLLS,EXTRAS,TRIP_TOTAL,PAYMENT_TYPE,COMPANY
0,05/17/2022 7:15:00 AM,05/17/2022 7:45:00 AM,7.25,2341,2.57,4.0,,,,17.0,31.99,2.0,6.3,0.0,0.0,33.99,Mobile,Flash Cab
1,05/17/2022 5:15:00 PM,05/17/2022 5:30:00 PM,17.25,1074,1.18,4.0,,17031080000.0,,8.0,9.75,3.0,27.9,0.0,1.0,14.25,Credit Card,Flash Cab


In [19]:
# Narrow the full dataset down to six columns of interest, report how many
# rows the resulting frame has, and preview its first two rows.
selected_columns = [
    'TRIP_MILES',
    'TRIP_SECONDS',
    'FARE',
    'COMPANY',
    'PAYMENT_TYPE',
    'TIP_RATE',
]
training_df = chicago_taxi_dataset[selected_columns]
row_count = training_df.shape[0]  # same value as len(training_df.index)
print('rows: ' + str(row_count))
training_df.head(2)


rows: 31694


Unnamed: 0,TRIP_MILES,TRIP_SECONDS,FARE,COMPANY,PAYMENT_TYPE,TIP_RATE
0,2.57,2341,31.99,Flash Cab,Mobile,6.3
1,1.18,1074,9.75,Flash Cab,Credit Card,27.9


3. Let's inspect the dataset by answering the following questions:
  * What is the maximum fare?
  * What is the mean distance across all trips?
  * How many cab companies are in the dataset?
  * What is the most frequent payment type?
  * Are any features missing data?

In [33]:
# --- Compute: one statistic per inspection question ---
maximum_fare = training_df['FARE'].max()
mean_distance = training_df['TRIP_MILES'].mean()
unique_cab_companies = training_df['COMPANY'].nunique()  # distinct values; NaN is not counted
payment_type_counts = training_df['PAYMENT_TYPE'].value_counts()
most_frequent_payment_type = payment_type_counts.idxmax()
missing_values = training_df.isna().sum().sum()  # total missing cells over all columns

# --- Report: print the answers in the same order as the questions ---
print("Maximum fare is ${fare:.2f}".format(fare=maximum_fare))
print("Mean distance across all trips is {distance:.2f}miles".format(distance=mean_distance))
print("The total cab companies in the dataset is {number_of_companies}".format(number_of_companies=unique_cab_companies))
print(f"Most frequent payment type is {most_frequent_payment_type}")

if missing_values == 0:
    any_missing_answer = "No"
else:
    any_missing_answer = "Yes"
print("Are there any features missing data? -> ", any_missing_answer)

# Summary table for every column (numeric and non-numeric) as the cell output.
training_df.describe(include='all')

Maximum fare is $159.25
Mean distance across all trips is 8.29miles
The total cab companies in the dataset is 31
Most frequent payment type is Credit Card
Are there any features missing data? ->  No


Unnamed: 0,TRIP_MILES,TRIP_SECONDS,FARE,COMPANY,PAYMENT_TYPE,TIP_RATE
count,31694.0,31694.0,31694.0,31694,31694,31694.0
unique,,,,31,7,
top,,,,Flash Cab,Credit Card,
freq,,,,7887,14142,
mean,8.289463,1319.796397,23.90521,,,12.965785
std,7.265672,928.932873,16.970022,,,15.517765
min,0.5,60.0,3.25,,,0.0
25%,1.72,548.0,9.0,,,0.0
50%,5.92,1081.0,18.75,,,12.2
75%,14.5,1888.0,38.75,,,20.8
