## Datathon Grupo 24




In [1]:
# Load packages
import os
import pandas as pd
import numpy as np
# This line is needed to display plots inline in Jupyter Notebook
%matplotlib inline

# Required for basic python plotting functionality
import matplotlib.pyplot as plt

# Required for formatting dates later in the case
import datetime
import matplotlib.dates as mdates

# Required to display image inline
from IPython.display import Image

# Advanced plotting functionality with seaborn
import seaborn as sns
sns.set(style="whitegrid") # can set style depending on how you'd like it to look

## Yellow_trips 
Trip data (pickup/dropoff times, pickup/dropoff locations) from NYC yellow medallion taxis. Note: in order to keep the dataset size manageable, the provided data is a 5% unbiased sample of the raw data. If using trip count metrics, remember to multiply quantities by 20 to approximate the actual data. ~8 million rows & 9 columns. Size: ~260MB zipped, ~800MB unzipped. 


In [2]:
#GET DATA
YT = pd.read_csv('Dataset\\yellow_trips.csv.gz')
YT.head()

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,trip_distance,total_amount
0,2014-04-03 18:28:10,2014-04-03 18:54:32,-74.006033,40.706284,-73.918837,40.744946,1,8.7,34.8
1,2014-04-16 15:42:00,2014-04-16 16:07:00,-73.979558,40.749357,0.0,0.0,1,9.37,30.0
2,2014-04-13 18:04:00,2014-04-13 18:07:00,-73.956453,40.775307,-73.954792,40.784992,1,0.89,6.5
3,2014-05-21 19:33:00,2014-05-21 19:45:00,-73.987212,40.75785,-73.960198,40.775472,1,2.4,12.5
4,2014-05-30 16:28:00,2014-05-30 16:51:00,-73.974292,40.755397,-74.011867,40.704222,3,5.58,24.5


String to datetime **pickup_datetime** y **dropoff_datetime**

In [3]:
# string to datetime
YT['pickup_datetime'] = YT['pickup_datetime'].apply(lambda x: datetime.datetime.strptime(str(x), "%Y-%m-%d %H:%M:%S"))
YT['dropoff_datetime'] = YT['dropoff_datetime'].apply(lambda x: datetime.datetime.strptime(str(x), "%Y-%m-%d %H:%M:%S"))

### Data Cleaning

##### Null validation

In [4]:
YT.isnull().sum()

pickup_datetime      0
dropoff_datetime     0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
trip_distance        0
total_amount         0
dtype: int64

##### Coord. **pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude**

##### Coord. outside NYC

In [5]:
#Define coordinates with geographic data
geo = pd.read_csv('Dataset\\geographic.csv')
LatMax =geo[geo > 0].max().max()
LatMin =geo[geo > 0].min().min()
LonMax =geo[geo < 0].max().max()
LonMin =geo[geo < 0].min().min()

YT['pickup_longitude'] = YT['pickup_longitude'].apply(lambda x: np.nan if x >= LonMax or x <= LonMin  else x)
YT['dropoff_longitude'] = YT['dropoff_longitude'].apply(lambda x: np.nan if x >= LonMax or x <= LonMin  else x)
YT['pickup_latitude'] = YT['pickup_latitude'].apply(lambda x: np.nan if x >= LatMax or x <= LatMin else x)
YT['dropoff_latitude'] = YT['dropoff_latitude'].apply(lambda x: np.nan if x >= LatMax or x <= LatMin else x)

In [6]:
YTC= YT.dropna()

In [7]:
YTC.head()

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,trip_distance,total_amount
0,2014-04-03 18:28:10,2014-04-03 18:54:32,-74.006033,40.706284,-73.918837,40.744946,1,8.7,34.8
2,2014-04-13 18:04:00,2014-04-13 18:07:00,-73.956453,40.775307,-73.954792,40.784992,1,0.89,6.5
3,2014-05-21 19:33:00,2014-05-21 19:45:00,-73.987212,40.75785,-73.960198,40.775472,1,2.4,12.5
4,2014-05-30 16:28:00,2014-05-30 16:51:00,-73.974292,40.755397,-74.011867,40.704222,3,5.58,24.5
5,2014-05-11 13:09:24,2014-05-11 13:19:18,-73.978131,40.748238,-73.984475,40.749268,1,1.3,11.05


 
* **trip_distance**

   Distancias fuera de rango - null

In [8]:
YTC['total_amount'] = YTC.apply(lambda row: row['total_amount'] if row['total_amount'] > 0 else np.nan, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [9]:
YTC['trip_distance'] = YTC.apply(lambda row: row['trip_distance'] if (row['trip_distance'] > 0 and row['total_amount'] > 0) else np.nan, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [10]:
YTC['passenger_count'] = YTC.apply(lambda row: row['passenger_count'] if row['passenger_count'] > 0 else np.nan, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
