In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Show which files are present in the input directory.
print(os.listdir("../input"))
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

import plotly.offline as py
# Enable plotly's offline rendering inside the notebook
# (connected=True: plotly.js is fetched from the CDN rather than embedded).
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

### Read the train and test data. Since the training data is a very large file, only the first 100,000 rows are read.

In [None]:
# Training sample: only the first 100000 rows of train.csv are read.
# reset_index() moves the row labels into an 'index' column of the frame.
df_train = pd.read_csv('../input/train.csv', nrows=100000)
df_train = df_train.reset_index()

# The test file is read in full.
df_test = pd.read_csv('../input/test.csv')

In [None]:
# Preview the first 10 rows of the training sample.
df_train.head(10)

In [None]:
# Summary statistics of the training sample.
df_train.describe()

### Clean the data

In [None]:
# Count the missing values in each column.
df_train.isnull().sum()

### Since the latitude and longitude cannot be zero, let's remove all rows containing a zero latitude or longitude. Rows with a passenger count of zero are removed as well.

In [None]:
# List the column names of the training sample.
df_train.columns

In [None]:
# Remove, in place, every row whose pickup longitude is exactly 0.
zero_pickup_lon = df_train['pickup_longitude'] == 0
df_train.drop(df_train.loc[zero_pickup_lon].index, axis=0, inplace=True)

In [None]:
# Apply the same "exactly zero" filter, one column after another, to the
# remaining coordinate columns and to the passenger count.
for zero_col in ('pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count'):
    df_train.drop(df_train.loc[df_train[zero_col] == 0].index, axis=0, inplace=True)

In [None]:
# (rows, columns) of the training sample at this point in the notebook.
df_train.shape

### Let's find out the distance between pickup and dropoff in Kilometers and make one more column named distance (Km).

In [None]:
import math  # no longer used by haversine; kept because other cells may rely on this cell importing it
def haversine(lon1, lat1, lon2, lat2, radius_km=6367):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).
    Source: http://gis.stackexchange.com/a/56589/15183

    The computation uses NumPy ufuncs, so each coordinate may be a scalar
    or an array-like (NumPy array, pandas Series); array-likes are combined
    element-wise with the usual broadcasting rules.

    Parameters
    ----------
    lon1, lat1 : longitude and latitude of the first point(s), in degrees.
    lon2, lat2 : longitude and latitude of the second point(s), in degrees.
    radius_km : sphere radius in kilometres. Defaults to 6367, the constant
        this notebook has always used.

    Returns
    -------
    Distance in kilometres: a NumPy float for scalar inputs, otherwise an
    array-like shaped like the broadcast inputs.

    Notes
    -----
    Coordinates are not validated. The intermediate haversine term is
    clamped to [0, 1], so rounding error or out-of-range latitudes can no
    longer push sqrt/arcsin outside their domain (the previous math-based
    version raised ValueError in that case).
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(np.radians, (lon1, lat1, lon2, lat2))
    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # keep the term inside the domain of sqrt and arcsin
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c

In [None]:
# Great-circle distance of every trip, in km.
# The four coordinate columns are walked in lockstep instead of using
# DataFrame.apply(axis=1), which builds a Series object for each row before
# calling the function; the values passed to haversine are the same.
df_train['distance (Km)'] = [
    haversine(pickup_lon, pickup_lat, dropoff_lon, dropoff_lat)
    for pickup_lon, pickup_lat, dropoff_lon, dropoff_lat in zip(
        df_train['pickup_longitude'],
        df_train['pickup_latitude'],
        df_train['dropoff_longitude'],
        df_train['dropoff_latitude'],
    )
]

In [None]:
# Histogram of the trip distances.
x = df_train['distance (Km)']
data = [go.Histogram(x=x)]
layout = go.Layout(
    title='Distance (Km)',
    xaxis={'title': 'Distance'},
    yaxis={'title': 'Count'},
    bargap=0.2,
    bargroupgap=0.1,
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='basic histogram')

In [None]:
# Fare against distance, one marker per trip, coloured by passenger count.
df1 = df_train
trace1 = go.Scattergl(
    x=df1['distance (Km)'],
    y=df1['fare_amount'],
    mode='markers',
    marker={
        'size': 10,
        'color': df1['passenger_count'],  # colour follows this column's values
        'colorscale': 'Viridis',
        'showscale': True,
    },
)
layout = go.Layout(
    title='Fare vs Distance',
    hovermode='closest',
    xaxis={'title': 'Distance', 'ticklen': 5, 'zeroline': True, 'gridwidth': 1},
    yaxis={'title': 'Fare amount', 'ticklen': 5, 'gridwidth': 2},
    showlegend=False,
)
fig = go.Figure(data=[trace1], layout=layout)
py.iplot(fig)

### Let's remove the rows with distance > 1000 km, as the plot above shows they are outliers. Rows with a distance below 0.001 km are removed as well.

In [None]:
# Drop rows with a distance above 1000 km first ...
too_long = df_train['distance (Km)'] > 1000
df_train.drop(df_train.loc[too_long].index, axis=0, inplace=True)
# ... then rows with a distance below 0.001 km.
too_short = df_train['distance (Km)'] < 0.001
df_train.drop(df_train.loc[too_short].index, axis=0, inplace=True)

In [None]:
# Same fare-vs-distance scatter, drawn again from the current df_train.
df1 = df_train
trace1 = go.Scattergl(
    x=df1['distance (Km)'],
    y=df1['fare_amount'],
    mode='markers',
    marker={
        'size': 10,
        'color': df1['passenger_count'],  # colour follows this column's values
        'colorscale': 'Viridis',
        'showscale': True,
    },
)
layout = go.Layout(
    title='Fare vs Distance',
    hovermode='closest',
    xaxis={'title': 'Distance', 'ticklen': 5, 'zeroline': True, 'gridwidth': 1},
    yaxis={'title': 'Fare amount', 'ticklen': 5, 'gridwidth': 2},
    showlegend=False,
)
fig = go.Figure(data=[trace1], layout=layout)
py.iplot(fig)

### Since the fare depends on the distance as well as the number of passengers, let's make two more columns: fare per distance, and fare per distance per passenger.

In [None]:
# Fare divided by trip distance, and that ratio divided again by the
# passenger count.
fare_per_km = df_train['fare_amount'] / df_train['distance (Km)']
df_train['fare per distance'] = fare_per_km
df_train['fare per distance per passenger'] = fare_per_km / df_train['passenger_count']

### Since the fare may depend on peak times and festive months, let's make year, month, day and time columns.

In [None]:
# Parse the pickup timestamps once and derive every date/time column from
# that single DatetimeIndex, instead of re-parsing the column for each field.
pickup_times = pd.DatetimeIndex(df_train['pickup_datetime'])
df_train['year'] = pickup_times.year
df_train['month'] = pickup_times.month
df_train['day'] = pickup_times.day
df_train['time'] = pickup_times.time

### Let's see if the fare depends on the particular area of New York City by plotting latitude and longitude on the map.

In [None]:
# plotly (`py`, `go`) is imported and initialised in the notebook's first
# cell, so neither the imports nor init_notebook_mode are repeated here.

# Only the first 1000 rows of the current training frame are plotted.
df = df_train.iloc[:1000].reset_index()
df['text'] = df['fare per distance per passenger']  # used as the marker text

# Custom colour scale: [position between 0 and 1, colour] pairs.
scl = [
    [0, "rgb(5, 10, 172)"],
    [0.35, "rgb(40, 60, 190)"],
    [0.5, "rgb(70, 100, 245)"],
    [0.6, "rgb(90, 120, 245)"],
    [0.7, "rgb(106, 137, 247)"],
    [1, "rgb(220, 220, 220)"],
]

data = [dict(
    type='scattergeo',
    locationmode='USA-states',
    lon=df['pickup_longitude'],
    lat=df['pickup_latitude'],
    text=df['text'],
    mode='markers',
    marker=dict(
        size=8,
        opacity=0.8,
        reversescale=True,
        autocolorscale=False,
        symbol='square',
        line=dict(
            width=1,
            # Was 'rgba(102, 102, 102)': an rgba() colour needs four
            # components, so the three-component colour is written as rgb().
            color='rgb(102, 102, 102)'
        ),
        colorscale=scl,
        color=df['fare per distance per passenger'],
        # The colour range is fixed to 0..100 rather than taken from the data.
        cmin=0,
        cmax=100,
        colorbar=dict(
            # Names the quantity that is actually mapped to colour.
            title="Fare per km per passenger"
        )
    ))]

# The former `colorbar = True` entry was dropped: the colour bar is configured
# on the trace's marker above, not on the layout.
layout = dict(
    title='Fare by location',
    geo=dict(
        scope='usa',
        projection=dict(type='albers usa'),
        showland=True,
        landcolor="rgb(250, 250, 250)",
        subunitcolor="rgb(217, 217, 217)",
        countrycolor="rgb(217, 217, 217)",
        countrywidth=0.5,
        subunitwidth=0.5
    ),
)

fig = dict(data=data, layout=layout)
py.iplot(fig, validate=False, filename='d3-airports')

### As can be seen from the map above, there is no indication that the fare depends on a particular region, as the fares are mixed across all areas.

### Let's see if the fare depends on the number of passengers.

In [None]:
# Column names again, now including the derived columns added in earlier cells.
df_train.columns

In [None]:
import seaborn as sns
# Bar chart of fare_amount grouped by passenger_count.
sns.barplot(data=df_train, x='passenger_count', y='fare_amount')

### As can be seen from the graph above, the fare amount doesn't depend on the number of passengers.

In [None]:
# Bar chart of fare_amount grouped by pickup month.
sns.barplot(data=df_train, x='month', y='fare_amount')

### In some months, such as September, the fares are higher.